### Goal: preprocess text data, convert it into a bag-of-words representation, and then apply a Naive Bayes classifier to predict sentiment.

In [None]:
import pandas as pd

# Load dataset
df = pd.read_csv('../datasets/restaurant-reviews.tsv', sep='\t', quoting=3)

### Step 1: Basic Cleaning
* convert to lowercase
* remove punctuation
* remove numbers

In [16]:
# Basic Cleaning

import re

# Function to clean a single review
def clean_review(review):
    text = review.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text.strip()  # Remove leading/trailing whitespace

df['cleaned_review'] = df['Review'].apply(clean_review)

print("Sample cleaned reviews:")
print(df['cleaned_review'].head())

Sample cleaned reviews:
0                                 wow loved this place
1                                    crust is not good
2             not tasty and the texture was just nasty
3    stopped by during the late may bank holiday of...
4    the selection on the menu was great and so wer...
Name: cleaned_review, dtype: object


### Remove Stopwords and Apply Stemming
* Stopwords are words that do not contribute to the meaning of a sentence, such as "the", "is", "in", etc.
* Stemming is the process of reducing words to their root form, e.g., "running" to "run".

In [None]:
# Remove stop words and apply stemming
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


# Download NLTK resources
nltk.download('stopwords')
# Initialize stop words and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Function to remove stop words and stem words in a review
def preprocess_review(review):
    words = review.split()
    # Remove stop words and stem remaining words
    filtered_words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(filtered_words)

df['processed_review'] = df['cleaned_review'].apply(preprocess_review)

print("\nSample processed reviews:")
print(df['processed_review'].head())


Sample processed reviews:
0                                       wow love place
1                                           crust good
2                                   tasti textur nasti
3    stop late may bank holiday rick steve recommen...
4                              select menu great price
Name: processed_review, dtype: object


[nltk_data] Downloading package stopwords to /Users/taha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Convert Text To Bag-of-Words Representation
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer(max_features=1500)  # Limit to 1500 features for simplicity

# Fit and transform the processed reviews
X = vectorizer.fit_transform(df['processed_review'])
y = df['Liked'].values

In [19]:
# Train/Test Split
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

# Initialize the classifier
classifier = MultinomialNB()

# Train the classifier
classifier.fit(X_train, y_train)


In [None]:
# Evaluate the Classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Calculate and print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.745

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.75      0.74        96
           1       0.76      0.74      0.75       104

    accuracy                           0.74       200
   macro avg       0.74      0.75      0.74       200
weighted avg       0.75      0.74      0.75       200


Confusion Matrix:
[[72 24]
 [27 77]]


In [None]:
# Try using TF-IDF to see if it improves performance
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1500)

# Fit and transform the processed reviews
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_review'])
y = df['Liked'].values

# Train/Test Split for TF-IDF
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train a Naive Bayes Classifier
classifier_tfidf = MultinomialNB()
classifier_tfidf.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred_tfidf = classifier_tfidf.predict(X_test_tfidf)

# Calculate and print evaluation metrics for TF-IDF
print("\nTF-IDF Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("\nTF-IDF Classification Report:")
print(classification_report(y_test, y_pred_tfidf))


TF-IDF Accuracy: 0.76

TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.78      0.76        96
           1       0.79      0.74      0.76       104

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.76      0.76      0.76       200


TF-IDF Confusion Matrix:
[[75 21]
 [27 77]]
