importing libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from collections import Counter

# NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Scikit-learn Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# Imbalanced-learn
from imblearn.over_sampling import SMOTE

# XGBoost
import xgboost as xgb

# Show full cell contents (no column-width truncation) so long reviews stay readable
pd.options.display.max_colwidth = None



: 

## data loading and cleaning

In [None]:
# pandas is already imported as `pd` in the imports cell at the top of the notebook.

# Load the dataset using the correct relative path
df = pd.read_csv('../data/raw/restaurant_reviews.csv')

# Keep only the columns used downstream and drop rows missing either of them
df = df[['Review', 'Rating']].dropna(subset=['Rating', 'Review'])

# Parse ratings numerically instead of testing `str(x).isdigit()`: the string test
# rejects values such as 5.0 (str -> "5.0"), which would silently discard every
# row whenever pandas reads the column as float. Non-numeric entries become NaN.
rating_numeric = pd.to_numeric(df['Rating'], errors='coerce')

# Keep whole-number, non-negative ratings only (same intent as the digit check);
# NaN fails both comparisons and is therefore excluded as well.
is_whole_rating = rating_numeric.notna() & (rating_numeric % 1 == 0) & (rating_numeric >= 0)

# .copy() makes the filtered frame independent, so the column assignments below
# do not raise SettingWithCopyWarning.
df = df.loc[is_whole_rating].copy()
df['Rating'] = rating_numeric.loc[is_whole_rating].astype(int)

# Ensure the 'Review' column is always a string
df['Review'] = df['Review'].astype(str)

print("Data loaded and cleaned successfully.")
print(f"Shape of the dataframe: {df.shape}")
df.head()



Map rating to sentiment

In [None]:
def map_sentiment(rating):
    """Translate a numeric rating into a sentiment label.

    Ratings above 3 are "positive", ratings below 3 are "negative", and
    anything else (i.e. a rating that is neither above nor below 3) is "neutral".
    """
    if rating > 3:
        return "positive"
    if rating < 3:
        return "negative"
    return "neutral"

# Label every review by passing its rating through map_sentiment
sentiment_labels = df["Rating"].map(map_sentiment)
df["Sentiment"] = sentiment_labels

print("Sentiment column created.")
df.head()


check class balances


In [None]:
# Bar chart of the number of reviews in each sentiment class
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    data=df,
    x="Sentiment",
    order=['positive', 'negative', 'neutral'],
    palette="coolwarm",
    ax=ax,
)
ax.set_title("Sentiment Distribution", fontsize=16)
ax.set_xlabel("Sentiment", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
plt.show()

# Share of each class as a fraction of all reviews
print("Sentiment Value Counts (Normalized):")
sentiment_share = df["Sentiment"].value_counts(normalize=True)
print(sentiment_share)


text cleaning function

In [None]:
# Fetch the NLTK corpora used below (only needs to be done once per environment)
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared helpers used by preprocess_text: the English stopword set and the lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmas.

    Every character outside a-z/A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, tokens found in the module-level
    `stop_words` set are discarded, and the remaining tokens are lemmatized
    with the module-level `lemmatizer`.

    NOTE(review): NLTK's English stopword list appears to include negations
    such as "not" and "no"; removing them may blur sentiment - confirm this
    is intended.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()

    kept_lemmas = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(kept_lemmas)

print("Preprocessing function is ready.")


preprocessing

In [None]:
# Clean every review with preprocess_text and keep the result in a new column
# (this may take a moment to run)
df['Processed_Review'] = df['Review'].map(preprocess_text)

# Show the first review before and after cleaning as a sanity check
print("Text preprocessing complete. Here's a before-and-after example:")
print("\n--- ORIGINAL REVIEW ---")
first_original = df['Review'].iloc[0]
print(first_original)
print("\n--- PROCESSED REVIEW ---")
first_processed = df['Processed_Review'].iloc[0]
print(first_processed)


TF-IDF vectorization

In [None]:
# TF-IDF vectorizer over unigrams and bigrams, with the vocabulary capped at 5000 features
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# Target vector (y) and feature matrix (X)
# NOTE(review): the vectorizer is fitted here on every review, i.e. before any
# train/test split, so held-out reviews contribute to the vocabulary and IDF weights.
y = df['Sentiment']
X = tfidf.fit_transform(df['Processed_Review'])

print("TF-IDF vectorization complete.")
print(f"Shape of the feature matrix (X): {X.shape}")


split data 

In [None]:
# Split the data into training (80%) and testing (20%) sets.
# The processed text is split (rather than the pre-computed matrix X) so that the
# TF-IDF vocabulary and IDF weights can be learned from the training portion only;
# fitting them on the full corpus leaks information about the test reviews into
# the training features.
text_train, text_test, y_train, y_test = train_test_split(
    df['Processed_Review'], y,
    test_size=0.2,
    random_state=42,  # for reproducibility
    stratify=y,       # IMPORTANT for imbalanced data
)

# Re-fit the vectorizer on the training text, then apply it unchanged to the test text
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

print("Data splitting complete.")
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Apply SMOTE for Resampling

In [None]:
print(f"Original training set distribution: {Counter(y_train)}")

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Oversample the training data only; the test set is left untouched
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"Resampled training set distribution: {Counter(y_train_resampled)}")

# Integer-encode the string labels. The Random Forest, XGBoost and evaluation
# cells read `le`, `y_train_resampled_encoded` and `y_test_encoded`, so they are
# created here, right after resampling. The encoder is fitted on the training
# labels and then applied to the test labels.
le = LabelEncoder()
y_train_resampled_encoded = le.fit_transform(y_train_resampled)
y_test_encoded = le.transform(y_test)

print(f"Label encoding: {dict(zip(le.classes_, le.transform(le.classes_)))}")


Logistic regression 

In [None]:
# Initialize the Logistic Regression model
# We increase max_iter to ensure convergence
model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model on the resampled training data (string sentiment labels)
model.fit(X_train_resampled, y_train_resampled)

print("Model training complete.")

# Evaluate on the untouched test set, mirroring the Random Forest and XGBoost cells.
# The model was fitted on string labels, so it is compared against y_test directly.
y_pred_log_reg_labels = model.predict(X_test)
print("\n--- Logistic Regression Evaluation ---")
print(classification_report(y_test, y_pred_log_reg_labels))


Random forest 

In [None]:
print("--- Training and Evaluating Random Forest ---")

# Build the forest, then fit it on the resampled features and integer-encoded labels
rf_model = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)
rf_model.fit(X_train_resampled, y_train_resampled_encoded)

# Predict on the test features and report per-class metrics using the encoder's class names
y_pred_rf = rf_model.predict(X_test)
print("\n--- Random Forest Evaluation ---")
rf_report = classification_report(y_test_encoded, y_pred_rf, target_names=le.classes_)
print(rf_report)




XGBoost

In [None]:
print("--- Training and Evaluating XGBoost ---")

# Initialize and train the model.
# - `use_label_encoder` is deprecated in XGBoost and is no longer passed; the
#   labels are already integer-encoded by `le`.
# - `num_class` is taken from the fitted encoder rather than hard-coded, so it
#   stays correct if the set of sentiment classes changes.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    random_state=42,
    eval_metric='mlogloss',
)
xgb_model.fit(X_train_resampled, y_train_resampled_encoded)

# Make predictions and evaluate
y_pred_xgb = xgb_model.predict(X_test)
print("\n--- XGBoost Evaluation ---")
print(classification_report(y_test_encoded, y_pred_xgb, target_names=le.classes_))



model eval 

In [None]:
# Integer code that the fitted LabelEncoder assigns to the 'neutral' class
neutral_label = le.transform(['neutral'])[0]

# The Logistic Regression model was trained on string labels, so its predictions
# are encoded with the same LabelEncoder before being compared with y_test_encoded.
y_pred_log_reg = le.transform(model.predict(X_test))

# Calculate F1 scores for the neutral class for each model
# (labels=[neutral_label] restricts the score to that single class)
f1_neutral_log_reg = f1_score(y_test_encoded, y_pred_log_reg, labels=[neutral_label], average='macro')
f1_neutral_rf = f1_score(y_test_encoded, y_pred_rf, labels=[neutral_label], average='macro')
f1_neutral_xgb = f1_score(y_test_encoded, y_pred_xgb, labels=[neutral_label], average='macro')

print(f"Logistic Regression Neutral F1-Score: {f1_neutral_log_reg:.4f}")
print(f"Random Forest Neutral F1-Score: {f1_neutral_rf:.4f}")
print(f"XGBoost Neutral F1-Score: {f1_neutral_xgb:.4f}")

# For this example, let's assume XGBoost performed best and analyze its errors
print("\n--- Analyzing Misclassified Neutral Reviews (from XGBoost model) ---")

# Decode predictions back to string labels for analysis
y_pred_xgb_labels = le.inverse_transform(y_pred_xgb)

# Create a results DataFrame; it takes its index from y_test, so the original
# review text can be looked up in df by that same index.
results_df = pd.DataFrame({'True_Sentiment': y_test, 'Predicted_Sentiment': y_pred_xgb_labels})
results_df['Review'] = df.loc[y_test.index, 'Review']

# Filter for neutral reviews that the model labelled as something else
misclassified_neutral = results_df[
    (results_df['True_Sentiment'] == 'neutral') &
    (results_df['Predicted_Sentiment'] != 'neutral')
]

print(f"\nNumber of misclassified neutral reviews: {len(misclassified_neutral)}")
print("Displaying 10 examples:")
print(misclassified_neutral.head(10))
