## Imports

In [27]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, classification_report

## Loading the Dataset

In [13]:
# Read the TripAdvisor hotel-review CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Tripadvisor_Hotel_Reviews.csv")
# Echo the (rows, columns) shape as a quick sanity check on the load.
print("Dataset loaded successfully. Shape:", df.shape)

Dataset loaded successfully. Shape: (20491, 2)


## Preprocessing the Data

In [14]:
# Build a balanced two-class dataset from the clearly polarised reviews only:
# ratings below 3 become the negative class, 5-star ratings the positive class,
# and every other rating is left out.
df_neg = df.loc[df["Rating"].lt(3)].reset_index(drop=True)
df_five = df.loc[df["Rating"].eq(5)].reset_index(drop=True)

# Keep only the first len(df_neg) five-star rows so both classes have the same size.
df_pos = df_five.head(len(df_neg))

# Stack both classes, attach the string label derived from the rating, then
# shuffle with a fixed seed so the row order is reproducible.
df_all = (
    pd.concat([df_neg, df_pos], ignore_index=True)
    .assign(Sentiment=lambda frame: np.where(frame["Rating"].eq(5), "Positive", "Negative"))
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Data Preprocessing completed. Balanced dataset shape:", df_all.shape)
print("Sentiment distribution:\n", df_all["Sentiment"].value_counts())

Data Preprocessing completed. Balanced dataset shape: (6428, 3)
Sentiment distribution:
 Sentiment
Negative    3214
Positive    3214
Name: count, dtype: int64


## Shuffling and Splitting Data

In [15]:
# Hold out 20% of the balanced reviews for testing; the fixed seed keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_all["Review"],
    df_all["Sentiment"],
    test_size=0.2,
    random_state=42,
)
print("Training set size:", len(x_train))
print("Test set size:", len(x_test))

Training set size: 5142
Test set size: 1286


## Feature Extraction

In [16]:
# TF-IDF features over unigrams and bigrams; min_df=5 drops terms that occur in
# fewer than 5 training documents.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
)

# Learn the vocabulary and IDF weights from the training text only, then reuse
# the fitted vectorizer to encode the test text.
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

print("TF-IDF Vectorization completed. Number of features:", x_train_vec.shape[1])

TF-IDF Vectorization completed. Number of features: 19533


## Hyperparameter tuning for SVM Model

In [19]:
# Candidate SVC settings: 3 x 2 x 2 = 12 combinations in total.
# NOTE(review): per the scikit-learn SVC docs, `gamma` is not used by the linear
# kernel, so several sampled candidates are effectively duplicates.
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
}

# Randomized search: evaluate 10 sampled combinations with 3-fold cross-validation,
# ranking them by macro-averaged F1. The seed fixes which combinations are drawn.
clf_svm = RandomizedSearchCV(
    estimator=svm.SVC(),
    param_distributions=param_grid,
    n_iter=10,
    scoring='f1_macro',
    cv=3,
    random_state=42,
)
clf_svm.fit(x_train_vec, y_train)

print("Hyperparameter Tuning completed. Best parameters:", clf_svm.best_params_)

Hyperparameter Tuning completed. Best parameters: {'kernel': 'linear', 'gamma': 'scale', 'C': 1}


## Evaluating the Best Model on the Test Set

In [20]:
# Evaluate the tuned SVM on the held-out test split.
# The model refit with the best parameters is exposed on the fitted search object
# as `best_estimator_` (trailing underscore); the name without the underscore is
# not an attribute and raises AttributeError.
best_svm = clf_svm.best_estimator_
y_pred = best_svm.predict(x_test_vec)

# Overall accuracy plus macro-averaged F1 (the same metric used to rank candidates
# during the search).
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print("Model Evaluation:")
print("Accuracy:", accuracy * 100, "%")
print("F1 Score:", f1)
# Per-class precision / recall / F1 breakdown.
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Model Evaluation:
Accuracy: 96.42301710730948 %
F1 Score: 0.9641989116120536

Classification Report:
               precision    recall  f1-score   support

    Negative       0.97      0.96      0.96       629
    Positive       0.96      0.97      0.97       657

    accuracy                           0.96      1286
   macro avg       0.96      0.96      0.96      1286
weighted avg       0.96      0.96      0.96      1286



## Testing the Model on Sample Reviews

In [26]:
# A handful of short, hand-written reviews used as a smoke test of the tuned model.
sample_reviews = [
    "amazing food",
    "good",
    "horrible experience",
    "great location but service was poor",
    "the food is so ass",
    "food is bad man",
]

# Encode the samples with the already-fitted TF-IDF vectorizer (transform only,
# no refitting), then classify them with the tuned SVM.
sample_reviews_vec = vectorizer.transform(sample_reviews)
sample_predictions = best_svm.predict(sample_reviews_vec)

print("Sample Review Predictions:")
# Pair each review with its predicted label, one line per review.
for review, sentiment in zip(sample_reviews, sample_predictions):
    print(f"Review: '{review}' -> Predicted Sentiment: {sentiment}")

Sample Review Predictions:
Review: 'amazing food' -> Predicted Sentiment: Positive
Review: 'good' -> Predicted Sentiment: Positive
Review: 'horrible experience' -> Predicted Sentiment: Negative
Review: 'great location but service was poor' -> Predicted Sentiment: Negative
Review: 'the food is so ass' -> Predicted Sentiment: Negative
Review: 'food is bad man' -> Predicted Sentiment: Negative
