## Imports

In [9]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, classification_report

## Loading the Dataset

In [10]:
# Read the TripAdvisor hotel-review CSV into a DataFrame.
# Point the path below at your own copy of the dataset if it lives elsewhere.
df = pd.read_csv(filepath_or_buffer="Tripadvisor_Hotel_Reviews.csv")
print("Dataset loaded. Shape:", df.shape)

Dataset loaded. Shape: (20491, 2)


## Preprocessing the Data

In [11]:
# Build a balanced two-class dataset: ratings below 3 are "Negative", ratings of 5 are "Positive".
# Ratings 3 and 4 are left out entirely.
df_neg = df.loc[df["Rating"] < 3]
# Down-sample the 5-star reviews to exactly as many rows as there are negative reviews.
df_pos = df.loc[df["Rating"] == 5].sample(n=len(df_neg), random_state=42)
df_all = pd.concat([df_neg, df_pos], axis=0, ignore_index=True)

# Derive the text label from the numeric rating.
df_all["Sentiment"] = np.where(df_all["Rating"].eq(5), "Positive", "Negative")
# Shuffle all rows (frac=1) with a fixed seed so negatives and positives are interleaved.
df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)

## Shuffling and Splitting Data

In [4]:
# Hold out 20% of the reviews for the final evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_all["Review"], df_all["Sentiment"], test_size=0.2, random_state=42
)

# Print the split sizes so that re-running the cell regenerates the size lines shown in its output.
print("Training set size:", len(x_train))
print("Test set size:", len(x_test))

Training set size: 5142
Test set size: 1286


## Feature Extraction

In [14]:
# Convert review text to TF-IDF features over unigrams and bigrams,
# dropping terms that appear in fewer than 5 training documents.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5)

# Fit the vocabulary and IDF weights on the training split only,
# then apply that same fitted transform to the test split.
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

## Hyperparameter Tuning for the SVM Model

In [13]:
# Randomized hyperparameter search for an SVM classifier.
# The grid holds 3 x 2 x 2 = 12 combinations, of which n_iter=10 are tried.
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
}
clf_svm = RandomizedSearchCV(
    estimator=svm.SVC(class_weight='balanced'),  # weight classes inversely to their frequency
    param_distributions=param_grid,
    n_iter=10,
    scoring='f1_macro',  # candidates are ranked by macro-averaged F1
    cv=3,
    random_state=42,
)
clf_svm.fit(x_train_vec, y_train)

## Evaluating the Best Model

In [15]:
# Take the best estimator found by the search and score it on the held-out test split.
best_svm = clf_svm.best_estimator_
y_pred = best_svm.predict(x_test_vec)

# Overall accuracy plus macro-averaged F1 (each class contributes equally to the average).
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')

print("Accuracy:", accuracy * 100, "%")
print("F1 Score:", f1)
# Per-class precision / recall / F1 breakdown.
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 96.3452566096423 %
F1 Score: 0.9634189223948798

Classification Report:
               precision    recall  f1-score   support

    Negative       0.97      0.95      0.96       629
    Positive       0.96      0.97      0.96       657

    accuracy                           0.96      1286
   macro avg       0.96      0.96      0.96      1286
weighted avg       0.96      0.96      0.96      1286



## Predicting Sentiment for New Reviews

In [16]:
# Function to Predict Sentiment for Multiple Reviews
def predict_review_sentiment(reviews):
    """
    Predicts sentiment for one or more hotel reviews.

    Uses the module-level `vectorizer` (fitted TF-IDF transformer) and
    `best_svm` (tuned classifier), so both must already be defined.

    Parameters:
    reviews (str or iterable of str): Review text(s) to classify. A single
        string is treated as one review.

    Returns:
    array-like of str: One predicted label ('Positive' or 'Negative') per
        review, in input order, exactly as returned by `best_svm.predict`
        (a NumPy array for scikit-learn estimators). An empty NumPy array is
        returned when no reviews are given.
    """
    # Accept a bare string as a single review; materialise any other iterable
    # so it can be checked for emptiness and consumed exactly once.
    if isinstance(reviews, str):
        reviews = [reviews]
    else:
        reviews = list(reviews)

    # scikit-learn estimators reject zero-sample input, so short-circuit here.
    if not reviews:
        return np.array([], dtype=object)

    # Transform input reviews using the trained TF-IDF vectorizer
    reviews_vec = vectorizer.transform(reviews)

    # Predict sentiment
    predictions = best_svm.predict(reviews_vec)

    return predictions

In [17]:
# Demo: classify a handful of hand-written reviews (two favourable, two unfavourable).
review_inputs = [
    "The room was clean and the location was perfect!",
    "The service was terrible and the room was dirty.",
    "Had a wonderful time, the staff was very friendly.",
    "Not worth the money, very disappointed with the amenities.",
]
predictions = predict_review_sentiment(review_inputs)

# Show each review next to the label the model assigned to it.
print("Review Inputs and Predictions:")
for review, prediction in zip(review_inputs, predictions):
    print(f"Review: '{review}' -> Predicted Sentiment: {prediction}")

Review Inputs and Predictions:
Review: 'The room was clean and the location was perfect!' -> Predicted Sentiment: Positive
Review: 'The service was terrible and the room was dirty.' -> Predicted Sentiment: Negative
Review: 'Had a wonderful time, the staff was very friendly.' -> Predicted Sentiment: Positive
Review: 'Not worth the money, very disappointed with the amenities.' -> Predicted Sentiment: Negative
