## Imports

In [11]:
import numpy as np
import pandas as pd

In [12]:
# Read the raw review dataset from the CSV file (path is relative to the
# notebook's working directory).
reviews_path = "Tripadvisor_Hotel_Reviews.csv"
df = pd.read_csv(reviews_path)

## Cleaning the Data

In [13]:
# Class-balance check: number of reviews per star rating.
rating_counts = df["Rating"].value_counts()
rating_counts

Rating
5    9054
4    6039
3    2184
2    1793
1    1421
Name: count, dtype: int64

In [14]:
# Negative class: every review rated below 3, re-indexed from 0.
negative_mask = df["Rating"] < 3
df_neg = df[negative_mask].reset_index(drop=True)

In [15]:
# Pool of candidate positives: only the reviews rated exactly 5, re-indexed
# from 0.
five_star_mask = df["Rating"] == 5
df_five = df[five_star_mask].reset_index(drop=True)

In [16]:
# Keep as many 5-star reviews as there are negative reviews so the two
# classes are balanced. Slice by position: `.loc[:n]` slices by label and
# includes the end label, which on a 0..N index selects n + 1 rows and leaves
# the classes off by one.
df_pos = df_five.iloc[:len(df_neg)]

In [17]:
# Stack the negative and positive subsets row-wise; ignore_index gives the
# combined frame a fresh 0..N-1 index instead of the duplicated per-subset
# labels.
df_all = pd.concat([df_neg, df_pos], ignore_index=True)

## Creating the Sentiment Column

In [18]:
# Derive the target label: rows rated 5 become "Positive", all remaining rows
# become "Negative".
positive_mask = df_all["Rating"] == 5
df_all["Sentiment"] = np.where(positive_mask, "Positive", "Negative")

In [19]:
# Shuffle the rows so the classes are interleaved instead of stacked in
# concatenation order (negatives first, then positives). The fixed seed keeps
# the row order -- and therefore every downstream split and score -- the same
# after a kernel restart; without it the seeded split below would still pick
# different reviews on each run.
df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)

## Splitting into test and training sets

In [20]:
from sklearn.model_selection import train_test_split

# Hold out part of the reviews for testing, using train_test_split's default
# hold-out fraction. A fixed random_state makes the partition repeatable
# across runs instead of changing every time the cell executes.
x_train, x_test, y_train, y_test = train_test_split(
    df_all.Review, df_all.Sentiment, random_state=42
)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features. The vocabulary is learned from the training reviews
# only; the test reviews are then encoded with that same fitted vocabulary.
v = CountVectorizer()
x_train_vec = v.fit_transform(raw_documents=x_train)
x_test_vec = v.transform(raw_documents=x_test)

## Training a classification model

In [22]:
from sklearn import svm

# Fit a linear-kernel support vector classifier on the vectorized training
# reviews. The fit call is left as the last expression so the cell still
# displays the fitted estimator.
clf_svm = svm.SVC(kernel="linear")
clf_svm.fit(X=x_train_vec, y=y_train)

## Testing accuracy and results

In [23]:
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Re-split the labelled reviews with an explicit 20% test share and a fixed
# seed. This overwrites the split variables produced earlier in the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    df_all["Review"],
    df_all["Sentiment"],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: fit on the training reviews only, then reuse the fitted
# vectorizer unchanged to encode the test reviews.
vectorizer = TfidfVectorizer()
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

# Linear-kernel SVM trained on the TF-IDF features (fit returns the estimator
# itself, so it can be bound in one statement).
clf_svm = svm.SVC(kernel="linear").fit(x_train_vec, y_train)

# Score on the held-out reviews and report it as a percentage.
accuracy = clf_svm.score(x_test_vec, y_test)
print("Accuracy:", accuracy * 100, "%")

Accuracy: 96.3452566096423 %


In [24]:
from sklearn.metrics import f1_score

# Per-class F1 on the held-out reviews: average=None yields one score per
# label rather than a single aggregated value.
y_pred = clf_svm.predict(x_test_vec)
f1_score(y_test, y_pred, average=None)

array([0.96500372, 0.96175753])

In [29]:
# Smoke-test the trained classifier on one hand-written review.
rev = ["amazing food"]

# Encode with the already-fitted vectorizer (transform only, no re-fit) so the
# feature columns match the ones the classifier was trained on.
rev_vec = vectorizer.transform(rev)

# predict returns an array with one label per input review.
prediction = clf_svm.predict(rev_vec)

print("Predicted Sentiment:", prediction)

Predicted Sentiment: ['Positive']
