In [1]:
import pandas as pd

# Path of the reviews CSV, resolved relative to the notebook's working directory
file_path = 'Reviews.csv'

# Read the whole file into a DataFrame
reviews_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows as the cell output
reviews_df.head(n=5)


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [2]:
import re
import numpy as np

# Discard every row that has at least one missing value; the frame is modified
# in place, so `reviews_df` keeps referring to the same object afterwards
reviews_df.dropna(axis=0, how='any', inplace=True)

# Function to clean text
def clean_text(text):
    """Normalise a review string for vectorisation.

    Removes every character that is not an ASCII letter or ASCII whitespace,
    lowercases the result and trims leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lowercased, stripped text.
    """
    # The flags MUST be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing `re.I | re.A` positionally was read as
    # count=258 — no flags were applied and only the first 258 unwanted
    # characters were removed. re.A keeps `\s` and the case-insensitive
    # matching restricted to ASCII.
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = text.strip()  # Remove leading/trailing whitespace
    return text

# Overwrite the 'Review' column with its cleaned form, one review at a time
reviews_df['Review'] = reviews_df['Review'].apply(func=clean_text)

# Preview the top five rows of the cleaned frame as the cell output
reviews_df.head(n=5)


Unnamed: 0,Review,Liked
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1


In [4]:
from sklearn.model_selection import train_test_split

# Features are the cleaned review texts; the target is the 0/1 'Liked' label
X, y = reviews_df['Review'], reviews_df['Liked']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser with the vocabulary capped at 5000 terms
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training reviews, then encode them
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)

# Encode the test reviews with the vocabulary fitted above (no refitting on test data)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)


In [6]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with scikit-learn's default settings
lr_model = LogisticRegression()

# Fit on the TF-IDF training matrix; left as the last expression so the
# fitted estimator is shown as the cell output
lr_model.fit(X=X_train_tfidf, y=y_train)


In [7]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Label every held-out review with the fitted model
y_pred = lr_model.predict(X_test_tfidf)

# Fraction of test reviews whose predicted label matches the true one
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

# Per-class precision / recall / F1 summary
print(classification_report(y_true=y_test, y_pred=y_pred))

# Confusion matrix: rows are true labels, columns are predicted labels
print(confusion_matrix(y_true=y_test, y_pred=y_pred))


Accuracy: 0.78
              precision    recall  f1-score   support

           0       0.73      0.85      0.79        96
           1       0.84      0.71      0.77       104

    accuracy                           0.78       200
   macro avg       0.79      0.78      0.78       200
weighted avg       0.79      0.78      0.78       200

[[82 14]
 [30 74]]


In [10]:
# Function to predict sentiment of a new review
def predict_sentiment(review):
    """Predict the label of a single raw review string.

    The review goes through the same steps as the training data: it is
    cleaned with `clean_text`, encoded with the already-fitted
    `tfidf_vectorizer`, and classified by `lr_model`.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    The first (and only) element of the model's prediction array for this
    review.
    """
    cleaned = clean_text(review)
    # The vectoriser expects an iterable of documents, hence the one-item list
    features = tfidf_vectorizer.transform([cleaned])
    return lr_model.predict(features)[0]

# Try the prediction helper on a hand-written review
new_review = "The food was absolutely worst, from preparation to presentation, very bad impression pleasing."
predicted_sentiment = predict_sentiment(new_review)
print(f'Predicted Sentiment: {predicted_sentiment}')

# Translate the numeric label into a readable one: 0 is reported as negative,
# any other label as positive (the message previously misspelled "Positive")
if predicted_sentiment==0:
    print('Predicted Sentiment:Negative')
else:
    print('Predicted Sentiment:Positive')


Predicted Sentiment: 0
Predicted Sentiment:Negative
