## Read the file

In [None]:
import pandas as pd

In [None]:
# Load the tab-separated review file and display the resulting frame
data = pd.read_csv("reviews.csv", sep="\t")
data

Unnamed: 0,Name,RatingValue,DatePublished,Review
0,Playa Cabana,4,2020-02-26,I was tasked with finding a spot for a group d...
1,Playa Cabana,3,2019-08-04,Went here with my friends and family. I liked ...
2,Playa Cabana,3,2019-08-24,Surprisingly good Flautas! They came as 3 roll...
3,Playa Cabana,4,2019-06-06,As a Mexican I always crave authentic Mexican ...
4,Playa Cabana,5,2020-05-25,Best tacos I've ever had. Both locations are g...
...,...,...,...,...
1915,Greedy Duck & Noodles,2,2019-04-29,Maybe I didn't know what to expect but the win...
1916,Greedy Duck & Noodles,5,2019-08-15,This is a casual family run restaurant. It's h...
1917,Greedy Duck & Noodles,5,2019-03-16,Very tasty and unique flavours. This has becom...
1918,Greedy Duck & Noodles,4,2019-10-12,"The dry noodles are pretty good, super strong ..."


## Define Sentiment Groups

Use `RatingValue` to derive the sentiment labels:
- Negative (1&2) - Sentiment: 0
- Neutral (3) - Sentiment: 1
- Positive (4&5) - Sentiment: 2

Drop some of the positive ratings to balance the data, so that there are approximately equal numbers of negative, neutral, and positive ratings.

In [None]:
# Define Sentiment groups
def sentiment(rating):
  """Map a rating value to its sentiment label.

  Returns 0 (negative) for ratings of 2 or less, 1 (neutral) for a rating
  equal to 3, and 2 (positive) for every other value.
  """
  if rating <= 2:
    return 0
  if rating == 3:
    return 1
  return 2

# Label every review by passing its "RatingValue" through sentiment()
# and store the result in a new "Sentiment" column
data['Sentiment'] = data['RatingValue'].map(sentiment)

In [None]:
# Preview the first five rows, now including the "Sentiment" column
data.head(n=5)

Unnamed: 0,Name,RatingValue,DatePublished,Review,Sentiment
0,Playa Cabana,4,2020-02-26,I was tasked with finding a spot for a group d...,2
1,Playa Cabana,3,2019-08-04,Went here with my friends and family. I liked ...,1
2,Playa Cabana,3,2019-08-24,Surprisingly good Flautas! They came as 3 roll...,1
3,Playa Cabana,4,2019-06-06,As a Mexican I always crave authentic Mexican ...,2
4,Playa Cabana,5,2020-05-25,Best tacos I've ever had. Both locations are g...,2


In [None]:
# Number of reviews per sentiment class, largest class first
data['Sentiment'].value_counts(sort=True, ascending=False)

2    1465
1     297
0     158
Name: Sentiment, dtype: int64

Since the sentiment groups are imbalanced, use the size of the smallest group (158) as the target and downsample the "Positive" and "Neutral" groups to it. The 158 observations kept in each of those groups are randomly selected from the original group.

In [None]:
# Number the reviews 1..N in their current row order
data['Number'] = range(1, data.shape[0] + 1)

# Keep just the running number, the label and the review text
data = data.loc[:, ['Number', 'Sentiment', 'Review']]


In [None]:
# Preview the first five rows of the reduced three-column frame
data.head(n=5)

Unnamed: 0,Number,Sentiment,Review
0,1,2,I was tasked with finding a spot for a group d...
1,2,1,Went here with my friends and family. I liked ...
2,3,1,Surprisingly good Flautas! They came as 3 roll...
3,4,2,As a Mexican I always crave authentic Mexican ...
4,5,2,Best tacos I've ever had. Both locations are g...


### Downsample Positive and Neutral groups

In [None]:
# Separate the dataset into the different sentiment categories
negative_reviews = data[data['Sentiment'] == 0]
neutral_reviews = data[data['Sentiment'] == 1]
positive_reviews = data[data['Sentiment'] == 2]

# Balance on the size of the smallest class instead of a hard-coded 158,
# so the cell keeps working (and stays balanced) if reviews.csv changes.
target_size = min(len(negative_reviews), len(neutral_reviews), len(positive_reviews))


def downsample(reviews, n, seed=1):
  """Return at most `n` rows of `reviews`.

  A frame that already has `n` rows or fewer is returned untouched, keeping
  its row order; a larger frame is reduced to a random sample of `n` rows
  drawn with `random_state=seed` so the selection is reproducible.
  """
  if len(reviews) <= n:
    return reviews
  return reviews.sample(n=n, random_state=seed)


# Downsample every category that is larger than the smallest one
negative_reviews = downsample(negative_reviews, target_size)
neutral_reviews = downsample(neutral_reviews, target_size)
positive_reviews = downsample(positive_reviews, target_size)

# Combine the downsampled datasets
data1 = pd.concat([negative_reviews, neutral_reviews, positive_reviews])

# Check the new distribution of the balanced dataset
data1['Sentiment'].value_counts()

0    158
1    158
2    158
Name: Sentiment, dtype: int64

## Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Destination files for the two splits
# NOTE(review): absolute '/content' paths, presumably the Colab working
# directory; confirm before running elsewhere.
train_csv_path = '/content/train.csv'
valid_csv_path = '/content/valid.csv'

# Hold out 20% of the balanced data for validation; random_state fixes the split
train_df, valid_df = train_test_split(data1, random_state=56, test_size=0.2)

# Write both splits to CSV without the index column
train_df.to_csv(train_csv_path, index=False)
valid_df.to_csv(valid_csv_path, index=False)

train_csv_path, valid_csv_path


('/content/train.csv', '/content/valid.csv')

In [None]:
# Inputs (review text) and targets (sentiment label) taken from the training split only
X_train, y_train = train_df['Review'], train_df['Sentiment']

## Tokenizing texts

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training reviews and convert them to token counts
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=X_train)

# Dimensions of the resulting count matrix
X_train_counts.shape

(379, 4607)

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the TF-IDF weighting on the training counts, then re-weight those counts
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

# Same dimensions as the count matrix; only the values are re-weighted
X_train_tfidf.shape

(379, 4607)

## Train a classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model on the TF-IDF features of the training split
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
# Inputs and targets of the validation split
X_valid, y_valid = valid_df['Review'], valid_df['Sentiment']

# Re-use the vectorizer and TF-IDF transformer fitted on the training data:
# only transform() is called here, nothing is re-fitted on validation data
X_valid_vectorized = count_vect.transform(X_valid)
X_valid_tfidf = tfidf_transformer.transform(X_valid_vectorized)

# Predict the sentiment labels for the validation set
y_valid_pred = clf.predict(X_valid_tfidf)

## Building Pipeline

In [None]:
from sklearn.pipeline import Pipeline
# Chain vectorizer -> TF-IDF -> Naive Bayes so that raw text can be fed in directly
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [None]:
# Fit every stage of the pipeline on the raw training reviews
text_clf.fit(X=X_train, y=y_train)

## Evaluate on Validation set

### SVM

In [None]:
from sklearn.linear_model import SGDClassifier
import numpy as np

# Same feature extraction as the previous pipeline, but the final step is an
# SGD-trained linear classifier with hinge loss and an L2 penalty
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge', penalty='l2', alpha=1e-3,
        max_iter=5, tol=None, random_state=42,
    )),
])
text_clf.fit(X_train, y_train)

# Predict on the validation reviews and measure the share of correct labels
predicted = text_clf.predict(X_valid)
accuracy = np.mean(predicted == y_valid)
accuracy

0.6

In [None]:
from sklearn import metrics
# Raw confusion counts: true labels against the pipeline's predictions
metrics.confusion_matrix(y_true=y_valid, y_pred=predicted)

array([[22,  9,  4],
       [ 4, 18, 10],
       [ 3,  8, 17]])

In [None]:
# Plain-text report of per-class precision / recall / F1.
# Despite the variable name this is a string, since output_dict is not requested.
report_dict = metrics.classification_report(
    y_valid,
    predicted,
    target_names=['negative', 'neutral', 'positive'],
)
print(report_dict)

              precision    recall  f1-score   support

    negative       0.76      0.63      0.69        35
     neutral       0.51      0.56      0.54        32
    positive       0.55      0.61      0.58        28

    accuracy                           0.60        95
   macro avg       0.61      0.60      0.60        95
weighted avg       0.61      0.60      0.60        95



In [None]:
from sklearn import metrics

# Generate the classification report as a dictionary so that individual
# scores can be looked up by key
report = metrics.classification_report(y_valid, predicted,
                                            target_names=['negative', 'neutral', 'positive'],
                                            output_dict=True)

# Extract the accuracy
accuracy = report['accuracy']

# Extract the macro-averaged F1-score
average_f1_score = report['macro avg']['f1-score']

# Extract class-wise F1-scores
f1_score_negative = report['negative']['f1-score']
f1_score_neutral = report['neutral']['f1-score']
f1_score_positive = report['positive']['f1-score']

# Generate the confusion matrix (raw counts)
conf_matrix = metrics.confusion_matrix(y_valid, predicted)

# Normalizing the confusion matrix over the true labels (each row sums to 1).
# Go through the `metrics` module: the bare name `confusion_matrix` is not
# imported at this point in the notebook, so calling it directly raises a
# NameError on a fresh kernel.
conf_matrix_normalized = metrics.confusion_matrix(y_valid, predicted, normalize='true')

In [None]:
# Report the headline metrics of the pipeline, rounded to two decimals
print(f"Accuracy: {accuracy:.2f}")
print(f"Macro-averaged F1 score: {average_f1_score:.2f}")

# One line per class, assembled from adjacent f-string pieces
print(
    "Class-wise F1 scores:\n"
    f"negative: {f1_score_negative:.2f}\n"
    f"neutral: {f1_score_neutral:.2f}\n"
    f"positive: {f1_score_positive:.2f}"
)

# Show the normalized confusion matrix with readable row / column labels
print("Confusion_matrix:")
print(
    pd.DataFrame(
        conf_matrix_normalized,
        index=['negative', 'neutral', 'positive'],
        columns=['negative', 'neutral', 'positive'],
    ).round(2)
)


Accuracy: 0.60
****************************************
Macro-averaged F1 score: 0.60
****************************************
Class-wise F1 scores:
negative: 0.69
neutral: 0.54
positive: 0.58
****************************************
Confusion_matrix:
          negative  neutral  positive
negative      0.63     0.26      0.11
neutral       0.12     0.56      0.31
positive      0.11     0.29      0.61


## Grid Search - Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
# Search space for the grid search; each key is "<pipeline step>__<parameter>"
parameters = {
    # unigrams only vs. unigrams plus bigrams
    'vect__ngram_range': [(1, 1), (1, 2)],
    # with and without inverse-document-frequency re-weighting
    'tfidf__use_idf': [True, False],
    # two candidate values for the classifier's alpha
    'clf__alpha': [1e-2, 1e-3],
}

In [None]:
# 5-fold cross-validated grid search over the pipeline; n_jobs=-1 runs the fits in parallel
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=5, n_jobs=-1)

In [None]:
# Run the search on the training split only
gs_clf = gs_clf.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Examine the best parameters and score
best_parameters, best_score = gs_clf.best_params_, gs_clf.best_score_
print("Best Parameters:", best_parameters)
print("Best Cross-Validation Score:", best_score)

# 2. Evaluate the best estimator found by the search on the validation set
best_estimator = gs_clf.best_estimator_
predictions = best_estimator.predict(X_valid)

# Overall accuracy on the validation set
accuracy = accuracy_score(y_valid, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Plain-text classification report (no target names are passed here)
print("Classification Report:")
print(classification_report(y_valid, predictions))

# Same report as a dictionary, keyed by readable class names
class_names = ['negative', 'neutral', 'positive']
report = metrics.classification_report(
    y_valid,
    predictions,
    target_names=class_names,
    output_dict=True,
)

# Macro-averaged F1 and the per-class F1 scores
average_f1_score = report['macro avg']['f1-score']
f1_score_negative, f1_score_neutral, f1_score_positive = (
    report[name]['f1-score'] for name in class_names
)
print(f"Macro-averaged F1 score: {average_f1_score:.2f}")
print(
    "Class-wise F1 scores:\n"
    f"negative: {f1_score_negative:.2f}\n"
    f"neutral: {f1_score_neutral:.2f}\n"
    f"positive: {f1_score_positive:.2f}"
)

# Confusion matrix of raw counts
conf_matrix = confusion_matrix(y_valid, predictions)
print("Confusion Matrix:")
print(pd.DataFrame(conf_matrix, index=class_names, columns=class_names).round(2))

# Confusion matrix normalized over the true labels
conf_matrix_normalized = confusion_matrix(y_valid, predictions, normalize='true')
print("Normalized Confusion matrix:")
print(pd.DataFrame(conf_matrix_normalized, index=class_names, columns=class_names).round(2))


Best Parameters: {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
************************************************************
Best Cross-Validation Score: 0.593719298245614
************************************************************
Accuracy: 0.63
************************************************************
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.77      0.74        35
           1       0.59      0.50      0.54        32
           2       0.57      0.61      0.59        28

    accuracy                           0.63        95
   macro avg       0.62      0.63      0.62        95
weighted avg       0.63      0.63      0.63        95

************************************************************
Macro-averaged F1 score: 0.62
************************************************************
Class-wise F1 scores:
negative: 0.74
neutral: 0.54
positive: 0.59
******************************************