In [None]:
import pandas as pd
import numpy as np 

# NLTK for pre-processing
import nltk


import string
# Spacy for pre-processing
import spacy
from spacy.lang.en import English

import warnings
warnings.filterwarnings('ignore') 

from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import  train_test_split

# Raw reviews file; it is read below as tab-separated text.
SOURCE_FILE_PATH = "reviews.csv"
# Destinations for the training and validation splits of the balanced data.
TRAIN_FILE_PATH = "train.csv"
VALID_FILE_PATH = "valid.csv"

In [None]:
# Load the raw reviews; the source file is tab-delimited.
df = pd.read_csv(SOURCE_FILE_PATH, delimiter='\t')
df

### Sentiment classification
+ 0 = negative (ratings 1 & 2)
+ 1 = neutral (rating 3)
+ 2 = positive (ratings 4 & 5) 

In [None]:
def map_to_sentiment(rating_value):
    """Translate a numeric rating into a sentiment class label.

    Args:
        rating_value: Numeric rating of a review.

    Returns:
        2 when the rating is at least 4, 1 when it equals 3, and 0 for
        every other value.
    """
    is_positive = rating_value >= 4
    if is_positive:
        return 2
    # Anything that is neither positive nor exactly 3 is labelled negative.
    return 1 if rating_value == 3 else 0

# Derive the sentiment label of every review from its rating.
df['Sentiment'] = df['RatingValue'].map(map_to_sentiment)
df

In [None]:
# Distinct sentiment labels together with the number of reviews carrying each.
sentiment_labels = df['Sentiment'].to_numpy()
np.unique(sentiment_labels, return_counts=True)

In [None]:
# Number the reviews starting from 1, based on the current index values.
df = df.assign(Number=df.index + 1)
df

In [None]:
# Keep only the columns needed for modelling and index them by review number.
# set_index is chained rather than applied with inplace=True: mutating a frame
# that was just sliced out of another one can raise SettingWithCopyWarning,
# which the global warnings filter in this notebook would silently hide.
df = df[['Number', 'Sentiment', 'Review']].set_index('Number')
df

### Pre-processing 

In [None]:
# nltk.download('stopwords')
# from nltk.corpus import stopwords

# nlp = spacy.load('en_core_web_sm')
# punctuations = string.punctuation

# STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
# SYMBOLS = string.punctuation

# def data_cleaning(doc):
#     doc = nlp(doc, disable=['parser', 'ner'])
#     tokens = [str(token).lower() for token in doc]
#     tokens = [token for token in tokens if token not in STOPLIST and token not in SYMBOLS]
#     return ' '.join(tokens)

# df['Review'] = df['Review'].apply(data_cleaning)
# df

In [None]:
# Count how many reviews fall into each sentiment class.
negative_num = len(df[df['Sentiment'] == 0])
neutral_num = len(df[df['Sentiment'] == 1])
positive_num = len(df[df['Sentiment'] == 2])

for class_name, class_total in (
    ("Negative", negative_num),
    ("Neutral", neutral_num),
    ("Positive", positive_num),
):
    print(f"{class_name} review count: {class_total}")

In [None]:
# Every class is balanced to 300 observations; for the positive class,
# draw 300 reviews without replacement.
positive_reviews = df.loc[df['Sentiment'] == 2]
positive_samples = positive_reviews.sample(n=300, random_state=42)

In [None]:
# Draw 300 neutral reviews with replacement, so individual reviews may repeat.
neutral_reviews = df.loc[df['Sentiment'] == 1]
neutral_samples = neutral_reviews.sample(n=300, replace=True, random_state=42)

In [None]:
# Whole copies of the negative class that fit into 300 rows, and the number
# of rows still missing once those copies are stacked.
review_copies_needed, remainder = divmod(300, negative_num)

In [None]:
# Tile the full negative class, then top it up with its last `remainder` rows.
negative_reviews = df.loc[df['Sentiment'] == 0]
negative_parts = [negative_reviews for _ in range(review_copies_needed)]
negative_parts.append(negative_reviews.tail(remainder))
negative_samples = pd.concat(negative_parts)

In [None]:
# Stack the per-class samples into a single balanced frame.
balanced_data = pd.concat([negative_samples, neutral_samples, positive_samples])
# End the cell with the frame itself (not print) so the notebook renders it
# as a table, the same way the other cells display `df`.
balanced_data

### Split the data into training and validation sets

In [None]:
# The balanced frame contains repeated rows: the negative class is tiled and
# the neutral class is drawn with replacement, and every repeat keeps the
# 'Number' index label of the review it was copied from. Splitting row by row
# would let copies of one review land in both train and validation, leaking
# training text into the evaluation set and inflating the reported scores.
# Split the unique review numbers instead, then send every copy of a review
# to the side its number was assigned to.
# NOTE: the 70/30 ratio now applies to distinct reviews, so the row counts of
# the two files are only approximately 70/30.
review_ids = balanced_data.index.unique().to_numpy()
train_ids, valid_ids = train_test_split(review_ids, test_size=0.3, random_state=42)

# Shuffle each side so the rows are not left grouped by class.
train = balanced_data[balanced_data.index.isin(train_ids)].sample(frac=1, random_state=42)
valid = balanced_data[balanced_data.index.isin(valid_ids)].sample(frac=1, random_state=42)

train.to_csv(TRAIN_FILE_PATH)
valid.to_csv(VALID_FILE_PATH)

### Modeling 

In [None]:
# Reload the saved training split and separate the review text from the labels.
train = pd.read_csv(TRAIN_FILE_PATH)
X_train, y_train = train['Review'], train['Sentiment']

In [None]:
# Token counts -> TF-IDF weighting -> logistic-regression classifier.
pipeline_steps = [
    ('tokenizer', CountVectorizer()),
    ('tdidftransformer', TfidfTransformer()),
    ('categicalmodel', LogisticRegression()),
]
pipe = Pipeline(steps=pipeline_steps)
pipe.fit(X_train, y_train)

In [None]:
def evaluate(filename, model=None):
    """Score a fitted classifier on a labelled review CSV and print a report.

    The report contains accuracy, macro-averaged F1, the F1 score of each
    sentiment class, and the confusion matrix.

    Args:
        filename: Path of a CSV file that has a 'Review' column with the
            text and a 'Sentiment' column with the labels
            (0 = negative, 1 = neutral, 2 = positive).
        model: Fitted estimator exposing ``predict``. When omitted, the
            notebook-level ``pipe`` is used.

    Returns:
        None. The report is printed to stdout.
    """
    classifier = pipe if model is None else model
    # Pin the label order explicitly. Without it scikit-learn only reports the
    # classes that actually occur in the data, so a file missing one class
    # would yield fewer than three entries and the [0..2] lookups below would
    # raise IndexError.
    class_labels = [0, 1, 2]

    new_df = pd.read_csv(filename)
    X_new = new_df['Review']
    y_new = new_df['Sentiment']

    y_pred = classifier.predict(X_new)
    print('Accuracy:', metrics.accuracy_score(y_new, y_pred))
    print('Average F1 Score:', metrics.f1_score(y_new, y_pred, average='macro'))
    f1_scores = metrics.f1_score(y_new, y_pred, labels=class_labels, average=None)
    print('Class-wise F1 scores:')
    print(f'  negative: {f1_scores[0]:.3f}')
    print(f'   neutral: {f1_scores[1]:.3f}')
    print(f'    positive: {f1_scores[2]:.3f}')
    # Rows are the true classes, columns the predicted classes.
    confusion_matrix = metrics.confusion_matrix(y_new, y_pred, labels=class_labels)
    print('Confusion_matrix:')
    print('            negative neutral positive')
    print(f'negative     {confusion_matrix[0][0]}        {confusion_matrix[0][1]}       {confusion_matrix[0][2]}')
    print(f'neutral      {confusion_matrix[1][0]}        {confusion_matrix[1][1]}       {confusion_matrix[1][2]}')
    print(f'positive     {confusion_matrix[2][0]}        {confusion_matrix[2][1]}       {confusion_matrix[2][2]}')


In [None]:
# Print the evaluation report for the validation split saved earlier.
evaluate(VALID_FILE_PATH)