## Import Necessary Libraries

In [63]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from transformers import pipeline
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import pickle
# Shared WordNet lemmatizer instance, used by data_processing() below.
lemmatizer  = WordNetLemmatizer()



## Load Datasets

In [None]:
# Both CSV files are read from the notebook's working directory.
reviews_train = pd.read_csv('./1429_1.csv') # Used for training and fine-tuning
reviews_test = pd.read_csv('./Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv') # Used for testing
# Not loaded: the May19 export below duplicates information already present.
#reviews_2 = pd.read_csv('./Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv') DUPLICATE INFORMATION


## Exploratory Data Analysis

In [None]:
# First look at the raw training data: size, column names and one random row.
display(reviews_train.shape)
display(reviews_train.columns)
display(reviews_train.sample(1))

# Keep only the three columns this notebook works with.
reviews = reviews_train[['categories', 'reviews.rating', 'reviews.text']]
display(reviews.dtypes)

# Boolean mask of missing cells and, from it, the per-column missing counts.
missing_values = reviews.isnull()
missing_counts = missing_values.sum()

# How many columns contain at least one missing value.
columns_with_missing = (missing_counts > 0).sum()
# True only when every column has a non-zero missing count.
all_columns_missing = missing_counts.all()
# Missing cells summed over all columns.
total_missing_values = missing_counts.sum()

# Report the findings.
print("Missing Values in Each Column:\n", missing_counts)
print("\nNumber of Columns with Missing Values:", columns_with_missing)
print("All Columns Have Missing Values:", all_columns_missing)
print("\nTotal Missing Values in the DataFrame:", total_missing_values)

## Feature Preparation

### Erase rows with missing values

In [66]:
# The training frame was already reduced to the three columns used later, so a
# missing value in any of them disqualifies the row.
reviews = reviews.dropna().reset_index()
# The test frame still carries every column of the raw CSV. A plain dropna()
# would discard rows because of NaNs in columns this notebook never reads, so
# the check is restricted to the review text (the only test column used below).
reviews_test = reviews_test.dropna(subset=['reviews.text']).reset_index()

### Bonus: summarize by Categories

In [None]:
# Group by 'category' and 'rating' and concatenate text entries
# For every (categories, rating) pair, glue all review texts together into a
# single space-separated string; the group keys come back as regular columns.
grouped_reviews = (
    reviews
    .groupby(['categories', 'reviews.rating'])['reviews.text']
    .agg(' '.join)
    .reset_index()
)


#### Import summarization model from Hugging Face

In [None]:
# Load the summarization pipeline with a pre-trained model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summary = []
text = grouped_reviews['reviews.text']

for sentence in text.values:
    summary.append(summarizer(sentence))

# Add summarized text to a new column
grouped_reviews['summary'] = pd.Series(summary)
# Display the result
print(grouped_reviews[['category', 'rating', 'summary']])

#### Export dataframe into csv for plotting purposes

In [None]:
# Write the grouped frame to the working directory (the DataFrame index is
# written as the first CSV column because no index argument is passed).
grouped_reviews.to_csv('categories_summary.csv')

### Change values of column 'reviews.rating' for standardization purposes

In [None]:

# Collapse the numeric star ratings into three sentiment classes.
# NOTE(review): 3 stars is mapped to 'negative' and 4 stars to 'neutral' -
# confirm this is the intended labelling.
reviews['reviews.rating'] = reviews['reviews.rating'].replace({1.0: 'negative', 2.0: 'negative', 3.0: 'negative', 
                                               4.0: 'neutral', 5.0: 'positive'})

# Class distribution after the mapping.
display(reviews['reviews.rating'].value_counts())


### Display the information about our cleaned dataset

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only added a stray "None" to the cell output.
reviews.info()


### Build a smaller sample for efficiency and training purposes. Balance the dataset through the min of value counts

In [None]:
# Size of the rarest sentiment class; every class is down-sampled to this size.
min_samples = reviews['reviews.rating'].value_counts().min()

# Draw the same number of rows from each class. Sampling is done WITHOUT
# replacement: no class has fewer than min_samples rows, and duplicated rows
# would otherwise end up on both sides of the later train/test split.
positive_reviews = reviews[reviews['reviews.rating'] == 'positive'].sample(n=min_samples, random_state=42)
negative_reviews = reviews[reviews['reviews.rating'] == 'negative'].sample(n=min_samples, random_state=42)
neutral_reviews = reviews[reviews['reviews.rating'] == 'neutral'].sample(n=min_samples, random_state=42)

# Concatenate the sampled DataFrames
balanced_reviews = pd.concat([positive_reviews, negative_reviews, neutral_reviews])

# Shuffle the resulting DataFrame to mix the classes
balanced_reviews = balanced_reviews.sample(frac=1, random_state=42).reset_index(drop=True)

# balanced_reviews now holds min_samples rows of each sentiment class
print(balanced_reviews['reviews.rating'].value_counts())
reviews_sample = balanced_reviews


## Define our functions

In [13]:
def data_cleaning(text):
    """
    Normalise a piece of text with a fixed sequence of regex substitutions.

    The input is converted with str() and then:
    - every run of characters that is neither an ASCII letter nor whitespace
      (digits, punctuation, symbols) is replaced by one space
    - words consisting of a single character are removed
    - runs of whitespace are collapsed to one space and the ends are trimmed
    - the result is lower-cased

    Argument: text/corpus/document/sentence; string
    Returns: the cleaned, lower-case string
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'[^A-Za-z\s]+', ' '),  # anything that is not a letter or whitespace
        (r'\b\w\b', ''),         # stand-alone single characters
        (r'\s+', ' '),           # leftover runs of whitespace
    )

    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip().lower()


def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``, ready for lemmatize().

    The word is tagged on its own with nltk.pos_tag and only the first letter
    of the resulting tag is inspected: J -> adjective, N -> noun, V -> verb,
    R -> adverb. Any other tag falls back to noun.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0]

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN

_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def data_processing(text):
    """
    This function processes each sentence in the following order:
    1. Tokenize the sentence into words.
    2. Drop every token found in the 'stopwords.words("english")' list.
    3. Lemmatize each remaining token, using its part of speech
       (see get_wordnet_pos).
    4. Join the lemmas with single spaces to rebuild the sentence.

    Argument: text/corpus/document/sentence; string
    Returns: the processed sentence as a single string
    """
    # The stop-word list used to be re-read and scanned linearly for every
    # token; it is now loaded once and queried as a set.
    stop_words = _english_stopwords()

    tokens = nltk.word_tokenize(text)
    lemmatized_words = [
        lemmatizer.lemmatize(word, get_wordnet_pos(word))
        for word in tokens
        if word not in stop_words
    ]

    # Join the words back into a single string
    return ' '.join(lemmatized_words)



def plot_confusion_matrix(y_true, y_pred, labels=None, normalize=False):
    """
    Plots the confusion matrix using seaborn heatmap.

    Args:
    y_true: list or array of true labels
    y_pred: list or array of predicted labels
    labels: list of label names fixing the row/column order (optional);
        defaults to ['negative', 'neutral', 'positive']
    normalize: boolean, whether to normalize the confusion matrix; when True
        each row (true class) is scaled to sum to 1, otherwise raw counts
        are shown
    """
    if labels is None:
        labels = ['negative', 'neutral', 'positive']

    # Compute confusion matrix (rows: true labels, columns: predicted labels)
    cm = confusion_matrix(y_true, y_pred, labels=labels,
                          normalize='true' if normalize else None)

    # Plot the heatmap; normalized cells are fractions, so they need a float format
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt=".2f" if normalize else "d", cmap='Blues',
                xticklabels=labels, yticklabels=labels)

    # Add labels and title
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

def plot_vec_count_matrix(X_vectorized, vectorizer, top_n=10):
    """
    Plot a bar chart of the terms with the largest column totals.

    Args:
    X_vectorized: sparse document-term matrix produced by `vectorizer`
    vectorizer: fitted vectorizer; get_feature_names_out() supplies the term
        for each column of X_vectorized
    top_n: number of terms to show (default 10)
    """
    # Sum each column directly on the sparse matrix. Converting the whole
    # matrix to a dense DataFrame first needs documents x vocabulary cells of
    # memory only to add them up. asarray/ravel flattens the 1 x n result.
    column_totals = np.asarray(X_vectorized.sum(axis=0)).ravel()
    term_sums = pd.Series(column_totals, index=vectorizer.get_feature_names_out())
    term_sums = term_sums.sort_values(ascending=False)

    # Plot the most common terms
    plt.figure(figsize=(10, 6))
    term_sums.head(top_n).plot(kind='bar', color='skyblue')
    plt.title(f"Top {top_n} Most Frequent Terms")
    plt.xlabel("Terms")
    plt.ylabel("Frequency")
    plt.show()


## Data cleaning and Data Processing

In [None]:
# Training sample: regex clean-up first, then tokenise / remove stop words /
# lemmatise. The two stages are kept as separate series because the merely
# cleaned text is reused later as input to the pre-trained pipeline.
reviews_cleaned = reviews_sample['reviews.text'].apply(data_cleaning)
reviews_processed = reviews_cleaned.apply(data_processing)

# The same two stages applied to the test reviews.
reviews_test_cleaned = reviews_test['reviews.text'].apply(data_cleaning)
reviews_test_processed = reviews_test_cleaned.apply(data_processing)

# Last expression of the cell: preview of the first rows of both results.
reviews_processed.head(), reviews_test_processed.head()

## Data Transformation

### Train and Test sets

In [15]:
# Features: processed review text; target: the sentiment label of each review.
X = reviews_processed
y = reviews_sample['reviews.rating']

# 75% / 25% split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

### TF-IDF Transformation

In [None]:
# Create the Bag of Words model
vectorizer = TfidfVectorizer()
vectorizer_count = CountVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

X_train_count = vectorizer_count.fit_transform(X_train)
X_test_count = vectorizer_count.transform(X_test)

with open('TF-IDF_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

plot_vec_count_matrix(X_train_count, vectorizer_count)

## Prediction Models

### Multinomial NB with gridSearch CV

#### Model Training and Cross Validation

In [None]:
# Multinomial Naive Bayes, tuned through 5-fold cross-validation.
model_NB = MultinomialNB()

# Search space: one candidate value per hyperparameter.
param_grid_NB = dict(
    alpha=[0.21],      # additive smoothing strength
    fit_prior=[True],  # learn the class priors from the data
)

# Accuracy-scored grid search, run in a single process.
grid_search_NB = GridSearchCV(model_NB, param_grid_NB,
                              scoring='accuracy', cv=5, n_jobs=1)
grid_search_NB.fit(X_train_tfidf, y_train)

# Optional: uncomment to persist the fitted search object.
# with open('MultinomialNB.pkl', 'wb') as f:
#     pickle.dump(grid_search_NB, f)

# Winning hyperparameters and their mean cross-validated accuracy.
print("Best Parameters: ", grid_search_NB.best_params_)
print("Best Score: ", grid_search_NB.best_score_)


#### Model prediction and Metrics

In [None]:
# Score the tuned Naive Bayes model on the held-out split.
y_pred_NB = grid_search_NB.predict(X_test_tfidf)
acc_NB = accuracy_score(y_test, y_pred_NB)
report_NB = classification_report(y_test, y_pred_NB)

print('Accuracy: ', acc_NB)
print("\nClassification Report NB:\n", report_NB)
plot_confusion_matrix(y_test, y_pred_NB)

### Logistic Regression with GridSearch CV

#### Model Training and Cross Validation

In [None]:
# Create the Logistic Regression model
model_LR = LogisticRegression()

# Define the hyperparameters to search
param_grid_LR = {
    'C': [1, 2],  # Inverse of regularization strength, smaller values mean stronger regularization
    'max_iter': [100, 120]  # Maximum number of iterations for convergence
}

# Create the GridSearchCV object
grid_search_LR = GridSearchCV(estimator=model_LR, 
                            param_grid=param_grid_LR, 
                            cv=5, 
                            scoring='accuracy', 
                            n_jobs=-1)

# Train the model
grid_search_LR.fit(X_train_tfidf, y_train)

# with open('LogisticRegression.pkl', 'wb') as f:
#     pickle.dump(grid_search_LR, f)

# Print the best parameters and best score
print("Best Parameters: ", grid_search_LR.best_params_)
print("Best Score: ", grid_search_LR.best_score_)


#### Model prediction and Metrics

In [None]:
# Score the tuned Logistic Regression model on the held-out split.
y_pred_LR = grid_search_LR.predict(X_test_tfidf)
acc_LR = accuracy_score(np.asarray(y_test), y_pred_LR)
report_LR = classification_report(np.asarray(y_test), y_pred_LR)

print('Accuracy: ', acc_LR)
print("\nClassification Report LR:\n", report_LR)
plot_confusion_matrix(y_test, y_pred_LR)

### SVC with GridSearch CV

#### Model training and Cross Validation

In [None]:
# Create the Logistic Regression model
model_SVC = SVC()

# Define the hyperparameters to search
param_grid_SVC = {
            'kernel': ['linear', 
            'rbf', 'sigmoid']
}

# Create the GridSearchCV object
grid_search_SVC = GridSearchCV(estimator=model_SVC, 
                    param_grid=param_grid_SVC, 
                    cv=5, 
                    scoring='accuracy')

# Train the model
grid_search_SVC.fit(X_train_tfidf, y_train)

# with open('SVC.pkl', 'wb') as f:
#     pickle.dump(grid_search_SVC, f)

# Print the best parameters and best score
print("Best Parameters: ", grid_search_SVC.best_params_)
print("Best Score: ", grid_search_SVC.best_score_)

#### Model Prediction and Metrics

In [None]:
# Score the tuned SVC model on the held-out split.
y_pred_SVC = grid_search_SVC.predict(X_test_tfidf)
acc_SVC = accuracy_score(np.asarray(y_test), y_pred_SVC)
report_SVC = classification_report(np.asarray(y_test), y_pred_SVC)

print('Accuracy: ', acc_SVC)
print("\nClassification Report SVC:\n", report_SVC)
plot_confusion_matrix(y_test, y_pred_SVC)

### Random Forest with GridSearch CV

#### Model Training and Cross Validation

In [None]:
# Create the RandomForestClassifier model
model_RF = RandomForestClassifier()

# Define the hyperparameters to search
param_grid_RF = {
    'n_estimators': [100],  # Number of trees in the forest
    'max_depth': [2],  # Maximum depth of the tree
    'min_samples_split': [2],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True]  # Whether bootstrap samples are used when building trees
}

# Create the GridSearchCV object
grid_search_RF = GridSearchCV(estimator=model_RF, 
                            param_grid=param_grid_RF, 
                            cv=5, 
                            scoring='accuracy', 
                            n_jobs=1, 
                            verbose=1)

#Train the model
grid_search_RF.fit(X_train_tfidf, y_train)

with open('RandomForest.pkl', 'wb') as f:
    pickle.dump(grid_search_RF, f)

# Print the best parameters and best score
print("Best Parameters: ", grid_search_RF.best_params_)
print("Best Score: ", grid_search_RF.best_score_)


#### Model Prediction and Metrics

In [None]:
# Score the tuned Random Forest model on the held-out split.
y_pred_RF = grid_search_RF.predict(X_test_tfidf)

acc_RF = accuracy_score(np.asarray(y_test), y_pred_RF)
# Print the computed score; the accuracy_score function object itself was
# being printed here before.
print('Accuracy: ', acc_RF)

print("\nClassification Report RF:\n", classification_report(y_test, y_pred_RF))
plot_confusion_matrix(y_test, y_pred_RF)

### Pipeline Pre-Trained Model without Fine-Tuning

In [None]:
# Download pretrained model
model_name='cardiffnlp/twitter-roberta-base-sentiment'

# Instanciate Classifier
classifier = pipeline('sentiment-analysis', model=model_name, truncation=True, padding=True)

# Execute Classifier
ratings = classifier(reviews_cleaned.tolist())

In [None]:
# Sanity check that inputs and labels have the same number of rows.
# NOTE: in a notebook only the value of the last expression is echoed, so the
# first length is computed but not shown.
len(reviews_cleaned)
len(reviews_sample['reviews.rating'])

### Pipeline with Fine-Tuning

In [None]:
# Pull the predicted label out of every pipeline result. `ratings` itself is
# left untouched so this cell can be re-run without re-running the classifier.
rating_labels = [rating['label'] for rating in ratings]

# The pipeline emits string labels ('LABEL_0', 'LABEL_1', 'LABEL_2' for this
# checkpoint: 0 = negative, 1 = neutral, 2 = positive). Comparing them with the
# integers 0/1/2 never matched, so every review was scored as 'negative'.
# Plain class names and bare digits are accepted too; lookup is lower-cased.
pipeline_label_to_sentiment = {
    'label_0': 'negative', '0': 'negative', 'negative': 'negative',
    'label_1': 'neutral', '1': 'neutral', 'neutral': 'neutral',
    'label_2': 'positive', '2': 'positive', 'positive': 'positive',
}
# Unknown labels keep the original fallback of 'negative'.
predicted_labels_pipeline_no_tun = [
    pipeline_label_to_sentiment.get(str(label).lower(), 'negative')
    for label in rating_labels
]

#true_labels = reviews_sample['reviews.rating']
acc_pipeline_no_tun = accuracy_score(np.asarray(reviews_sample['reviews.rating']), predicted_labels_pipeline_no_tun)
print('Accuracy: ', acc_pipeline_no_tun)

print("\nClassification Report Pipeline Model:\n", classification_report(reviews_sample['reviews.rating'], predicted_labels_pipeline_no_tun))
plot_confusion_matrix(reviews_sample['reviews.rating'], predicted_labels_pipeline_no_tun)


#### Quick Data Transformation for label compatibility in the training process

In [None]:
### Transformation required for pipeline training ###
# Label inputs must be integers.
# NOTE(review): this encoding (negative=2, neutral=1, positive=0) appears to be
# the reverse of the checkpoint's own head (0=negative ... 2=positive) - confirm
# it is intended. The decoding cells further down rely on this exact encoding,
# so both must be changed together.
labels_for_pip_with_tun = reviews_sample['reviews.rating'].replace({'negative':2, 'neutral': 1, 'positive':0})

# Repeat Train and Test set split (80% / 20% this time, same seed)
X = reviews_processed
y = labels_for_pip_with_tun

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

#### Fine-Tuning

In [None]:
# Load pre-trained model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
model_pretrained = RobertaForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment', num_labels=3)

# Tokenize the inputs
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, return_tensors=None)
val_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, return_tensors=None)

# Create Dataset objects
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'label': y_train.tolist()
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'label': y_test.tolist()
})

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=32,   # batch size per device during training
    per_device_eval_batch_size=32,    # batch size for evaluation
    weight_decay=0.01,                # strength of weight decay
    evaluation_strategy="epoch"     # evaluate every epoch
)

# Create Trainer instance
trainer = Trainer(
    model=model_pretrained,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()


#### Model predictions

In [None]:
# Use the trainer's predict method
# Run the fine-tuned model over the validation set.
predictions_pipeline_with_tun = trainer.predict(val_dataset)

# Convert the raw model outputs into class probabilities (softmax over the
# last axis).
softmax = torch.nn.Softmax(dim=-1)
logits = torch.tensor(predictions_pipeline_with_tun.predictions)
probs = softmax(logits)

# The predicted class is the index with the highest probability.
predicted_labels_tensor = probs.argmax(dim=-1)


#### Replace output labels from integers to positive, negative, and neutral

In [62]:
# Translate predicted class ids back to sentiment names (2 -> negative,
# 1 -> neutral, 0 -> positive); any other id falls back to 'negative'.
predicted_labels_pipeline_with_tun = [
    {2: 'negative', 1: 'neutral', 0: 'positive'}.get(int(label), 'negative')
    for label in predicted_labels_tensor
]

#### Redefine input labels from integers to positive, negative and neutral

In [72]:
# Translate the integer targets of the validation split back to sentiment
# names (2 -> negative, 1 -> neutral, 0 -> positive); any other value falls
# back to 'negative'.
true_labels = [
    {2: 'negative', 1: 'neutral', 0: 'positive'}.get(label, 'negative')
    for label in y_test
]

#### Metrics

In [None]:
# Compare the fine-tuned model's predictions with the validation targets.
acc_pipeline_with_tun = accuracy_score(np.asarray(true_labels), predicted_labels_pipeline_with_tun)
report_pipeline_with_tun = classification_report(true_labels, predicted_labels_pipeline_with_tun)

print('Accuracy: ', acc_pipeline_with_tun)
print("\nClassification Report Pipeline Model:\n", report_pipeline_with_tun)
plot_confusion_matrix(true_labels, predicted_labels_pipeline_with_tun)