<a href="https://colab.research.google.com/github/manjusys/Andrio-apps-llm-gemma-review-/blob/main/Model_ML_LLM_Gemma_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'google-play-store-apps:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F49864%2F274957%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241005%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241005T075453Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9990b97f9dae87905048ed22e723a51eb412ac218c9d9670485470e9f28dc273e19c69278086ba0627c4b07fb332139166c45851b9dec94872805f982f5c05a3e5fea0c147b36f714b219107cf94d0c2fed9da666ad0c8eb96ae529ce7cc3310a4fac72a0b253b0b72d9446f9daf64da004d52fd907b4c42fa55eda8cf12276ec60fa7111f2268417e0d60d7314155523ee7971e08b030ca01a1ed2a46139802a2a480f0f945919d4efd5a080979446b01e72193c25d89d1f9c32bab4aa1b195371a06db7ee98be0fbb5700eafc7b623439345d6a71fb6dc8936c99061115d8b99e8079fc33698f70fdabe32d5c56fb5d48bcfb2ea02679d7d6b29d51f0cf701'

# Local directories that stand in for Kaggle's input and working locations.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

# Start from a clean input directory: try to unmount it (error output is discarded),
# delete whatever is left, then (re)create both directories.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the two directories as ../input and ../working.
# A link that already exists is left untouched.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a literal ':' inside the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be missing; 0 means "size unknown" and keeps the
            # progress bar empty instead of crashing on int(None).
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure every byte is on disk and rewind before the archive
            # readers start consuming the temporary file.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing "tarfile" here would
                # replace the imported module and break the next iteration.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


<a id="1"></a>
# <div style="text-align:center; border-radius:15px 50px; padding:7px; color:white; margin:0; font-size:110%; font-family:Pacifico; background-color:#0073e6; overflow:hidden"><b> LLM Gemma - Analysis Review Google Play Score</b></div>

# Part 1 - Business Problem: Sentiment Analysis for App Reviews

**Context:**
The mobile app industry is highly competitive, with millions of applications available across various platforms. User reviews provide valuable feedback on user experience, functionality, and satisfaction. Analyzing these reviews is crucial for app developers and companies to understand user sentiment, improve user engagement, and guide product development.

**Problem Statement:**
The goal is to build a sentiment classification model that uses app reviews to automatically categorize user sentiment into positive, negative, or neutral categories. This will enable companies to gain real-time insights into how their apps are being perceived and take proactive measures to enhance user satisfaction and retention.

**Business Questions:**
1. **How are users reacting to the app?**
   - Analyze the proportion of positive, negative, and neutral reviews.
   
2. **What factors influence user sentiment the most?**
   - Use the sentiment polarity and subjectivity scores to identify patterns and key factors driving user satisfaction or dissatisfaction.

3. **How can the app's performance be improved based on user feedback?**
   - By categorizing sentiment, identify common issues in negative reviews and highlight features appreciated in positive reviews.

4. **How can the app development and marketing teams use sentiment analysis to guide decisions?**
   - Generate insights to support data-driven decisions for feature improvements, bug fixes, or marketing strategies.

**Solution Approach:**
Build an LLM-based classifier on top of Gemma (Google's family of open language models) to automatically classify app reviews by user sentiment. The model will process the `Translated_Review` text and provide a sentiment classification which, together with the sentiment polarity and subjectivity scores, gives a comprehensive view of the user experience.

In [None]:
# Installing packages
!pip install transformers
!pip install sentence_transformers
!pip install transformers accelerate
!pip install bitsandbytes
!pip install faiss-cpu
!pip install torch
!pip install PyPDF2
!pip install nltk
!pip install watermark
!pip install accelerate deepspeed
!pip install transformers sentence_transformers faiss-cpu torch PyPDF2 nltk

In [None]:
# Import of libraries

# System libraries
import re
import unicodedata
import itertools
from datasets import Dataset

# Library for file manipulation
import pandas as pd
import numpy as np
import pandas

# Data visualization
import seaborn as sns
import matplotlib.pylab as pl
import matplotlib as m
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib import pyplot as plt

# Configuration for graph width and layout
sns.set_theme(style='whitegrid')
palette='viridis'

## LLM
# Importing necessary libraries from PyTorch and Hugging Face Transformers
# PyTorch is a deep learning framework used for model training and inference
import torch

# AutoTokenizer: automatically loads the pre-trained tokenizer used to encode text
# AutoModelForSequenceClassification: loads a pre-trained model with a sequence-classification head
# Trainer / TrainingArguments: Hugging Face training loop and its configuration object
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer

# Warnings remove alerts
import warnings
warnings.filterwarnings("ignore")

# Python version
from platform import python_version
print('Python version in this Jupyter Notebook:', python_version())

# Load library versions
import watermark

# Library versions
%reload_ext watermark
%watermark -a "Library versions" --iversions

# Part 2 - Database

In [None]:
# Database
df = pd.read_csv("/kaggle/input/google-play-store-apps/googleplaystore_user_reviews.csv")

# Viewing dataset
df

In [None]:
# Viewing the first 5 rows
df.head()

In [None]:
# Viewing the last 5 rows
df.tail()

In [None]:
# Info data
df.info()

In [None]:
# Type data
df.dtypes

In [None]:
# Viewing rows and columns
df.shape

# Part 3 - Data cleaning

In [None]:
print("Checking for missing values in each column:")
print(df.isnull().sum())

In [None]:
# Calculate the total number of rows and the number of missing values in the 'Translated_Review' column.
# Then, print the percentage of missing values in the 'Translated_Review' column.
total_rows = len(df)
missing_translated_review = df['Translated_Review'].isnull().sum()
print(f"Percentage of missing Translated_Review: {(missing_translated_review/total_rows)*100:.2f}%")

In [None]:
# Drop rows that have missing values in the 'Translated_Review' or 'Sentiment' columns
df = df.dropna(subset=['Translated_Review', 'Sentiment'])

# Print the count of missing values for each column after dropping the rows with missing data
print(df.isnull().sum())

# Display the DataFrame
df

# Part 4 - Text Preprocessing

In [None]:
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

### Download the necessary resources from nltk (tokenizers and stopwords corpus)
# Punkt tokenizer data used by word_tokenize
nltk.download('punkt')

# Recent NLTK releases load the Punkt data from the separate 'punkt_tab' package,
# and word_tokenize raises LookupError when it is missing, so request it as well.
nltk.download('punkt_tab')

# Stopwords list in multiple languages
nltk.download('stopwords')

# Initialize the Porter stemmer
stemmer = PorterStemmer()

# Create a set of English stopwords for efficient lookup
stop_words = set(stopwords.words('english'))

# Function to clean the text by removing URLs, handles, and punctuation
def clean_text(text):
    """Lowercase a review and strip URLs, markdown links, @handles and punctuation.

    Args:
        text: The raw review. Anything that is not a ``str`` is returned unchanged.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed fragments
        can leave consecutive spaces behind.
    """
    # Guard clause: leave non-string values untouched
    if not isinstance(text, str):
        return text

    lowered = text.lower()

    # Drop URLs (http, https, and www links)
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)

    # Drop markdown-style links [text](link)
    without_md_links = re.sub(r'\[.*?\]\(.*?\)', '', without_urls)

    # Drop handles (@username mentions)
    without_handles = re.sub(r'@\w+', '', without_md_links)

    # Delete every ASCII punctuation character in a single pass
    punctuation_table = str.maketrans('', '', string.punctuation)
    return without_handles.translate(punctuation_table)

# Function to tokenize the text into individual words
def tokenize_text(text):
    """Split a string into word tokens with ``word_tokenize``.

    Values that are not ``str`` are returned unchanged.
    """
    return word_tokenize(text) if isinstance(text, str) else text

# Function to remove stopwords from the tokenized text
def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``.

    Values that are not a ``list`` are returned unchanged.
    """
    if not isinstance(tokens, list):
        return tokens
    return [token for token in tokens if token not in stop_words]

# Function to apply stemming to the tokens
def stem_tokens(tokens):
    """Stem each token with the module-level ``stemmer``.

    Values that are not a ``list`` are returned unchanged.
    """
    if not isinstance(tokens, list):
        return tokens
    return [stemmer.stem(word) for word in tokens]

### Build the derived text columns on the DataFrame
# Each step is (new column, source column, transformation). The order matters,
# because later steps read columns created by earlier ones. Stemming and stopword
# removal both start from the tokenized text.
preprocessing_steps = [
    ('Cleaned_Review', 'Translated_Review', clean_text),
    ('Tokenized_Review', 'Cleaned_Review', tokenize_text),
    ('Stemmed_Review', 'Tokenized_Review', stem_tokens),
    ('No_Stopwords_Review', 'Tokenized_Review', remove_stopwords),
]
for new_column, source_column, transform in preprocessing_steps:
    df[new_column] = df[source_column].apply(transform)

# Preview the first rows, including the new columns
df.head()

# Part 5 - Exploratory data analysis

In [None]:
# Sentiment Analysis
# Calculate and display the distribution of sentiment labels in the dataset
sentiment_distribution = df['Sentiment'].value_counts()
print("Sentiment Distribution:")
print(sentiment_distribution)
print()

# Descriptive statistics for sentiment polarity and subjectivity
# Display basic statistics (mean, std, min, max, etc.) for the polarity and subjectivity columns
sentiment_stats = df[['Sentiment_Polarity', 'Sentiment_Subjectivity']].describe()
print("Sentiment Polarity and Subjectivity Statistics:")
print(sentiment_stats)

In [None]:
# Bar chart of the sentiment label distribution
plt.figure(figsize=(10, 8))

# Count plot with bars ordered from the most to the least frequent label
ax = sns.countplot(data=df, x='Sentiment', palette='Set2', order=df['Sentiment'].value_counts().index)

# Descriptive title
plt.title('Distribution of Sentiments in Reviews', fontsize=16)

# Axis labels
plt.xlabel('Sentiment', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Write the count above each bar. Bar heights are floats, so they are cast to int
# to print "123" rather than "123.0". Bars without a finite height are skipped.
for bar in ax.patches:
    height = bar.get_height()
    if not np.isfinite(height):
        continue
    ax.annotate(f'{int(height)}', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='baseline', fontsize=12, color='black', xytext=(0, 5), textcoords='offset points')

# Show the chart
plt.show()

In [None]:
# Scatter plot of sentiment polarity against subjectivity, coloured by sentiment label
fig, ax = plt.subplots(figsize=(10, 6))

scatter_style = {
    'hue': 'Sentiment',
    'palette': 'Set1',
    'alpha': 0.6,  # transparency to reduce overlap
    's': 70,       # marker size
}
sns.scatterplot(data=df, x='Sentiment_Polarity', y='Sentiment_Subjectivity', ax=ax, **scatter_style)

# Title and axis labels
ax.set_title('Sentiment Polarity vs Subjectivity', fontsize=16)
ax.set_xlabel('Sentiment Polarity', fontsize=14)
ax.set_ylabel('Sentiment Subjectivity', fontsize=14)

# Place the legend outside the plotting area
ax.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')

# Fit labels and legend into the figure, switch the grid off, then render
plt.tight_layout()
ax.grid(False)
plt.show()

In [None]:
from wordcloud import WordCloud

# Function to generate and plot a word cloud
def plot_wordcloud(text, title):
    """Draw a word cloud for a collection of review strings.

    Args:
        text: Iterable of strings; they are joined with single spaces before
            the cloud is generated.
        title: Title shown above the image.
    """
    corpus = ' '.join(text)
    cloud_image = WordCloud(width=800, height=400, background_color='white').generate(corpus)

    # Show the cloud as an image without axis lines or labels
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=20)
    plt.show()

# Cleaned review text for each sentiment label
positive_reviews = df.loc[df['Sentiment'] == 'Positive', 'Cleaned_Review']
negative_reviews = df.loc[df['Sentiment'] == 'Negative', 'Cleaned_Review']
neutral_reviews = df.loc[df['Sentiment'] == 'Neutral', 'Cleaned_Review']

# One word cloud per sentiment, in the order positive, negative, neutral
wordcloud_jobs = (
    (positive_reviews, 'Word Cloud for Positive Sentiment'),
    (negative_reviews, 'Word Cloud for Negative Sentiment'),
    (neutral_reviews, 'Word Cloud for Neutral Sentiment'),
)
for sentiment_reviews, cloud_title in wordcloud_jobs:
    plot_wordcloud(sentiment_reviews, cloud_title)

In [None]:
from collections import Counter

# Flatten the per-review token lists into one list
all_tokens = []
for review_tokens in df['Tokenized_Review']:
    all_tokens.extend(review_tokens)

# Frequency of every token, then the 20 most frequent ones
token_counts = Counter(all_tokens)
common_tokens = token_counts.most_common(20)

# Split the (token, count) pairs into two parallel tuples
tokens, frequencies = zip(*common_tokens)

# Horizontal bar chart of the most frequent tokens
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=list(frequencies), y=list(tokens), palette='husl', ax=ax)

# Title and axis labels
ax.set_title('Top 20 Most Common Tokens in Reviews', fontsize=16)
ax.set_xlabel('Frequency', fontsize=14)
ax.set_ylabel('Tokens', fontsize=14)

# Vertical gridlines make the bar lengths easier to read
ax.grid(axis='x', linestyle='--', alpha=0.7)

# Fit the labels into the figure and render
plt.tight_layout()
plt.show()

In [None]:
# Number of reviews per app, most reviewed first
reviews_per_app = df['App'].value_counts()
top_reviewed = reviews_per_app.head(10)
print("Reviews per App:")
print(top_reviewed)

# Bar chart of the 10 most reviewed apps
fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(x=top_reviewed.index, y=top_reviewed.values, palette='Blues_d', ax=ax)

# Rotate the app names to 90 degrees
plt.xticks(rotation=90)

# Write each count slightly above its bar
for bar_position, review_count in enumerate(top_reviewed.values):
    ax.text(bar_position, review_count + 5, str(review_count), ha='center', fontsize=10)

# Title and axis labels
ax.set_title('Top 10 Apps by Number of Reviews', fontsize=16)
ax.set_xlabel('App', fontsize=14)
ax.set_ylabel('Number of Reviews', fontsize=14)

# Fit the labels into the figure and render
plt.tight_layout()
plt.show()


In [None]:
# Share of each sentiment label within every app (rows: apps, columns: sentiments)
sentiment_by_app = df.groupby('App')['Sentiment'].value_counts(normalize=True).unstack()

# Keep the 10 apps with the most reviews. groupby() orders the rows by app name,
# so head(10) on the table would select the first 10 names alphabetically instead.
top_apps = df['App'].value_counts().head(10).index
sentiment_by_app_top = sentiment_by_app.loc[top_apps]

# DataFrame.plot creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty figure behind).
sentiment_by_app_top.plot(kind='bar', stacked=True, colormap='Set1', figsize=(12, 8))

# Rotate x-axis labels to 90 degrees for better readability
plt.xticks(rotation=90)

# Title and axis labels
plt.title('Sentiment Distribution for Top 10 Apps', fontsize=16)
plt.xlabel('App', fontsize=14)
plt.ylabel('Proportion', fontsize=14)

# Move the legend outside the plot
plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')

# Horizontal gridlines for easier reading of the proportions
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Adjust layout so the plot fits together with the legend
plt.tight_layout()

# Render the chart
plt.show()


In [None]:
from collections import Counter

# Function to count the most common words in reviews
def get_most_common_words(reviews, num_words=10):
    """Return the ``num_words`` most frequent words across all reviews.

    Args:
        reviews: Iterable of reviews, each of which is an iterable of words.
        num_words: How many (word, count) pairs to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    word_counts = Counter()
    for review in reviews:
        for word in review:
            word_counts[word] += 1
    return word_counts.most_common(num_words)

# Ten most frequent non-stopword tokens for each sentiment label
positive_words = get_most_common_words(df.loc[df['Sentiment'] == 'Positive', 'No_Stopwords_Review'], num_words=10)
negative_words = get_most_common_words(df.loc[df['Sentiment'] == 'Negative', 'No_Stopwords_Review'], num_words=10)
neutral_words = get_most_common_words(df.loc[df['Sentiment'] == 'Neutral', 'No_Stopwords_Review'], num_words=10)

# Print one "word: count" listing per sentiment
word_reports = (
    ("Most Common Positive Words:", positive_words),
    ("\nMost Common Negative Words:", negative_words),
    ("\nMost Common Neutral Words:", neutral_words),
)
for heading, ranked_words in word_reports:
    print(heading)
    for word, count in ranked_words:
        print(f"{word}: {count}")

In [None]:
# Function to plot the most common words for a given sentiment
def plot_most_common_words(common_words, sentiment, color):
    """Draw a horizontal bar chart of word frequencies for one sentiment.

    Args:
        common_words: Sequence of ``(word, count)`` pairs.
        sentiment: Sentiment name inserted into the chart title.
        color: Palette name passed to seaborn.
    """
    # Split the pairs into parallel sequences for the two axes
    labels, frequencies = zip(*common_words)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=list(frequencies), y=list(labels), palette=color, ax=ax)

    # Title and axis labels
    ax.set_title(f'Most Common Words in {sentiment} Reviews', fontsize=16)
    ax.set_xlabel('Frequency', fontsize=14)
    ax.set_ylabel('Words', fontsize=14)

    plt.show()

# One chart per sentiment: (word counts, sentiment name, palette)
common_word_charts = (
    (positive_words, 'Positive', 'Greens'),
    (negative_words, 'Negative', 'Reds'),
    (neutral_words, 'Neutral', 'Blues'),
)
for ranked_words, sentiment_name, palette_name in common_word_charts:
    plot_most_common_words(ranked_words, sentiment_name, palette_name)

In [None]:
# Calculate mean polarity and subjectivity by sentiment
mean_polarity_subjectivity = df.groupby('Sentiment')[['Sentiment_Polarity', 'Sentiment_Subjectivity']].mean()

# Print the result for verification
print("Mean Polarity and Subjectivity by Sentiment:")
print(mean_polarity_subjectivity)

# Grouped (side-by-side) bar chart. DataFrame.plot creates its own figure, so no
# separate plt.figure() call is made (it would only leave an empty figure behind).
group_width = 0.8
ax = mean_polarity_subjectivity.plot(kind='bar', figsize=(15, 6), colormap='viridis', width=group_width)

# Title and axis labels
plt.title('Average Sentiment Polarity and Subjectivity by Sentiment', fontsize=16)
plt.xlabel('Sentiment', fontsize=14)
plt.ylabel('Average Values', fontsize=14)

# Label every bar with its value. Each group is centred on its integer position and
# split evenly between the columns, so the bar centres follow from the group width.
mean_values = mean_polarity_subjectivity.to_numpy()
bar_width = group_width / mean_values.shape[1]
for row_pos, row_values in enumerate(mean_values):
    for col_pos, value in enumerate(row_values):
        is_negative = value < 0
        ax.text(row_pos - group_width / 2 + bar_width * (col_pos + 0.5),
                value - 0.02 if is_negative else value + 0.02,
                round(value, 2),
                ha='center',
                # Anchor the text away from the bar so negative labels do not overlap it
                va='top' if is_negative else 'bottom',
                fontsize=12)

# Display the legend
plt.legend(loc='upper left', fontsize=12)

# Show the plot
plt.tight_layout()
plt.show()

# Section A) Machine learning models

# Part 6) Vectorizing text with Tfidf Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Convert text data into numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to top 5000 features
# NOTE(review): the vectorizer is fitted on every row of df, while the train/test split
# happens in a later cell, so the vocabulary and IDF weights are also learned from rows
# that end up in the test set. Consider fitting on the training rows only.
X = tfidf_vectorizer.fit_transform(df['Cleaned_Review'])
# Target column with the sentiment label of every review
y = df['Sentiment']
# Last expression of the cell: shows the fitted vectorizer as the cell output
tfidf_vectorizer

In [None]:
# Encode the sentiment labels as integers with label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# LabelEncoder assigns the integer codes in sorted order of the distinct labels; the
# original names stay available as le.classes_ (index == code), which is what any
# later class-name lookup should use.
y = le.fit_transform(y)
# Last expression of the cell: shows the fitted encoder as the cell output
le

# Part 7 - Target column split and test

In [None]:
# Importing library
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Shape of the training feature matrix
print("Viewing rows and columns given by X train", X_train.shape)

# Shape of the training labels (this prints y_train, not the test split)
print("Viewing rows and columns given y train", y_train.shape)

In [None]:
# Convert your TF-IDF sparse matrix to a dense matrix
# (used for GaussianNB in the model loops; toarray() materializes every zero entry,
# so these copies are much larger in memory than the sparse originals)
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

Here, we split the data into training and test sets in preparation for model training. We adopted an 80/20 division, where 80% of the data is used for training and the remaining 20% is reserved for testing. This procedure is crucial for accurately evaluating the model's performance. The training set allows the model to learn patterns and relationships within the data, while the test set, which the model has not seen during training, is used to validate its ability to generalize to new data. Additionally, this approach helps identify and mitigate issues such as overfitting, ensuring that the model does not merely memorize the training data but also performs well on unseen data.

# Part 8) Machine learning model training

In [None]:
# Importing libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Models to be evaluated
# NOTE(review): the XGBoost and LightGBM entries below request GPU training
# (tree_method='gpu_hist', device='gpu') — presumably a GPU runtime is expected; confirm.
models = [
            # Naive Bayes Model (requires dense matrix)
            GaussianNB(),

            # Decision Tree Model
            DecisionTreeClassifier(random_state=42),

            # Random forest model with 100 trees
            RandomForestClassifier(n_estimators=100, random_state=42),

            # Logistic regression model
            LogisticRegression(random_state=50),

            # Ada Boost Model
            AdaBoostClassifier(random_state=45),

            # XGBoost Model (can use sparse matrix)
            XGBClassifier(tree_method='gpu_hist', random_state=42),

            # LightGBM Model (can use sparse matrix)
            LGBMClassifier(num_leaves=31,
                           boosting_type='gbdt',
                           bagging_fraction=0.9,
                           learning_rate=0.05,
                           feature_fraction=0.9,
                           bagging_freq=50,
                           verbose=50,
                           device='gpu'),

            # K-Nearest Neighbors Model with 13 neighbours
            KNeighborsClassifier(n_neighbors=13),

            # Gradient Boosting Classifier
            GradientBoostingClassifier(random_state=42)]

# Fit every candidate model and report its accuracy on both splits
for position, estimator in enumerate(models, start=1):
    # GaussianNB gets the dense copies; every other model is fed the sparse matrices
    if isinstance(estimator, GaussianNB):
        train_features, test_features = X_train_dense, X_test_dense
    else:
        train_features, test_features = X_train, X_test

    estimator.fit(train_features, y_train)
    train_accuracy = accuracy_score(y_train, estimator.predict(train_features))
    test_accuracy = accuracy_score(y_test, estimator.predict(test_features))

    print(f"Model {position}: {type(estimator).__name__}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")
    print("-----------------")

**Part 8.1 - Feature importances**

- Feature importance measures how much each feature contributes to a machine learning model's predictions or classifications. In other words, it quantifies the impact of each feature on the decisions made by the model. In many machine learning algorithms, such as decision trees, Random Forest, and Gradient Boosting, feature importances can be calculated during model training.

- This is done by observing how each feature influences the decisions made by the model when dividing the data into decision tree nodes or by weighing the features in other model structures.

- Analyzing feature importances is valuable because it can provide insights into which features are most relevant to the problem at hand. This information can be used to optimize the model, remove irrelevant or redundant features, identify important factors for prediction, and even assist in interpreting the model's results.

In [None]:
# Estimators whose fitted form exposes feature_importances_
models_with_feature_importances = [
    ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=42)),
    ("RandomForestClassifier", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("XGBClassifier", XGBClassifier(random_state=42)),
    ("LGBMClassifier", LGBMClassifier(random_state=42)),
]

# Feature names reported by the fitted TfidfVectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

for model_name, model in models_with_feature_importances:
    model.fit(X_train, y_train)

    # Nothing to plot for an estimator that exposes no importances
    if not hasattr(model, 'feature_importances_'):
        print(f"{model_name} does not support feature importances.")
        continue
    feature_importances = model.feature_importances_

    # Pair every feature with its importance, most important first
    feature_importances_df = (
        pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
        .sort_values(by='Importance', ascending=False)
    )

    # Bar chart of the ten most important features
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importances_df.head(10))
    plt.title(f"Top 10 Features - {model_name}")
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.grid(False)
    plt.show()

# Part 9) Evaluation metrics

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# Class names for the confusion-matrix axes, taken from the fitted LabelEncoder.
# confusion_matrix orders rows and columns by ascending encoded label, and le.classes_
# lists the original names in exactly that order. A hand-written list such as
# ['Positive', 'Neutral', 'Negative'] attaches the wrong names to the cells.
labels = list(le.classes_)
label_codes = list(range(len(labels)))

# Convert your TF-IDF sparse matrix to a dense matrix for models that require dense input
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Evaluate each model
for i, model in enumerate(models):
    # GaussianNB gets the dense copies; every other model is fed the sparse matrices
    if isinstance(model, GaussianNB):
        train_features, test_features = X_train_dense, X_test_dense
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, y_train)
    y_train_pred = model.predict(train_features)
    y_test_pred = model.predict(test_features)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print(f"Model {i+1}: {type(model).__name__}")
    print(f"Training Accuracy: {train_accuracy}")
    print(f"Testing Accuracy: {test_accuracy}")
    print()

    # Confusion matrix with an explicit label order, so the matrix always has one
    # row/column per class and lines up with the tick labels below
    cm = confusion_matrix(y_test, y_test_pred, labels=label_codes)

    print(f'Confusion matrix for Model {i+1}: {type(model).__name__} \n\n', cm)

    # Plot the confusion matrix with annotated cell counts
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
                xticklabels=labels, yticklabels=labels)

    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title(f"Confusion Matrix - Model {i+1}: {type(model).__name__}")
    plt.show()

    print("------------------")

# Part 10) Model Evaluation

In [None]:
# Importing necessary libraries
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
from sklearn.preprocessing import label_binarize

# Dense copies of the feature matrices for the models that are fed dense input
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Class labels in sorted order, derived from the data instead of hard-coded so
# that the indicator matrix, the ROC loop and the plot always agree.
# scikit-learn estimators order predict_proba columns by `classes_`, which is
# this same sorted order.
classes = np.unique(y_train)
n_classes = len(classes)

# One-vs-rest indicator matrix of the test labels, one column per class
y_test_bin = label_binarize(y_test, classes=classes)
if y_test_bin.shape[1] == 1:
    # label_binarize emits a single column for two-class problems; expand it
    # so that column j lines up with predict_proba column j.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

# Base colours for the per-class ROC curves (cycled if there are more classes)
colors = ['blue', 'red', 'green']

# Models to be evaluated
models = [
    GaussianNB(),
    DecisionTreeClassifier(random_state=42),
    KNeighborsClassifier(),
    RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression(random_state=42, max_iter=1000),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    XGBClassifier(random_state=42),
    LGBMClassifier(),
    CatBoostClassifier(task_type='GPU', iterations=1000, learning_rate=0.1, depth=6, verbose=0, random_state=42)
]

# Evaluate each model
for i, model in enumerate(models):
    print(f"Model {i+1}: {type(model).__name__}")

    # GaussianNB and KNeighborsClassifier are given dense arrays; all other
    # models are trained on the sparse matrices.
    dense_input = isinstance(model, (GaussianNB, KNeighborsClassifier))
    train_features = X_train_dense if dense_input else X_train
    test_features = X_test_dense if dense_input else X_test

    model.fit(train_features, y_train)
    y_train_pred = model.predict(train_features)
    y_test_pred = model.predict(test_features)

    # Reset per model so a previous model's probabilities can never be reused
    y_probs = None
    if hasattr(model, "predict_proba"):
        y_probs = model.predict_proba(test_features)

    # Accuracy is reported for every model, whether or not it can be plotted
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")

    if y_probs is None:
        print(f"{type(model).__name__} does not support predict_proba, skipping AUC/ROC plot.")
    else:
        # One-vs-rest ROC curve and AUC for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()

        for j in range(n_classes):
            fpr[j], tpr[j], _ = roc_curve(y_test_bin[:, j], y_probs[:, j])
            roc_auc[j] = roc_auc_score(y_test_bin[:, j], y_probs[:, j])

        # Plot one curve per class
        plt.figure()
        for j in range(n_classes):
            plt.plot(fpr[j], tpr[j], color=colors[j % len(colors)], lw=2,
                     label=f'Class {classes[j]} (AUC = {roc_auc[j]:.2f})')

        # Diagonal = performance of a random classifier
        plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'Multiclass ROC Curve - Model {i+1}: {type(model).__name__}')
        plt.legend(loc="lower right")
        plt.grid(False)
        plt.show()

    print("------------------")

**10.1 - Classification report**

In [None]:
# Human-readable names passed to classification_report as target_names.
# NOTE(review): confirm this order matches the sorted order of the encoded labels.
sentiment_labels = ['Positive', 'Neutral', 'Negative']

# Dense copies of the feature matrices for the models that are fed dense input
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Estimators compared in this cell
models = [
    GaussianNB(),
    DecisionTreeClassifier(random_state=42),
    KNeighborsClassifier(),
    RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression(random_state=42, max_iter=1000),
    AdaBoostClassifier(random_state=42),
    XGBClassifier(random_state=42),
    LGBMClassifier(),
    CatBoostClassifier(task_type='GPU', iterations=1000, learning_rate=0.1, depth=6, verbose=0, random_state=42),
]

# Fit each estimator, then print its accuracies and per-class report
for i, model in enumerate(models):

    print(f"Model {i+1}: {type(model).__name__}")

    # GaussianNB and KNeighborsClassifier get dense arrays, the rest sparse ones
    wants_dense = isinstance(model, (GaussianNB, KNeighborsClassifier))
    train_features = X_train_dense if wants_dense else X_train
    test_features = X_test_dense if wants_dense else X_test

    model.fit(train_features, y_train)
    y_train_pred = model.predict(train_features)
    y_test_pred = model.predict(test_features)

    # Accuracy on both splits
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print(f"Training Accuracy: {train_accuracy}")
    print(f"Testing Accuracy: {test_accuracy}")

    # Per-class precision / recall / F1 on the test split
    report = classification_report(
        y_test,
        y_test_pred,
        target_names=sentiment_labels,
    )
    print()
    print("Classification Report:")
    print(report)
    print("=======================================")

# Part 11 - Result models

In [None]:
# Dense copies of the feature matrices for the models that are fed dense input
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Models to be evaluated
models = [GaussianNB(),
          DecisionTreeClassifier(random_state=42),
          KNeighborsClassifier(),
          RandomForestClassifier(n_estimators=100, random_state=42),
          LogisticRegression(random_state=42, max_iter=1000),
          AdaBoostClassifier(random_state=42),
          XGBClassifier(random_state=42),
          GradientBoostingClassifier(random_state=42),
          LGBMClassifier()]

# One summary row (dict) per model
metricas = []

# Evaluate each model
for model in models:
    print(f"Evaluating {type(model).__name__}")

    # GaussianNB and KNeighborsClassifier get dense arrays, the rest sparse ones
    dense_input = isinstance(model, (GaussianNB, KNeighborsClassifier))
    train_features = X_train_dense if dense_input else X_train
    test_features = X_test_dense if dense_input else X_test

    model.fit(train_features, y_train)

    # Predict the test split once and reuse it for both the accuracy and the
    # report (previously the same prediction was computed twice per model).
    y_test_pred = model.predict(test_features)
    train_accuracy = accuracy_score(y_train, model.predict(train_features))
    test_accuracy = accuracy_score(y_test, y_test_pred)
    report = classification_report(y_test, y_test_pred, output_dict=True)

    # Keep the support-weighted averages as the per-model summary
    weighted_avg = report['weighted avg']
    metrics = {"Model": type(model).__name__,
               "Accuracy": test_accuracy,
               "Precision": weighted_avg['precision'],
               "Recall": weighted_avg['recall'],
               "F1-score": weighted_avg['f1-score'],
               "Support": weighted_avg['support']}
    metricas.append(metrics)

# Convert the list of dictionaries into a DataFrame
df_metricas = pd.DataFrame(metricas)

# Styler callback: yellow background on every cell equal to the column maximum
def highlight_max(s):
    """Return one CSS string per element of *s*, highlighting the maximum.

    Elements equal to ``s.max()`` get ``'background-color: yellow'``; all
    other elements get an empty string. Ties are all highlighted.
    """
    peak = s.max()
    styles = []
    for value in s:
        styles.append('background-color: yellow' if value == peak else '')
    return styles

# Highlight the best value in each score column (the Support column is left unstyled)
score_columns = ['Accuracy', 'Precision', 'Recall', 'F1-score']
df_metricas_styled = df_metricas.style.apply(highlight_max, subset=score_columns)

# Last expression of the cell: renders the styled table in the notebook
df_metricas_styled

# Part 11 - Conclusion

Based on the performance metrics of the different models, here are the key insights:

1. **LGBMClassifier** emerges as the best-performing model across all metrics, achieving the highest **Accuracy (92.84%)**, **Precision (92.97%)**, **Recall (92.84%)**, and **F1-score (92.80%)**. This suggests that LightGBM is highly effective for this classification task, possibly due to its ability to handle large datasets efficiently while maintaining high predictive performance.

2. **DecisionTreeClassifier** and **RandomForestClassifier** also performed well, with Decision Tree achieving an **Accuracy of 90.88%** and Random Forest reaching an **Accuracy of 91.04%**. These models are known for their interpretability and capability of handling nonlinear relationships, which could explain their strong results.

3. **LogisticRegression** also performed admirably, with an **Accuracy of 91.45%**, suggesting that even a simpler, linear model can achieve strong performance on this dataset.

4. **KNeighborsClassifier** and **GaussianNB** struggled in comparison, with KNeighborsClassifier achieving only **32.82% Accuracy**, and GaussianNB scoring **46.14% Accuracy**. This indicates that these models are less suitable for this specific task, possibly due to their assumptions about data distribution or inability to handle complex relationships in the data.

5. **AdaBoostClassifier** and **GradientBoostingClassifier** showed moderate performance, with AdaBoostClassifier achieving an **Accuracy of 79.74%**, and GradientBoostingClassifier at **76.77%**. While they performed better than KNeighbors and GaussianNB, they were outclassed by models like LGBM, Random Forest, and Decision Tree.

### Recommendation:
- **LightGBM** is the optimal model for this problem, considering its superior performance in all key metrics.
- **Random Forest** and **Decision Tree** can also be considered as reliable models, especially if interpretability is a priority.
- It's recommended to avoid **KNeighborsClassifier** and **GaussianNB** for this task as their performance is subpar compared to other models.

Overall, **LGBMClassifier** should be used in production for this classification task due to its high predictive power, speed, and ability to handle large and complex datasets effectively.

# Section B) Generative AI LLM Gemma-2-2b

# Part 12 - Model LLM Gemma

In [None]:
# Importing libraries natural language processing
import nltk
from nltk.tokenize import sent_tokenize

# Downloading package nlp punkt
nltk.download('punkt')

# Authentication with HUGGING FACE
# The access token is a secret and must never be hard-coded in the notebook:
# it is read from the HUGGING_FACE_ACCESS_TOKEN environment variable, and the
# user is prompted (without echo) when the variable is unset or empty.
# NOTE: the token that was previously committed here should be revoked.
import os
from getpass import getpass

HUGGING_FACE_ACCESS_TOKEN = (
    os.environ.get('HUGGING_FACE_ACCESS_TOKEN')
    or getpass('Hugging Face access token: ')
)

# Export it so later cells that read the environment variable keep working
os.environ['HUGGING_FACE_ACCESS_TOKEN'] = HUGGING_FACE_ACCESS_TOKEN

In [None]:
from transformers import AutoModelForCausalLM
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, BertTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelForCausalLM
import torch

# Model parameters: specifying the model name to use
model_name = 'google/gemma-2-2b-it'

# Pick the device first: GPU when available, otherwise CPU.
# (Previously the model was moved to 'cuda' unconditionally before this check,
# so the cell crashed on CPU-only machines and the fallback was never reached.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained causal language model.
# Half precision is used on GPU for memory efficiency; on CPU the model is
# loaded in float32 instead.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
    token=HUGGING_FACE_ACCESS_TOKEN  # Authentication token for Hugging Face if required
)

# Move the model to the selected device (moves the module in place and returns it)
model.to(device)

In [None]:
# Tokenizer matching `model_name`, fetched with the same Hugging Face access token
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=HUGGING_FACE_ACCESS_TOKEN,
)

In [None]:
# Build the sentiment-classification prompt for one review
def criar_prompt(texto):
    """Return the classification prompt for *texto*.

    Only the first 512 characters of *texto* are embedded in the prompt. The
    prompt asks for one of Positive, Negative or Neutral and ends with
    ``Sentiment:``.
    """
    return (
        "Classify the sentiment of the following text as Positive, Negative, or Neutral:"
        f"\n\n'{texto[:512]}'\n\nSentiment:"
    )

In [None]:
# Function to classify sentiment with reduced generation time
def classificar_sentimento(model, tokenizer, texto):
    """Return the model's (short) sentiment completion for *texto*.

    Args:
        model: causal language model exposing ``generate``.
        tokenizer: tokenizer matching *model*.
        texto: review text; it is wrapped into a prompt by ``criar_prompt``.

    Returns:
        The text generated after the prompt, stripped of surrounding
        whitespace. Only two new tokens are generated, so the label may be
        truncated (e.g. a prefix of "Positive").
    """
    # Create the prompt using the earlier function
    prompt = criar_prompt(texto)

    # Tokenize the prompt and move the tensors to the correct device (GPU or CPU)
    inputs = tokenizer(prompt, return_tensors='pt').to(device)
    prompt_length = inputs['input_ids'].shape[1]

    # Generate at most two new tokens to keep inference fast
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=2)

    # The generated sequence starts with the prompt tokens, so decode only what
    # comes after them. The previous implementation split the decoded text on
    # 'Sentimento:', which never occurs in the English prompt ('Sentiment:'),
    # so the whole prompt was returned instead of the predicted label.
    generated_ids = outputs[0][prompt_length:]
    sentimento = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    return sentimento

# Number of full passes over the reviews
# NOTE(review): every pass recomputes and overwrites the same
# 'Predicted_Sentiment' column; confirm that more than one pass is intended.
EPOCHS = 2

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")

    # Classify every cleaned review and store the prediction next to it
    df['Predicted_Sentiment'] = [
        classificar_sentimento(model, tokenizer, review)
        for review in df['Cleaned_Review']
    ]

    # Show each review with its original and predicted sentiment
    comparison_columns = ['Cleaned_Review', 'Sentiment', 'Predicted_Sentiment']
    print(df[comparison_columns])


# Persist the DataFrame, including the predictions, to CSV
df.to_csv("dataset_LLM.csv")

In [None]:
# Truncated / malformed model outputs seen previously, mapped to the label
# they stand for
sentimento_correcao = {'**__': 'Neutral',
                       '**Ne': 'Neutral',
                       '**Posi': 'Positive',
                       '**Neg': 'Negative',
                       '': 'Neutral',   # Assuming empty values are Neutral
                       '*': 'Neutral'   # Assuming '*' means Neutral
                      }

# Correct only the values listed above and leave every other value untouched.
# Passing the dict straight to Series.map would turn every value that is not
# a key (including already-correct labels such as 'Positive') into NaN.
df['Predicted_Sentiment'] = df['Predicted_Sentiment'].map(
    lambda valor: sentimento_correcao.get(valor, valor)
)

# Check the corrected values
print("Corrected values in 'Predicted_Sentiment':")
print(df['Predicted_Sentiment'].unique())