In [1]:
import pandas as pd
import re
import nltk
import spacy
import gradio as gr
import os
import joblib


from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from matplotlib import pyplot as plt

# load the spacy model without the parser and named entity recognition for faster processing
nlp = spacy.load('en_core_web_sm', disable=["parser", "ner"])
nltk.download('stopwords')
nltk.download('punkt_tab')

dataset = pd.read_csv('C:/Users/Gicano Brothers/Documents/POP Repositories/Sentiment-Analysis/data/raw/IMDB Dataset.csv')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to C:\Users\Gicano
[nltk_data]     Brothers\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Gicano
[nltk_data]     Brothers\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()  # Removes HTML
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keeping only letters
    text = text.lower()  # Converts all text to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Removes unnecessary spaces 
    return text

dataset['cleaned_review'] = dataset['review'].apply(clean_text) 
dataset.drop(columns=['review'], inplace=True)

(dataset.head())  # Checking the result of preprocessing done in this cell

  text = BeautifulSoup(text, "html.parser").get_text()  # Removes HTML


Unnamed: 0,sentiment,cleaned_review
0,positive,one of the other reviewers has mentioned that ...
1,positive,a wonderful little production the filming tech...
2,positive,i thought this was a wonderful way to spend ti...
3,negative,basically theres a family where a little boy j...
4,positive,petter matteis love in the time of money is a ...


In [3]:
# TODO: Write this whole cell into a function so i dont need to comment it out (done, kinda, I realized a simple if statement would suffice)
# Function to lemmatize tokens in batches
def lemmatize_in_batches(cleaned_reviews):
    # Process in batches using nlp.pipe for efficiency
    docs = nlp.pipe(cleaned_reviews, batch_size=1000)
    # Extract lemmatized tokens
    lemmatized_reviews = [[token.lemma_ for token in doc] for doc in docs]
    return lemmatized_reviews

file_path = 'C:/Users/Gicano Brothers/Documents/POP Repositories/Sentiment-Analysis/data/processed/IMDB_Dataset_Processed.csv'
if os.path.exists(file_path):
    print('Lematized dataset already exists, loading it')
    dataset = pd.read_csv('C:/Users/Gicano Brothers/Documents/POP Repositories/Sentiment-Analysis/data/processed/IMDB_Dataset_Processed.csv')
else:
    print('Running Lemmatization (This will take a while)')
    # Apply lemmatization in batches to the tokenized_review column
    dataset['lemmatized_review'] = lemmatize_in_batches(dataset['cleaned_review'])

# Saving the preprocessed dataset to a CSV file, to save time since it takes 10 minutes to lemmatize the dataset
dataset.to_csv('C:/Users/Gicano Brothers/Documents/POP Repositories/Sentiment-Analysis/data/processed/IMDB_Dataset_Processed.csv', index=False)

(dataset.head()) # Checking the result of preprocessing done in this cell

Lematized dataset already exists, loading it


Unnamed: 0,sentiment,lemmatized_review
0,positive,"['one', 'of', 'the', 'other', 'reviewer', 'hav..."
1,positive,"['a', 'wonderful', 'little', 'production', 'th..."
2,positive,"['I', 'think', 'this', 'be', 'a', 'wonderful',..."
3,negative,"['basically', 'there', 's', 'a', 'family', 'wh..."
4,positive,"['petter', 'matteis', 'love', 'in', 'the', 'ti..."


In [None]:
# Wrote this code to check if the NLTK punkt tokenizer is available
# Since the cell below wasnt running because in kaggle importing "punkt" was enough
# I had to download "punkt_tab" instead
try:
    word_tokenize("This is a test sentence.")
    print("NLTK punkt tokenizer is available!")
except LookupError as e:
    print(e)

NLTK punkt tokenizer is available!


In [5]:
stop_words = set(stopwords.words('english'))

# Breaking down all the words into individual strings from lemmatized_review column
# And removing irrelevant words like (of, and, the, is, etc.)
def tokenize_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words] # Removing stopwords
    return tokens

dataset['tokenized_review'] = dataset['lemmatized_review'].apply(tokenize_text)
dataset.drop(columns=['lemmatized_review'], inplace=True)
               
(dataset.head()) # Checking the result of preprocessing done in this cell

Unnamed: 0,sentiment,tokenized_review
0,positive,"[[, 'one, ', ,, 'of, ', ,, 'the, ', ,, 'other,..."
1,positive,"[[, ', ', ,, 'wonderful, ', ,, 'little, ', ,, ..."
2,positive,"[[, ', I, ', ,, 'think, ', ,, 'this, ', ,, 'be..."
3,negative,"[[, 'basically, ', ,, 'there, ', ,, 's, ', ,, ..."
4,positive,"[[, 'petter, ', ,, 'matteis, ', ,, 'love, ', ,..."


In [6]:
# Convert tokenized data to strings for TF-IDF
dataset['processed_review'] = dataset['tokenized_review'].apply(' '.join)
dataset['sentiment'] = dataset['sentiment'].map({'positive': 1, 'negative': 0})

# Vectorize the text
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset['processed_review'])
y = dataset['sentiment']  # Assuming sentiment column exists
print(y)    

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

best_c = None # Placeholder for best hyperparameter

# Used to find the best hyperparameter for the model
def grid_search():
    # Define parameter grid
    param_grid = {'C': [0.1, 1, 10, 100]}

    # Perform grid search
    grid_search = GridSearchCV(LogisticRegression(class_weight='balanced', random_state=42), 
                            param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train, y_train)

    # Best parameters and score
    return grid_search.best_params_['C']
   
# Check if best_c is already known or perform grid search
if best_c is None:  # Ensures 'best_c' is only computed if not already defined
    best_c = grid_search()
    print("this block runs")
    

# Reinitialize the final model with the best hyperparameter and balanced class weights
# Tying max_iter to 500, 2000, 50000 to avoid convergence warnings
# didnt work even with 50000
# got lower evaluation metrics on all boards when using max_iter and feature scaling
model = LogisticRegression(class_weight="balanced", random_state=42, C=best_c)    
model.fit(X_train, y_train)

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


this block runs


In [7]:
# Function to generate a bar graph for metrics
def generate_metrics_plot():
    
    y_true = y_test  # Actual labels
    y_pred = model.predict(X_test)  # Predicted labels
    
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    values = [accuracy, precision, recall, f1]  
    
    plt.close('all')  # Close all existing plots
    
    # Plotting
    plt.figure(figsize=(6, 4))
    plt.bar(metrics, values, color=['blue', 'green', 'orange', 'red'])
    plt.ylim(0, 1)
    plt.title('Model Metrics')
    plt.xlabel('Metric')
    plt.ylabel('Score')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    
    # Return the plot
    return plt

def predict_sentiment_with_metrics(input_text):
    # Clean the input text
    cleaned_text = clean_text(input_text)
    # Lemmatize the cleaned text
    lemmatized_text = lemmatize_text_spacy(cleaned_text)
    # Convert cleaned text to features using TF-IDF vectorizer
    text_vectorized = vectorizer.transform([lemmatized_text])
    # Make prediction using the trained logistic regression model
    prediction = model.predict(text_vectorized)
    sentiment = "Positive" if prediction[0] == 1 else "Negative"
    
    # Generate the bar graph for metrics
    metrics_plot = generate_metrics_plot()
    
    # Return prediction and the plot
    return sentiment, metrics_plot

# TODO: Make a bar graph for the metrics (Done)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred)) # Shows Actual Numerical values of the metrics

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [8]:
# TODO: Save and Load model so that it can be used in the gr.Interface (done)
# Saving model if it doesn't exist yet
if not os.path.exists('C:/Users/Gicano Brothers/Documents/POP Repositories/Sentiment-Analysis/models/lr_sentiment_analysis_model.pkl'):
    joblib.dump(model, 'C:/Users/Gicano Brothers/Documents/POP Repositories/Sentiment-Analysis/models/lr_sentiment_analysis_model.pkl')
    model = joblib.load('C:/Users/Gicano Brothers/Documents/POP Repositories/Sentiment-Analysis/models/lr_sentiment_analysis_model.pkl')

def lemmatize_text_spacy(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Function to predict sentiment
def predict_sentiment(text):
    # Clean the input text
    cleaned_text = clean_text(text)
    # Lemmatize the cleaned text
    lemmatized_text = lemmatize_text_spacy(cleaned_text)
    # Convert cleaned text to features using TF-IDF vectorizer
    text_vectorized = vectorizer.transform([lemmatized_text])
    
    # Make prediction using the trained logistic regression model
    prediction = model.predict(text_vectorized)
    
    # Return sentiment as either "Positive" or "Negative"
    return "Positive" if prediction[0] == 1 else "Negative"

iface = gr.Interface(
    fn=predict_sentiment_with_metrics, 
    inputs=gr.Textbox(label="Review",lines=1, placeholder="Enter a review..."),
    outputs=[
        gr.Textbox(label="Sentiment Prediction"),
        gr.Plot(label="Model Metrics"),
    ],  # Outputs sentiment and metrics
    live=True,  # Optional: to update output as you type
    title="Sentiment Analysis", 
    description="Enter a review to predict if it's positive or negative. (Provide more than 5 words for better accuracy)"
)

iface.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


