In [157]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
import gradio as gr
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import joblib
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.svm import LinearSVC
import spacy

In [2]:
# Importing the dataset
# Reads the IMDB reviews CSV from a path relative to the notebook. Later cells
# read its 'review' and 'sentiment' columns.
data = pd.read_csv("data/IMDB_Dataset.csv")
# Show the first rows as the cell output.
data.head()

In [4]:
# Encode the sentiment strings as integers: positive -> 1, negative -> 0.
# Series.map yields NaN for values missing from the mapping, so running this
# cell a second time on already-encoded data would blank the column.
sentiment_codes = {'positive': 1, 'negative': 0}
data['sentiment'] = data['sentiment'].map(sentiment_codes)
data['sentiment'].head()

In [198]:
# Features are the raw review text; the target is the encoded sentiment.
X, y = data['review'], data['sentiment']
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [199]:
# Vocabulary and sequence hyper-parameters shared by the tokenizer and the model
MAX_VOCAB_SIZE = 10000  # Max words in vocabulary
MAX_SEQUENCE_LENGTH = 200  # Max length of each review
EMBEDDING_DIM = 100  # Word embedding size

# Fit the vocabulary on the training text only; unseen words map to "<OOV>"
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Turn each review into a list of word indices
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad / cut every sequence at its end so all rows have MAX_SEQUENCE_LENGTH entries
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

In [93]:
# LSTM model, assembled one layer at a time
model = Sequential()
model.add(Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(128, return_sequences=True))  # emits the full sequence for the next LSTM
model.add(Dropout(0.3))  # regularization between the recurrent layers
model.add(LSTM(64))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # single probability for the binary label

# Binary cross-entropy matches the single sigmoid output
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [94]:
# Train for 5 epochs in mini-batches of 32. The held-out test split is passed
# as validation data, so the per-epoch val_* metrics are computed on the same
# rows that the evaluation cell scores afterwards.
history = model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_pad, y_test),
)

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m299s[0m 234ms/step - accuracy: 0.5261 - loss: 0.6896 - val_accuracy: 0.7293 - val_loss: 0.5789
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 214ms/step - accuracy: 0.6598 - loss: 0.6053 - val_accuracy: 0.8483 - val_loss: 0.3584
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 229ms/step - accuracy: 0.8717 - loss: 0.3243 - val_accuracy: 0.8639 - val_loss: 0.3120
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 230ms/step - accuracy: 0.9171 - loss: 0.2248 - val_accuracy: 0.8821 - val_loss: 0.3006
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 231ms/step - accuracy: 0.9458 - loss: 0.1645 - val_accuracy: 0.8645 - val_loss: 0.3279


In [95]:
# Score the trained network on the padded test split; evaluate() returns the
# loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(X_test_pad, y_test)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy:.4f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 77ms/step - accuracy: 0.8643 - loss: 0.3273
Test Accuracy: 0.8645


In [None]:
# Example with sample review
sample_review = ["This movie was absolutely amazing!"]
sample_seq = tokenizer.texts_to_sequences(sample_review)
# Pad and truncate exactly as the training data was prepared
# (padding="post", truncating="post"); the default truncation side differs,
# so long reviews would otherwise be cut differently than during training.
sample_pad = pad_sequences(
    sample_seq,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="post",
    truncating="post",
)
prediction = model.predict(sample_pad)

# The sigmoid output is a probability; above 0.5 is treated as positive
print("Positive" if prediction[0][0] > 0.5 else "Negative")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
Positive


In [None]:
# Define the predict_review function
def predict_review(review):
    """Classify a review as "Positive" or "Negative" with the LSTM model.

    Args:
        review: A single review string (what a Gradio Textbox passes in) or a
            list of review strings. Only the first review is classified.

    Returns:
        "Positive" when the predicted probability is above 0.5, otherwise
        "Negative".

    Relies on the notebook globals `tokenizer`, `model` and
    `MAX_SEQUENCE_LENGTH`, which are looked up when the function is called.
    """
    # texts_to_sequences expects a list of texts. A bare string would be
    # iterated character by character, so wrap it into a one-element list.
    texts = [review] if isinstance(review, str) else list(review)
    sample_seq = tokenizer.texts_to_sequences(texts)
    # Match the training-time preprocessing (padding and truncation at the end).
    sample_pad = pad_sequences(
        sample_seq,
        maxlen=MAX_SEQUENCE_LENGTH,
        padding="post",
        truncating="post",
    )
    prediction = model.predict(sample_pad)
    return "Positive" if prediction[0][0] > 0.5 else "Negative"

In [145]:
# Predicted probabilities for the held-out test split
y_pred_prob = model.predict(X_test_pad)
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels
y_pred = (y_pred_prob > 0.5).astype(int)

conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print("\nAccuracy:", test_accuracy)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 74ms/step
Confusion Matrix:
 [[3882 1079]
 [ 276 4763]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.78      0.85      4961
           1       0.82      0.95      0.88      5039

    accuracy                           0.86     10000
   macro avg       0.87      0.86      0.86     10000
weighted avg       0.87      0.86      0.86     10000


Accuracy: 0.8645


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step


In [196]:
# Minimal Gradio front end for predict_review
import gradio as gr

# Multi-line text box the user types the review into
review_box = gr.Textbox(
    label="Enter your review here",
    lines=3,
    max_lines=5,
    interactive=True,
)
func = gr.Interface(
    fn=predict_review,
    inputs=review_box,
    outputs=gr.Textbox(label='Review'),
)
# share=True asks Gradio for a public link in addition to the local URL
func.launch(share=True)

* Running on local URL:  http://127.0.0.1:7875

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [195]:
# Styled Gradio app ("R2D2") around predict_review
with gr.Blocks(css=".gradio-container {background-color: #266fd9; font-family: Arial;}") as demo:
    gr.Markdown("## 🌟 R2D2 🌟")
    gr.Markdown("Enter your review, and we'll analyze its sentiment!")

    # Input row: the review text
    with gr.Row():
        input_text = gr.Textbox(
            label="📝 Enter your review",
            lines=3,
            max_lines=5,
            interactive=True,
            placeholder="Type your review here...",
        )

    analyze_button = gr.Button("🔍 Analyze Sentiment")

    # Output row: read-only box holding the predicted sentiment
    with gr.Row():
        output_label = gr.Textbox(label="🔽 Sentiment Result", interactive=False)

    # Clicking the button sends the text to predict_review and shows its result
    analyze_button.click(fn=predict_review, inputs=input_text, outputs=output_label)

demo.launch()

* Running on local URL:  http://127.0.0.1:7874

To create a public link, set `share=True` in `launch()`.




In [151]:
# Import rotten tomatoes dataset
# Parquet file name of each split; only the 'train' entry is read in this cell.
splits = {'train': 'train.parquet', 'validation': 'validation.parquet', 'test': 'test.parquet'}
# Read the training split through the hf:// path using the fastparquet engine.
df_reviews = pd.read_parquet("hf://datasets/cornell-movie-review-data/rotten_tomatoes/" + splits["train"], engine='fastparquet')

In [152]:
# Rename 'text' column to 'review', the column name the entity-counting cells read.
df_reviews = df_reviews.rename(columns={'text': 'review'})

In [153]:
import spacy

# Load English NLP model
# The pipeline is bound to `nlp` and reused by the entity-extraction cells.
nlp = spacy.load("en_core_web_sm")

# List of movie reviews
# Plain Python list of the review strings, in DataFrame row order.
reviews = df_reviews["review"].tolist()

In [None]:
# Process each review and print the named entities spaCy finds in it.
# nlp.pipe processes the texts as a batched stream instead of one nlp() call
# per review; the documents come back in input order, so zipping them with
# `reviews` pairs each document with its source text.
# Note: this prints a block for every review in the list.
for review, doc in zip(reviews, nlp.pipe(reviews)):
    print(f"Review: {review}")
    for ent in doc.ents:
        print(f"  Entity: {ent.text}, Label: {ent.label_}")
    print("-" * 50)

In [None]:
# Convert column to list
reviews_imdb = data["review"].tolist()
# Process each IMDB review and print the named entities spaCy finds in it.
# nlp.pipe processes the texts as a batched stream instead of one nlp() call
# per review; the documents come back in input order, so zipping them with
# `reviews_imdb` pairs each document with its source text.
# Note: this prints a block for every review in the list.
for review, doc in zip(reviews_imdb, nlp.pipe(reviews_imdb)):
    print(f"Review: {review}")
    for ent in doc.ents:
        print(f"  Entity: {ent.text}, Label: {ent.label_}")
    print("-" * 50)

In [154]:
# Initialize 'count' column to None
# Placeholder column; the following cells fill it with per-review PERSON entity counts.
df_reviews['count'] = None

In [155]:
# Process each review and count the number of PERSON entities.
# The counts are collected in a list and assigned to the column in one step.
# Indexing df_reviews['count'] by the review text (chained assignment keyed on
# a string that is not an index label) does not write to the intended rows.
# `reviews` was built from df_reviews["review"] in row order, so the list lines
# up with the DataFrame rows positionally.
person_counts = []
for doc in nlp.pipe(reviews):
    person_counts.append(sum(1 for ent in doc.ents if ent.label_ == "PERSON"))
df_reviews['count'] = person_counts

In [158]:
# Walk the DataFrame row by row and store how many PERSON entities each review has
for row_label, record in df_reviews.iterrows():
    parsed = nlp(record["review"])  # run the spaCy pipeline on the review text
    person_total = sum(1 for entity in parsed.ents if entity.label_ == "PERSON")
    df_reviews.at[row_label, "count"] = person_total  # write by index label
print(df_reviews)

                                                 review  label count
0     the rock is destined to be the 21st century's ...      1     2
1     the gorgeously elaborate continuation of " the...      1     2
2                        effective but too-tepid biopic      1     0
3     if you sometimes like to go to the movies to h...      1     0
4     emerges as something rare , an issue movie tha...      1     0
...                                                 ...    ...   ...
8525  any enjoyment will be hinge from a personal th...      0     0
8526  if legendary shlockmeister ed wood had ever ma...      0     2
8527  hardly a nuanced portrait of a young woman's b...      0     0
8528    interminably bleak , to say nothing of boring .      0     0
8529  things really get weird , though not particula...      0     0

[8530 rows x 3 columns]


In [49]:
# Keep the positive reviews by dropping every row whose label is 0,
# then tabulate how often each PERSON-entity count occurs.
is_negative = df_reviews['label'] == 0
df_reviews_positive = df_reviews.drop(index=df_reviews.loc[is_negative].index)
df_reviews_positive['count'].value_counts()

count
0    3626
1     528
2      94
3      13
4       3
5       1
Name: count, dtype: int64

In [50]:
# Keep the negative reviews by dropping every row whose label is 1,
# then tabulate how often each PERSON-entity count occurs.
is_positive = df_reviews['label'] == 1
df_reviews_negative = df_reviews.drop(index=df_reviews.loc[is_positive].index)
df_reviews_negative['count'].value_counts()

count
0    3710
1     489
2      55
3      10
4       1
Name: count, dtype: int64

In [51]:
# Initialize 'all_entities_count' column to None
# Placeholder column; the next cell fills it with the total entity count per review.
df_reviews['all_entities_count'] = None

In [52]:
# Store, for every review, how many named entities (of any label) spaCy finds
for row_label, record in df_reviews.iterrows():
    parsed = nlp(record["review"])
    entity_total = sum(1 for _ in parsed.ents)
    df_reviews.at[row_label, "all_entities_count"] = entity_total
print(df_reviews)

                                                 review  label count  \
0     the rock is destined to be the 21st century's ...      1     2   
1     the gorgeously elaborate continuation of " the...      1     2   
2                        effective but too-tepid biopic      1     0   
3     if you sometimes like to go to the movies to h...      1     0   
4     emerges as something rare , an issue movie tha...      1     0   
...                                                 ...    ...   ...   
8525  any enjoyment will be hinge from a personal th...      0     0   
8526  if legendary shlockmeister ed wood had ever ma...      0     2   
8527  hardly a nuanced portrait of a young woman's b...      0     0   
8528    interminably bleak , to say nothing of boring .      0     0   
8529  things really get weird , though not particula...      0     0   

     all_entities_count  
0                     3  
1                     2  
2                     0  
3                     0  
4    

In [53]:
# Keep the positive reviews by dropping every row whose label is 0,
# then tabulate how often each total-entity count occurs.
is_negative = df_reviews['label'] == 0
df_reviews_positive2 = df_reviews.drop(index=df_reviews.loc[is_negative].index)
df_reviews_positive2['all_entities_count'].value_counts()

all_entities_count
0    2639
1    1108
2     381
3     104
4      22
5       8
6       2
7       1
Name: count, dtype: int64

In [54]:
# Keep the negative reviews by dropping every row whose label is 1,
# then tabulate how often each total-entity count occurs.
is_positive = df_reviews['label'] == 1
df_reviews_negative2 = df_reviews.drop(index=df_reviews.loc[is_positive].index)
df_reviews_negative2['all_entities_count'].value_counts()

all_entities_count
0    2850
1    1017
2     286
3      83
4      24
5       4
6       1
Name: count, dtype: int64

In [164]:
# Import the IMDB dataset
# Reads the same CSV file as the earlier `data` frame into a separate DataFrame,
# which the following cells rename and encode.
imdb_df = pd.read_csv("data/IMDB_Dataset.csv")
# Show the first rows as the cell output.
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [165]:
# Align the IMDB column names with the Rotten Tomatoes frame: review -> text, sentiment -> label
column_names = {'review': 'text', 'sentiment': 'label'}
imdb_df = imdb_df.rename(columns=column_names)
imdb_df.head()

Unnamed: 0,text,label
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [166]:
# Encode the label strings as integers: positive -> 1, negative -> 0.
# Series.map yields NaN for values missing from the mapping, so running this
# cell a second time on already-encoded data would blank the column.
label_codes = {'positive': 1, 'negative': 0}
imdb_df['label'] = imdb_df['label'].map(label_codes)
imdb_df.head()

Unnamed: 0,text,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [167]:
# Features are the review text, the target is the 0/1 label
X, y = imdb_df['text'], imdb_df['label']
# 80/20 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [168]:
# Hand-picked stop words passed to TfidfVectorizer(stop_words=...) in the
# pipeline cells below.
custom_stopwords = [
    'a', 'about', 'an', 'and', 'are', 'as', 'at',
    'be', 'been', 'but', 'by',
    'can',
    'even', 'ever',
    'for', 'from',
    'get',
    'had', 'has', 'have', 'he', 'her', 'hers', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'it', 'its',
    'just',
    'me', 'my',
    'of', 'on', 'or',
    'see', 'seen', 'she', 'so',
    'than', 'that', 'the', 'their', 'there', 'they', 'this', 'to',
    'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with',
    'you',
]

In [169]:
# TF-IDF (unigrams + bigrams, custom stop words) feeding a linear SVM
text_clf = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(
                stop_words=custom_stopwords,
                ngram_range=(1, 2),
                max_df=0.9,
                min_df=2,
                sublinear_tf=True,
            ),
        ),
        ('clf', LinearSVC(C=1, loss='squared_hinge', penalty='l2', dual=True)),
    ]
)

# Train the vectorizer and classifier together on the raw training text
text_clf.fit(X_train, y_train)

In [170]:
# Same TF-IDF settings as text_clf, but with a Multinomial Naive Bayes classifier
text_clf2 = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(
                stop_words=custom_stopwords,
                ngram_range=(1, 2),
                max_df=0.9,
                min_df=2,
                sublinear_tf=True,
            ),
        ),
        ('clf', MultinomialNB()),
    ]
)

# Train the vectorizer and classifier together on the raw training text
text_clf2.fit(X_train, y_train)

In [171]:
# Accuracy of the LinearSVC pipeline on the data it was trained on and on the held-out split
svc_train_acc = text_clf.score(X_train, y_train)
print('Train accuracy:', svc_train_acc)
svc_test_acc = text_clf.score(X_test, y_test)
print('Test accuracy:', svc_test_acc)

Train accuracy: 0.9998
Test accuracy: 0.9171


In [172]:
# Accuracy of the Naive Bayes pipeline on the data it was trained on and on the held-out split
nb_train_acc = text_clf2.score(X_train, y_train)
print('Train accuracy:', nb_train_acc)
nb_test_acc = text_clf2.score(X_test, y_test)
print('Test accuracy:', nb_test_acc)

Train accuracy: 0.961275
Test accuracy: 0.8938


In [173]:
# Test-set predictions of the LinearSVC pipeline, followed by the confusion
# matrix, the per-class report and the overall accuracy (printed in that order)
predictions = text_clf.predict(X_test)
for summarize in (
    metrics.confusion_matrix,
    metrics.classification_report,
    metrics.accuracy_score,
):
    print(summarize(y_test, predictions))

[[4508  453]
 [ 376 4663]]
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      4961
           1       0.91      0.93      0.92      5039

    accuracy                           0.92     10000
   macro avg       0.92      0.92      0.92     10000
weighted avg       0.92      0.92      0.92     10000

0.9171


In [174]:
# Test-set predictions of the Naive Bayes pipeline, followed by the confusion
# matrix, the per-class report and the overall accuracy (printed in that order)
predictions = text_clf2.predict(X_test)
for summarize in (
    metrics.confusion_matrix,
    metrics.classification_report,
    metrics.accuracy_score,
):
    print(summarize(y_test, predictions))

[[4497  464]
 [ 598 4441]]
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      4961
           1       0.91      0.88      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

0.8938


In [175]:
# Spot-check the LinearSVC pipeline on a hand-written negative review
review_text = "I thought the movie was slow and I wasn't very engaged"
sample = text_clf.predict([review_text])
sample

array([0], dtype=int64)

In [176]:
# Spot-check the Naive Bayes pipeline on a hand-written negative review
review_text = "I thought the movie was slow and I wasn't very engaged"
sample = text_clf2.predict([review_text])
sample

array([0], dtype=int64)

In [177]:
# Spot-check the LinearSVC pipeline on a hand-written positive review
review_text = "This movie brought me so much joy. I loved the part with the animals"
sample = text_clf.predict([review_text])
sample

array([1], dtype=int64)

In [178]:
# Spot-check the Naive Bayes pipeline on a hand-written positive review
review_text = "This movie brought me so much joy. I loved the part with the animals"
sample = text_clf2.predict([review_text])
sample

array([1], dtype=int64)

In [179]:
# Save the model
# The fitted TF-IDF vectorizer and the LinearSVC classifier are pulled out of
# the text_clf pipeline and written to separate joblib files.
vectorizer = text_clf.named_steps['tfidf']
joblib.dump(vectorizer, 'vectorizer.pkl')
# Use a dedicated name for the classifier. Rebinding the global `model` here
# would replace the Keras LSTM that predict_review() looks up at call time.
svc_model = text_clf.named_steps['clf']
joblib.dump(svc_model, 'model.pkl')

['model.pkl']

In [180]:
#Load the saved model
# Reads back the two files written by the save cell: the TF-IDF vectorizer and
# the classifier taken from the text_clf pipeline.
new_vectorizer = joblib.load('vectorizer.pkl')
new_model = joblib.load('model.pkl')

In [159]:
# Load the rt dataset
# Loaded with the `datasets` library; per the download log below it provides
# train, validation and test splits.
rt = load_dataset("cornell-movie-review-data/rotten_tomatoes")

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [160]:
# Extract the train dataset from the rt
# Converted to a pandas DataFrame; the cells below read its 'text' and 'label' columns.
rt_df = rt['train'].to_pandas()

In [161]:
# Preview the first rows of the Rotten Tomatoes training frame.
rt_df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [181]:
# The review text of the Rotten Tomatoes frame is in its 'text' column
new_texts = rt_df['text'].astype(str)  # Convert to string if necessary

# Transform using the existing vectorizer
# Only transform() is called, so the reloaded vectorizer is applied as saved
# and is not refitted on the Rotten Tomatoes text.
new_features = new_vectorizer.transform(new_texts)

In [182]:
# Ground-truth labels of the Rotten Tomatoes training split
y_true = rt_df['label']

# Labels predicted by the reloaded classifier from the TF-IDF features
y_pred = new_model.predict(new_features)

# Overall accuracy, then the per-class precision / recall / f1 table
accuracy = accuracy_score(y_true, y_pred)
rt_report = classification_report(y_true, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(rt_report)

Accuracy: 0.7462
              precision    recall  f1-score   support

           0       0.77      0.70      0.73      4265
           1       0.72      0.80      0.76      4265

    accuracy                           0.75      8530
   macro avg       0.75      0.75      0.75      8530
weighted avg       0.75      0.75      0.75      8530



In [183]:
# Load the saved vectorizer and transform the Rotten Tomatoes text with it
vectorizer = joblib.load('vectorizer.pkl')
X = vectorizer.transform(rt_df['text'])  # Text features
y = rt_df['label']  # Target labels

# Initialize MultinomialNB under its own name. Rebinding the global `model`
# here would replace the Keras LSTM that predict_review() looks up at call time.
nb_model = MultinomialNB()

# Perform 5-fold cross-validation (you can adjust the number of folds)
cv_scores = cross_val_score(nb_model, X, y, cv=5, scoring='accuracy')

# Output the accuracy scores for each fold
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()} ± {cv_scores.std()}")

Cross-validation scores: [0.76143025 0.77784291 0.7919109  0.78077374 0.77256741]
Mean accuracy: 0.7769050410316529 ± 0.009993063925498744


In [184]:
#Download Vader package
# NOTE(review): the import cell imports SentimentIntensityAnalyzer from
# nltk.sentiment and then again from vaderSentiment, so the later import is the
# one bound to that name — confirm whether this nltk download is still needed.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\chees\AppData\Roaming\nltk_data...


True

In [187]:
# Basic Gradio interface for the VADER analyzer.
# vader_review returns two values (sentiment label, compound score), so two
# output components are declared, one per returned value.
# NOTE: vader_review must already be defined when this cell runs; in this
# notebook its definition sits in a later cell.
func = gr.Interface(
    fn=vader_review,
    inputs=gr.Textbox(
        label="Enter your review here",
        lines=3,
        max_lines=5,
        interactive=True,
    ),
    outputs=[
        gr.Textbox(label='Review'),
        gr.Number(label='Compound score'),
    ],
)
func.launch(share=True)

* Running on local URL:  http://127.0.0.1:7866

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [193]:
# Better gradio interface


# VADER-based sentiment function: returns a (label, compound score) pair
def vader_review(text):
    """Label `text` as positive, negative or neutral from its compound score.

    A new SentimentIntensityAnalyzer is created on each call and the
    'compound' entry of its polarity scores is read.

    Returns:
        A 2-tuple (label, score): the label is "😊 Positive" for a score of at
        least 0.05, "😠 Negative" for a score of at most -0.05, and
        "😐 Neutral" for anything in between.
    """
    compound = SentimentIntensityAnalyzer().polarity_scores(text)['compound']
    if compound >= 0.05:
        verdict = "😊 Positive"
    elif compound <= -0.05:
        verdict = "😠 Negative"
    else:
        verdict = "😐 Neutral"
    return verdict, compound

# Improved Gradio Interface for the VADER analyzer
with gr.Blocks(css=".gradio-container {background-color: #df3120; font-family: Arial;}") as demo:
    gr.Markdown("## 🌟 Vader 🌟")
    gr.Markdown("Enter your review, and we'll analyze its sentiment!")

    with gr.Row():
        input_text = gr.Textbox(
            label="📝 Enter your review",
            lines=3,
            max_lines=5,
            interactive=True,
            placeholder="Type your review here..."
        )

    analyze_button = gr.Button("🔍 Analyze Sentiment")

    with gr.Row():
        output_label = gr.Textbox(label="🔽 Sentiment Result", interactive=False)
        # vader_review returns (label, compound score); the score gets its own
        # read-only component so both returned values are displayed.
        output_score = gr.Number(label="📊 Compound Score", interactive=False)

    # One output component per value returned by vader_review
    analyze_button.click(vader_review, input_text, [output_label, output_score])

demo.launch()

* Running on local URL:  http://127.0.0.1:7872

To create a public link, set `share=True` in `launch()`.




In [194]:
# Sentiment function backed by the text_clf pipeline; returns only the label text
def mj_review(text):
    """Classify one review string with the `text_clf` pipeline.

    Returns:
        "😊 Positive" when the predicted class is 1, "😠 Negative" when it is
        0, and None for any other predicted value.
    """
    predicted_class = text_clf.predict([text])[0]
    if predicted_class == 1:
        return "😊 Positive"
    if predicted_class == 0:
        return "😠 Negative"
    return None

# Styled Gradio app ("Skywalker") around mj_review
with gr.Blocks(css=".gradio-container {background-color: #3e8b4f; font-family: Arial;}") as demo:
    gr.Markdown("## 🌟 Skywalker 🌟")
    gr.Markdown("Enter your review, and we'll analyze its sentiment!")

    # Input row: the review text
    with gr.Row():
        input_text = gr.Textbox(
            label="📝 Enter your review",
            lines=3,
            max_lines=5,
            interactive=True,
            placeholder="Type your review here...",
        )

    analyze_button = gr.Button("🔍 Analyze Sentiment")

    # Output row: read-only box holding the predicted sentiment
    with gr.Row():
        output_label = gr.Textbox(label="🔽 Sentiment Result", interactive=False)

    # Clicking the button sends the text to mj_review and shows its result
    analyze_button.click(fn=mj_review, inputs=input_text, outputs=output_label)

demo.launch()

* Running on local URL:  http://127.0.0.1:7873

To create a public link, set `share=True` in `launch()`.


