In [20]:
import numpy as np
import pandas as pd
import re
import spacy
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
pd.options.mode.chained_assignment = None
from nltk.corpus import wordnet
import os
import shutil

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, FastText
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm 

In [21]:
!pip install gensim



In [22]:
google_model = api.load("word2vec-google-news-300")

In [10]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv',encoding="ISO-8859-1")

In [11]:
def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)
        
    text = text.lower()
    
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    text = text.replace("br", "").replace("<", "").replace(">", "")

    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)

df['processed_text'] = df['review'].apply(preprocess_text)

print("Original vs Preprocessed Text:")
for i in range(3):
    print(f"\nOriginal: {df['review'].iloc[i][:100]}...")
    print(f"Processed: {df['processed_text'].iloc[i][:100]}...")

Original vs Preprocessed Text:

Original: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. The...
Processed: one reviewer mentioned watching oz episode youll hooked right exactly happened first thing struck oz...

Original: A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-B...
Processed: wonderful little production filming technique unassuming oldtimebbc fashion give comforting sometime...

Original: I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air con...
Processed: thought wonderful way spend time hot summer weekend sitting air conditioned theater watching lighthe...


In [12]:
X = df['processed_text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nData split into training ({len(X_train)} samples) and testing ({len(X_test)} samples).")


Data split into training (40000 samples) and testing (10000 samples).


In [15]:
def get_average_word_vector(tokens, wv_model, vector_size=100):
    if hasattr(wv_model, 'wv'):
        vectors = [wv_model.wv[word] for word in tokens if word in wv_model.wv]
    else:
        vectors = [wv_model[word] for word in tokens if word in wv_model]
    
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

In [23]:
df['average_word_vector_google'] = df['processed_text'].apply(lambda x: get_average_word_vector(x.split(), google_model, 300))

X = df['average_word_vector_google'].dropna().tolist()
y = df['sentiment'].dropna().tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

google_classifier = LogisticRegression()
google_classifier.fit(X_train, y_train)

google_y_pred = google_classifier.predict(X_test)

google_accuracy = accuracy_score(y_test, google_y_pred)
print(f"Google News Accuracy: {google_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, google_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, google_y_pred))

Google News Accuracy: 0.8504
Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.85      0.85      4961
    positive       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix:
[[4211  750]
 [ 746 4293]]


In [24]:
from gensim.models import Word2Vec
from gensim.models import FastText

sent = df['processed_text'].apply(lambda x: x.split())

In [25]:
sg_model = Word2Vec(sentences=sent, vector_size=100, window=3, min_count=1, sg=1, workers=4)

df['average_word_vector_sg'] = df['processed_text'].apply(lambda x: get_average_word_vector(x.split(), sg_model))

X = df['average_word_vector_sg'].dropna().tolist()
y = df['sentiment'].dropna().tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

sg_classifier = LogisticRegression()
sg_classifier.fit(X_train, y_train)

sg_y_pred = sg_classifier.predict(X_test)

sg_accuracy = accuracy_score(y_test, sg_y_pred)
print(f"SG Accuracy: {sg_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, sg_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, sg_y_pred))

SG Accuracy: 0.8675
Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.86      0.87      4961
    positive       0.87      0.87      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Confusion Matrix:
[[4279  682]
 [ 643 4396]]


In [26]:
cbow_model = Word2Vec(sentences=sent, vector_size=100, window=3, min_count=1, sg=0, workers=4)

df['average_word_vector_cbow'] = df['processed_text'].apply(lambda x: get_average_word_vector(x.split(), cbow_model))

X = df['average_word_vector_cbow'].dropna().tolist()
y = df['sentiment'].dropna().tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cbow_classifier = LogisticRegression()
cbow_classifier.fit(X_train, y_train)

cbow_y_pred = cbow_classifier.predict(X_test)

cbow_accuracy = accuracy_score(y_test, cbow_y_pred)
print(f"CBOW Accuracy: {cbow_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, cbow_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, cbow_y_pred))

CBOW Accuracy: 0.8531
Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.84      0.85      4961
    positive       0.85      0.86      0.86      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix:
[[4182  779]
 [ 690 4349]]


In [27]:
ft_model = FastText(sentences=sent, vector_size=100, window=3, min_count=1, sg=1, workers=4)

df['average_word_vector_ft'] = df['processed_text'].apply(lambda x: get_average_word_vector(x.split(), ft_model))

X = df['average_word_vector_ft'].dropna().tolist()
y = df['sentiment'].dropna().tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ft_classifier = LogisticRegression()
ft_classifier.fit(X_train, y_train)

ft_y_pred = ft_classifier.predict(X_test)

ft_accuracy = accuracy_score(y_test, ft_y_pred)
print(f"FastText Accuracy: {ft_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, ft_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, ft_y_pred))

FastText Accuracy: 0.8643
Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.86      0.86      4961
    positive       0.86      0.87      0.87      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion Matrix:
[[4251  710]
 [ 647 4392]]
