<h1 align="center">Machine Learning for NLP</h1>
    <h2 align="center">IMDB Sentiment Analysis</h2>
    <h3 align="center">Zahra Amini</h3>
<div style="width: 100%; text-align: center;">
    <table>
        <tr>
            <td>
                <a class="link" href="https://t.me/Zahraamini_ai">Telegram</a><br>
                <a class="link" href="https://www.linkedin.com/in/zahraamini-ai/">LinkedIn</a><br>
                <a class="link" href="https://www.youtube.com/@AcademyHobot">YouTube</a><br>
            </td>
            <td>
                <a class="link" href="https://github.com/aminizahra">GitHub</a><br>
                <a class="link" href="https://www.kaggle.com/aminizahra">Kaggle</a><br>
                <a class="link" href="https://www.instagram.com/zahraamini_ai/">Instagram</a><br>
            </td>
        </tr>
    </table>
</div>

# import Libraries

In [1]:
import pandas as pd
import re
import string
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [3]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.tokenize import word_tokenize
import contractions

### Download necessary resources from NLTK

In [7]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt")  # For tokenization

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Data

In [None]:
# Dataset URL: https://ai.stanford.edu/~amaas/data/sentiment/

In [9]:
def load_imdb_data(base_path, dataset_type="train"):
    data = {"review":[], "sentiment":[]}
    dataset_path = os.path.join(base_path, dataset_type)

    for sentiment in ["pos", "neg"]:
        folder_path = os.path.join(dataset_path, sentiment)
        sentiment_label = 1 if sentiment == "pos" else 0

        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                review = file.read().strip()
                data["review"].append(review)
                data["sentiment"].append(sentiment_label)
    return pd.DataFrame(data)

In [11]:
# Define the path to the dataset
base_path = r"C:\PC\MyCourses\ML_NLP_Filoger\Session09&10\aclImdb"

### Load the training dataset

In [13]:
train_data = load_imdb_data(base_path, dataset_type="train")

In [27]:
train_data

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
...,...,...
24995,"Towards the end of the movie, I felt it was to...",0
24996,This is the kind of movie that my enemies cont...,0
24997,I saw 'Descent' last night at the Stockholm Fi...,0
24998,Some films that you pick up for a pound turn o...,0


In [29]:
train_data["sentiment"].value_counts()

sentiment
1    12500
0    12500
Name: count, dtype: int64

### Load the test dataset

In [14]:
test_data = load_imdb_data(base_path, dataset_type="test")

# Preprocess Text

### Initialize the lemmatizer and stop words

In [15]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

### Add domain-specific stop words

In [17]:
additional_stopwords = {"movie", "film", "character", "story", "plot", "series"}
stop_words.update(additional_stopwords)

In [18]:
# Path to dataset and vocab file
base_path = r"C:\PC\MyCourses\ML_NLP_Filoger\Session09&10\aclImdb"
vocab_file_path = os.path.join(base_path, "imdb.vocab")

# Load the vocabulary into a set
# Load the vocabulary file with utf-8 encoding
with open(vocab_file_path, "r", encoding="utf-8") as f:
    vocab = set(f.read().splitlines())

# print(f"Loaded {len(vocab)} words from the vocabulary.")

### Preprocess Function

In [19]:
def expand_contractions(text):
    return contractions.fix(text) #I'm -> I am

In [25]:
def preprocess_text(text, vocab):
    #S1
    text = expand_contractions(text)
    #S2
    text = re.sub(r"<.*?>", "", text)
    #S3
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    #S4 @ !
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    #S5
    text = text.lower()
    #S6
    tokens = word_tokenize(text)
    #S7
    tokens = [lemmatizer.lemmatize(word)
             for word in tokens
             if word in vocab and word not in stop_words and len(word)>2]
    #S8
    cleaned_text = " ".join(tokens)
    return cleaned_text

### Apply preprocessing and vocabulary filtering

In [31]:
train_data["cleaned_review"] = train_data["review"].apply(lambda x: preprocess_text(x, vocab))

In [32]:
train_data

Unnamed: 0,review,sentiment,cleaned_review
0,Bromwell High is a cartoon comedy. It ran at t...,1,bromwell high cartoon comedy ran time program ...
1,Homelessness (or Houselessness as George Carli...,1,homelessness houselessness george carlin state...
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,brilliant overacting lesley ann warren best dr...
3,This is easily the most underrated film inn th...,1,easily underrated inn brook cannon sure flawed...
4,This is not the typical Mel Brooks film. It wa...,1,typical mel brook much less slapstick movie ac...
...,...,...,...
24995,"Towards the end of the movie, I felt it was to...",0,towards end felt technical felt like classroom...
24996,This is the kind of movie that my enemies cont...,0,kind enemy content watch time bloody true watc...
24997,I saw 'Descent' last night at the Stockholm Fi...,0,saw descent last night stockholm festival one ...
24998,Some films that you pick up for a pound turn o...,0,film pick pound turn rather good century film ...


# X, y

In [35]:
# X = df.drop("target")
# y = df["target"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = train_data["cleaned_review"]
y_train = train_data["sentiment"]

In [45]:
X_train

0        bromwell high cartoon comedy ran time program ...
1        homelessness houselessness george carlin state...
2        brilliant overacting lesley ann warren best dr...
3        easily underrated inn brook cannon sure flawed...
4        typical mel brook much less slapstick movie ac...
                               ...                        
24995    towards end felt technical felt like classroom...
24996    kind enemy content watch time bloody true watc...
24997    saw descent last night stockholm festival one ...
24998    film pick pound turn rather good century film ...
24999    one dumbest film ever seen rip nearly ever typ...
Name: cleaned_review, Length: 25000, dtype: object

In [43]:
# y_train

# Extract features using TF-IDF

In [47]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

In [49]:
X_train_tfidf = vectorizer.fit_transform(X_train)

In [50]:
X_train_tfidf

<25000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 2176378 stored elements in Compressed Sparse Row format>

# Naive Bayes classifier Model

## Train

In [53]:
model = MultinomialNB(alpha=0.1)

In [57]:
model.fit(X_train_tfidf, y_train)

## Evaluate the model's performance on the train data

In [71]:
y_train_pred = model.predict(X_train_tfidf)
y_train_prob = model.predict_proba(X_train_tfidf)[:,1]

In [73]:
y_train_pred

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [75]:
y_train_prob

array([0.46553759, 0.72270954, 0.68320921, ..., 0.2232797 , 0.03586977,
       0.0045004 ])

In [77]:
train_accuracy = accuracy_score(y_train, y_train_pred)

In [79]:
train_accuracy

0.88728

In [81]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.90      0.87      0.89     12500
           1       0.88      0.90      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000



## Predict on test data

In [83]:
test_data["cleaned_review"] = test_data["review"].apply(lambda x: preprocess_text(x, vocab))

In [84]:
X_test_tfidf = vectorizer.transform(test_data["cleaned_review"])

In [87]:
y_test_pred = model.predict(X_test_tfidf)
y_test_prob = model.predict_proba(X_test_tfidf)[:,1]

## Evaluate the model's performance on the test data

In [91]:
# Evaluate the model's performance on the test data
print("Evaluating the model...")
accuracy = accuracy_score(test_data["sentiment"], y_test_pred)
print(f"Accuracy on test data: {accuracy:.4f}")

# Calculate ROC-AUC (if probabilities are available)
test_roc_auc = roc_auc_score(test_data["sentiment"], y_test_prob)
print(f"ROC-AUC on testing data: {test_roc_auc:.4f}")

# Display the classification report
print("\nClassification Report:")
print(classification_report(test_data["sentiment"], y_test_pred))

Evaluating the model...
Accuracy on test data: 0.8482
ROC-AUC on testing data: 0.9253

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85     12500
           1       0.86      0.84      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

