In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from pandas_datareader import data as web
import datetime
from statistics import mean,median,mode,stdev
import re
import json
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import nltk
import re
from nltk.corpus import stopwords

### Project: Sentiment Analysis on Movie Reviews

Objective: The goal of this project is to build a sentiment analysis model that can classify movie reviews as either positive or negative.

#### Steps:

<b>Data Collection</b>: Use the IMDB movie reviews dataset. This dataset contains 50,000 movie reviews that are labeled as either positive or negative. You can download the dataset from this link.

<b>Data Cleaning</b>: Clean the text data by removing unnecessary characters, converting all text to lowercase, and removing stop words.

<b>Data Exploration</b>: Explore the dataset to understand the distribution of positive and negative reviews. You can also explore the most common words in positive and negative reviews.

<b>Feature Extraction</b>: Use techniques like Bag of Words, TF-IDF, or word embeddings to convert the text data into numerical features that can be used by a machine learning model.

<b>Model Building</b>: Build a machine learning model to classify the reviews. You can start with a simple model like Logistic Regression, and then try more complex models like SVM or Random Forest. You can also experiment with deep learning models like LSTM or GRU.

<b>Model Evaluation</b>: Evaluate the performance of your model using appropriate metrics like accuracy, precision, recall, and F1 score. Also, create a confusion matrix to understand the performance of your model in more detail.

<b>Hyperparameter Tuning</b>: Tune the hyperparameters of your model to improve its performance.

<b>Model Interpretation</b>: Try to understand why your model is making certain predictions. You can use techniques like SHAP or LIME for model interpretation.

<i>Tools: Python, Pandas, Scikit-learn, NLTK, Keras</i>



### 1. Data Cleaning 

In [3]:
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Download stopwords from nltk, do so
nltk.download('stopwords')

# Convert to lowercase
df['review_cleaned'] = df['review'].str.lower()

# Remove punctuation and any other non-alphabet characters
df['review_cleaned'] = df['review_cleaned'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))

# Remove stopwords
stopwords = set(stopwords.words('english'))
df['review_cleaned'] = df['review_cleaned'].apply(lambda x: ' '.join(word for word in x.split() if word not in stopwords))
df = df.drop('review', axis=1)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 2. EDA

In [5]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [6]:
# average word counts of various reviews
df['word_count'] = df['review_cleaned'].apply(lambda x: len(str(x).split()))
positive_ave = df[df['sentiment'] == 'positive']['word_count'].mean()
negative_ave = df[df['sentiment'] == 'negative']['word_count'].mean()

print("positive average word count is", positive_ave)
print("negative average word count is", negative_ave)

positive average word count is 122.84004
negative average word count is 120.27868


In [7]:
df['word_count'].describe()

count    50000.000000
mean       121.559360
std         91.591626
min          3.000000
25%         65.000000
50%         90.000000
75%        148.000000
max       1440.000000
Name: word_count, dtype: float64

In [8]:
from collections import Counter
df['most_common_words'] = df['review_cleaned'].apply(lambda x: Counter(str(x).split()).most_common())

print(df['most_common_words'])

0        [(oz, 5), (violence, 4), (br, 3), (show, 3), (...
1        [(br, 6), (well, 3), (little, 2), (production,...
2        [(thought, 2), (comedy, 2), (may, 2), (br, 2),...
3        [(jake, 4), (parents, 3), (br, 3), (movie, 3),...
4        [(one, 6), (br, 5), (mr, 3), (people, 3), (dif...
                               ...                        
49995    [(movie, 4), (like, 2), (always, 2), (classic,...
49996    [(bad, 4), (br, 2), (better, 2), (watch, 2), (...
49997    [(catholic, 3), (tragedy, 3), (taught, 2), (ce...
49998    [(one, 4), (like, 2), (im, 1), (going, 1), (di...
49999    [(movie, 7), (movies, 3), (far, 2), (even, 2),...
Name: most_common_words, Length: 50000, dtype: object


In [9]:
def common_words(df):
    common_words = []
    for i in range(len(df)):
        common_words.append(df['most_common_words'].iloc[i][0])


    max_values = {}
    for item in common_words:
        first_value = item[0]
        second_value = item[1]
        if first_value in max_values:
            if second_value > max_values[first_value][1]:
                max_values[first_value] = item
        else:
            max_values[first_value] = item

    common_words = list(max_values.values())

    #sort values
    common_words = sorted(common_words, key=lambda x: x[1], reverse=True)

    return common_words

In [10]:
df_neg = df[df['sentiment'] == 'negative']
df_pos = df[df['sentiment'] == 'positive']   

df_common_words = pd.DataFrame()
df_common_words['positive'] = common_words(df_pos)
df_common_words['negative'] = pd.Series(common_words(df_neg))


df_common_words.head(20)

# not really the most useful ngl

Unnamed: 0,positive,negative
0,"(br, 59)","(br, 80)"
1,"(marty, 36)","(like, 31)"
2,"(tony, 35)","(victor, 31)"
3,"(rob, 34)","(movie, 28)"
4,"(custer, 33)","(film, 27)"
5,"(titanic, 33)","(zombie, 27)"
6,"(match, 30)","(jesse, 27)"
7,"(scarlett, 30)","(trivialboring, 26)"
8,"(sam, 29)","(puppet, 25)"
9,"(ring, 27)","(timon, 25)"


### Feature Extraction

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X1 = vectorizer.fit_transform(df['review_cleaned'])

# Now, X is a matrix where each row corresponds to a document and each column is a word from the vocabulary.
# The entries of this matrix are the frequencies of each word in each document.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X2 = vectorizer.fit_transform(df['review_cleaned'])

# Now, X is a matrix where each row corresponds to a document and each column is a word from the vocabulary.
# The entries of this matrix are the TF-IDF scores of each word in each document.


#### Explanation of TFIDF:

Term Frequency-Inverse Document Frequency. It's a numerical statistic that reflects how important a word is to a document in a collection or corpus.

Term Frequency (TF): This summarizes how often a given word appears within a document.

Inverse Document Frequency (IDF): This downscales words that appear a lot across documents. A word is not of much use to us if it’s appearing in all the documents.

In [17]:
import gensim

# Train Word2Vec model
model = gensim.models.Word2Vec(df['review_cleaned'], min_count=1)

# Now, model is a Word2Vec model that can be used to create word embeddings.
# You can get the embedding of a word by calling model[word].


### Model Building

Model Building: Build a machine learning model to classify the reviews. You can start with a simple model like Logistic Regression, and then try more complex models like SVM or Random Forest. You can also experiment with deep learning models like LSTM or GRU.

Model Evaluation: Evaluate the performance of your model using appropriate metrics like accuracy, precision, recall, and F1 score. Also, create a confusion matrix to understand the performance of your model in more detail.

Hyperparameter Tuning: Tune the hyperparameters of your model to improve its performance.

Model Interpretation: Try to understand why your model is making certain predictions. You can use techniques like SHAP or LIME for model interpretation.

In [19]:
df.columns = ['labels','reviews','word_count','common_words']

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Assuming df is your DataFrame, 'reviews' is the column with the reviews, and 'labels' is the column with the labels
X = df['reviews']
y = df['labels']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create a pipeline that first transforms the text data into TF-IDF features, then trains a logistic regression model
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = pipeline.predict(X_test)


### Model Evaluation

In [22]:
# Print a classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.91      0.88      0.89      5044
    positive       0.88      0.91      0.89      4956

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [23]:
pipeline.predict(X_test.iloc[0:2])

array(['negative', 'negative'], dtype=object)

In [29]:
df_own = pd.DataFrame()
df_own['reviews'] = ['this is a terrible movie lmao','this is a great amazing awesome movie','some good parts but overall terrible','this movie is somewhat decent, but not that great too','absolutely incredible','absolute trash']
df_own['predictions'] = pipeline.predict(df_own['reviews'])
df_own


Unnamed: 0,reviews,predictions
0,this is a terrible movie lmao,negative
1,this is a great amazing awesome movie,positive
2,some good parts but overall terrible,negative
3,"this movie is somewhat decent, but not that gr...",positive
4,absolutely incredible,positive
5,absolute trash,negative


In [30]:
# topic modelling
import gensim
from gensim import corpora

# Sample documents
documents = [
    "Machine learning is an exciting field with numerous applications.",
    "Natural language processing helps in understanding and generating human language.",
    "Deep learning is a subfield of machine learning that focuses on neural networks.",
    "Topic modeling can discover hidden topics in a collection of documents.",
    "Python is a popular programming language for data analysis and machine learning."
]

# Tokenize the documents
tokenized_docs = [doc.split() for doc in documents]

# Remove stopwords; using a function because we're overkill
def remove_elements(list_of_lists, removal_list):
    return [[item for item in sublist if item not in removal_list] for sublist in list_of_lists]

td = remove_elements(tokenized_docs,stopwords)

# Create a dictionary from the tokenized documents
dictionary = corpora.Dictionary(td)

# Create a document-term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Build the LDA model
lda_model = gensim.models.LdaModel(doc_term_matrix, num_topics=2, id2word=dictionary, passes=10)

# Print the topics
for idx, topic in lda_model.print_topics():
    print(f"Topic #{idx+1}: {topic}")


Topic #1: 0.093*"learning" + 0.040*"machine" + 0.040*"neural" + 0.040*"subfield" + 0.040*"focuses" + 0.040*"Deep" + 0.040*"networks." + 0.040*"exciting" + 0.040*"field" + 0.040*"applications."
Topic #2: 0.077*"language" + 0.046*"machine" + 0.046*"Natural" + 0.046*"helps" + 0.046*"processing" + 0.046*"language." + 0.046*"generating" + 0.046*"human" + 0.046*"understanding" + 0.046*"programming"
