In [5]:
# Text Classification either binary, multiclass, or multilabel
# Author: Muhammad Humayun Khan
# The different approaches for the text classification is using the
# 1. Machine Learning approach where text processing is done using BoW/n-grams and then algorithms like SVM, Random Forest, etc. are used.
# 2. Deep Learning approach where text processing is done using word embeddings like Word2Vec and algorithms like LSTM, CNN, etc. are used.


import pandas as pd
import numpy as np

dataset_path = 'datasets/IMDB Dataset.csv'

temp_df = pd.read_csv(dataset_path)

In [6]:
df = temp_df.iloc[:10000]

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [9]:
df['sentiment'].value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [10]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [11]:
# check for the duplicate rows
df.duplicated().sum()

17

In [13]:
# As we got the duplicate rows, we can drop them
df = df.drop_duplicates()
df.duplicated().sum()  # Check again to ensure duplicates are removed

0

In [14]:
# now applying the text preprocessing
import re   
import string
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing to the 'review' column
df['review'] = df['review'].apply(preprocess_text)
# Check the first few rows after preprocessing
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [16]:
# Now assign X and y variables
X = df.iloc[:, 0:1]
y = df['sentiment']

In [17]:
X

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production the filming tech...
2,i thought this was a wonderful way to spend ti...
3,basically theres a family where a little boy j...
4,petter matteis love in the time of money is a ...
...,...
9995,fun entertaining movie about wwii german spy j...
9996,give me a break how can anyone say that this i...
9997,this movie is a bad movie but after watching a...
9998,this is a movie that was probably made to ente...


In [18]:
y

0       positive
1       positive
2       positive
3       negative
4       positive
          ...   
9995    positive
9996    negative
9997    negative
9998    negative
9999    positive
Name: sentiment, Length: 9983, dtype: object

In [20]:
# Since the label y has english text as positive and negative, we need to convert it to numerical values
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

y

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [21]:
# Now train-test split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7986, 1), (1997, 1), (7986,), (1997,))

In [22]:
# Applying the Bag of Words (BoW) model
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the training data
X_train_bow = vectorizer.fit_transform(X_train['review'])

# Transform the test data
X_test_bow = vectorizer.transform(X_test['review'])

# Check the shape of the transformed data
X_train_bow.shape, X_test_bow.shape



((7986, 72267), (1997, 72267))

In [23]:
# Now apply the Guassian Naive Bayes algorithm
from sklearn.naive_bayes import GaussianNB

# Initialize the Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Fit the model on the training data
gnb.fit(X_train_bow.toarray(), y_train)



0,1,2
,priors,
,var_smoothing,1e-09


In [25]:
# Now predict the sentiment of the test data
y_pred = gnb.predict(X_test_bow.toarray())

# Evaluate the model's performance
from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)



0.6299449173760641

In [26]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[670, 315],
       [424, 588]], dtype=int64)

In [27]:
# Now apply the Random Forest Classifier Algorithm
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
rf_classifier.fit(X_train_bow, y_train)

# Predict the sentiment of the test data
y_pred_rf = rf_classifier.predict(X_test_bow)

# Evaluate the model's performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)


In [28]:
accuracy_rf

0.8412618928392589

In [29]:
# let's use the TF-IDF vectorization instead of BoW
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['review'])

# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(X_test['review'])

# Now apply the Random Forest Classifier Algorithm with TF-IDF
rf_classifier_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
rf_classifier_tfidf.fit(X_train_tfidf, y_train)

# Predict the sentiment of the test data
y_pred_rf_tfidf = rf_classifier_tfidf.predict(X_test_tfidf)

# Evaluate the model's performance with TF-IDF
accuracy_rf_tfidf = accuracy_score(y_test, y_pred_rf_tfidf)

accuracy_rf_tfidf

0.8407611417125689

In [None]:
# 2. Deep Learning Approach using Word2Vec
import gensim
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.utils import simple_preprocess


In [31]:
# tokenize the sentences and preprocess them
story = []
for doc in df['review']:
    sentences = sent_tokenize(doc)
    for sentence in sentences:
        words = simple_preprocess(sentence)
        story.append(words)


In [32]:
# Create the Word2Vec model
word2vec_model = gensim.models.Word2Vec(window=10, min_count=2)

In [33]:
# build the vocabulary
word2vec_model.build_vocab(story)


In [34]:
# train the Word2Vec model
word2vec_model.train(story, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.epochs)

(8085062, 10745090)

In [35]:
# total words in the vocabulary
len(word2vec_model.wv.index_to_key)

35248

In [36]:
# Now i want to get the vector of the review
def document_vector(doc):
    # Remove out of vocabulary words
    doc = [word for word in doc.split() if word in word2vec_model.wv.index_to_key]
    return np.mean(word2vec_model.wv[doc], axis=0)
    

In [None]:
# Now pass the first review to get its vector in the array format as shown in the below output
document_vector(df['review'][0])

array([ 0.13929889,  0.17233326, -0.133492  , -0.05404548,  0.03774758,
       -0.65360475,  0.5364431 ,  0.26926064, -0.48987284, -0.5445647 ,
        0.05837487, -0.69777274,  0.11599866, -0.11325346, -0.18278979,
        0.00988584,  0.38939783,  0.540873  ,  0.16590713,  0.06126416,
        0.14330137,  0.41959143, -0.18710737, -0.28329048, -0.19707759,
        0.06969315, -0.36779347, -0.2956169 , -0.26134822, -0.04685388,
       -0.60933554, -0.06679299,  0.0898404 , -0.20628928, -0.07764117,
        0.6506732 ,  0.15472746, -0.10624271, -0.1000631 , -0.72793967,
       -0.41491625, -0.2406234 ,  0.12463425,  0.02922337,  0.4908266 ,
        0.25863498, -0.15514606, -0.26007384, -0.01079391, -0.08371252,
       -0.22362141, -0.22352767,  0.1303302 , -0.292728  ,  0.5181693 ,
       -0.11247294,  0.5243303 , -0.39340433, -0.24410631,  0.4931332 ,
        0.24474755,  0.07263039,  0.36889827,  0.16103733, -0.36665705,
        0.23567599,  0.15163715,  0.5622477 , -0.01515205,  0.41

In [None]:
# Now we can send the total reviews to get their vectors
X = []
for doc in df['review']:
    X.append(document_vector(doc))

In [None]:
X = np.array(X)
X.shape

In [None]:
# Now label the y to the numerical values
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

y = encoder.fit_transform(df['sentiment'])
y

In [None]:
# Now train-test split the data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [None]:
# apply the algorithm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)