# IMDB Sentiment Analysis

Create DataFrame from `aclImdb` folder.

In [2]:
import pyprind
import pandas as pd
import os
import sys

basepath = 'aclImdb'

labels = {
    'pos': 1,
    'neg': 0
}

pbar = pyprind.ProgBar(50000, stream=sys.stdout)

reviews = []
sentiments = []

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            reviews.append(txt)
            sentiments.append(labels[l])
            pbar.update()

df = pd.DataFrame({'review': reviews, 'sentiment': sentiments})
df.columns = ['review', 'sentiment']

# Shuffle the DataFrame
# df = df.sample(frac=1, random_state=1).reset_index(drop=True)

print(df.shape)
print(df.head())

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:12
(50000, 2)
                                              review  sentiment
0  Based on an actual story, John Boorman shows t...          1
1  This is a gem. As a Film Four production - the...          1
2  I really like this show. It has drama, romance...          1
3  This is the best 3-D experience Disney has at ...          1
4  Of the Korean movies I've seen, only three had...          1


In [3]:
import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False)

In [4]:
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [5]:
df.shape

(50000, 2)

Bag-of-words

In [6]:
# import count vectorizer 

from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)
print(count.vocabulary_)
print(bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [9]:
# do the same with tfidf 

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(use_idf=True, norm='l2', smooth_idf=True)

np.set_printoptions(precision=2)
print(tfidf.fit_transform(docs).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [10]:
import re

def preprocessor(text):
    """
    Remove HTML tags and emoticons from a text.

    Parameters:
    text (str): The input text containing HTML tags and emoticons.

    Returns:
    str: The text with all HTML tags and emoticons removed.
    """
    # Remove HTML tags
    text = re.sub('<[^>]*>', '', text)
    
    # Remove emoticons
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    
    return text.strip()

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')


In [13]:
preprocessor('<a>hello</a> world :)')

'hello world :)'

In [14]:
df['review'] = df['review'].apply(preprocessor)

In [16]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values

X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [18]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

def tokenizer(text):
    return text.split()

stop = stopwords.words('english')

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
small_param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'clf__penalty': ['l2'],
        'clf__C': [1.0, 10.0]
    }, 
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer],
        'vect__use_idf': [False],
        'vect__norm': [None],
        'clf__penalty': ['l2'],
        'clf__C': [1.0, 10.0]
    }
]

[nltk_data] Downloading package stopwords to /Users/hieu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
# import logistic regression 
# iport grid serach
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(solver='liblinear'))
])

gs_lr_tfidf = GridSearchCV(lr_tfidf, small_param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits




In [20]:
# show best params
gs_lr_tfidf.best_params_

{'clf__C': 10.0,
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer(text)>}

In [23]:
# show train/test best score
print('Train score: ', gs_lr_tfidf.best_score_)

print('Test score: ', gs_lr_tfidf.score(X_test, y_test))

Train score:  0.8932442711457709
Test score:  0.90024


# Bigger data

In [24]:
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [ w for w in text.split() if w not in stop ]
    return tokenized

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
  text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')


In [25]:
tokenizer('hello world')

['hello', 'world']

In [26]:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, int(label)

In [32]:
next(stream_docs(path='movie_data.csv'))

('"My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of ""Nasaan ka man"" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!"',
 1)

In [33]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [39]:
# define hashing vectorizer
# import sgd
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)
clf = SGDClassifier(loss='log_loss', random_state=1)

doc_stream = stream_docs(path='movie_data.csv')
batch_size = 1000
pbar = pyprind.ProgBar(45, stream=sys.stdout)
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=batch_size)
    if not X_train: 
        break
    X_train = hv.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=np.unique(y_train))
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:20


In [40]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = hv.transform(X_test)
print('Test score: ', clf.score(X_test, y_test))

Test score:  0.8656


In [41]:
clf = clf.partial_fit(X_test, y_test)
print('Test score: ', clf.score(X_test, y_test))

Test score:  0.8832


# Topic modeling

In [42]:
import pandas as pd

df = pd.read_csv('movie_data.csv', encoding='utf-8')
df = df.rename(columns={'review': 'review_text', 'label': 'sentiment'})

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english', max_df=0.1, max_features=5000)
X = count.fit_transform(df.review_text.values)

In [45]:
# implement LiDA
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=10, learning_method='batch',random_state=123)
X_topics = lda.fit_transform(X)

In [46]:
X_topics

array([[1.48e-01, 5.49e-01, 3.03e-03, ..., 3.03e-03, 3.03e-03, 3.03e-03],
       [8.71e-01, 1.43e-02, 1.43e-02, ..., 1.43e-02, 1.43e-02, 1.43e-02],
       [1.58e-01, 3.30e-04, 3.35e-01, ..., 2.75e-01, 3.30e-04, 2.75e-02],
       ...,
       [2.79e-01, 9.90e-04, 9.90e-04, ..., 9.90e-04, 9.90e-04, 9.90e-04],
       [2.86e-03, 2.86e-03, 2.86e-03, ..., 2.86e-03, 2.80e-01, 2.86e-03],
       [3.03e-03, 7.17e-02, 6.89e-01, ..., 3.03e-03, 3.03e-03, 1.83e-01]])

In [47]:
lda.components_.shape

(10, 5000)

In [49]:
# print top 5 words for each topic
n_top_words = 5
feature_names = count.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic #%d:" % topic_idx)
    print(" ".join([feature_names[i]
                for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

Topic #0:
worst minutes awful script stupid

Topic #1:
family mother father girl children

Topic #2:
american dvd war music tv

Topic #3:
human audience cinema art feel

Topic #4:
police guy car dead murder

Topic #5:
horror house gore blood sex

Topic #6:
role performance comedy actor performances

Topic #7:
series episode war episodes season

Topic #8:
book version original effects read

Topic #9:
action fight guy guys cool

