# Classify movie reviews into positive or negative

Dataset: https://www.kaggle.com/utathya/imdb-review-dataset/version/1

In [3]:
import pandas as pd

# Read the IMDB master file; its first column is used as the row index and the
# file is decoded as Latin-1.
df = pd.read_csv(
    '../data/imdb-review-dataset/imdb_master.csv',
    encoding='latin_1',
    index_col=0,
)

# Keep only the labelled rows ('unsup' marks the unlabelled reviews).
df1 = df[df['label'] != 'unsup']

# How many reviews carry each remaining label?
df1['label'].value_counts()

pos    25000
neg    25000
Name: label, dtype: int64

In [4]:
# The dataset ships with its own split, recorded in the 'type' column.
df_train = df1[df1['type'] == 'train']
df_test = df1[df1['type'] == 'test']

In [5]:
# Shrink both splits to 2000 randomly drawn reviews each; the fixed seed makes
# the draw repeatable.
df_train, df_test = (
    split.sample(n=2000, random_state=42) for split in (df_train, df_test)
)

In [45]:
# Requires: conda install nltk
import nltk
# Fetch the NLTK resources used further down: 'punkt' for word_tokenize,
# 'wordnet' for WordNetLemmatizer and 'stopwords' for nltk.corpus.stopwords.
# The last call is the final expression of the cell, so its return value is
# what the cell displays.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\issohl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\issohl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\issohl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
from nltk import word_tokenize

# Tokenize one short sentence; the trailing full stop comes back as its own token.
text = 'Hello this is a test.'
tokens = word_tokenize(text)
tokens

['Hello', 'this', 'is', 'a', 'test', '.']

In [38]:
from nltk.stem import WordNetLemmatizer

# Lemmatize every token of a sample sentence with the WordNet lemmatizer.
text = 'he liked cats and dogs, and teaching machines to learn'
lm = WordNetLemmatizer()

lemmas = list(map(lm.lemmatize, word_tokenize(text)))
print(lemmas)

['he', 'liked', 'cat', 'and', 'dog', ',', 'and', 'teaching', 'machine', 'to', 'learn']


In [41]:
from nltk.stem import SnowballStemmer

# Same sentence as the lemmatizer demo, this time reduced with a Snowball stemmer.
text = 'he liked cats and dogs, and teaching machines to learn'
stem = SnowballStemmer(language='english')

stems = list(map(stem.stem, word_tokenize(text)))
print(stems)

['he', 'like', 'cat', 'and', 'dog', ',', 'and', 'teach', 'machin', 'to', 'learn']


In [48]:
from nltk.corpus import stopwords
# English stopwords as a set, so each membership test is a hash lookup.
stop = set(stopwords.words('english'))

text = 'he liked cats and dogs, and teaching machines to learn'

# Keep only the tokens that are not in the stopword set.
kept = [token for token in word_tokenize(text) if not (token in stop)]
print(kept)

['liked', 'cats', 'dogs', ',', 'teaching', 'machines', 'learn']


In [49]:
# Dump the whole stopword set built in the previous cell (a set, so the printed
# order is arbitrary).
print(stop)

{'very', 'itself', 'does', 'nor', 'as', 'had', 'not', 'ours', "shan't", 'out', 'yourself', "hadn't", "hasn't", 'him', 'ma', 'over', 'each', 'is', "that'll", 'she', 'to', "she's", 'but', 'should', 'shouldn', 'needn', 'when', 'those', "weren't", 'don', 'didn', 's', 'if', 'did', 'into', 'more', 'no', 'it', 'doing', "didn't", 'these', 'just', 'then', 'what', 'a', 'ain', 'now', 've', "mightn't", 'his', 'them', 'up', 'he', 'was', 'won', "won't", 'such', 'wasn', 'were', 'theirs', 'or', 'from', 'yours', "needn't", 'few', 'once', 'd', 'can', 'during', 'they', 'own', 'will', "haven't", "isn't", 'there', 'some', 'y', 'at', 'on', "don't", 'we', "you'd", 'against', 'both', 'aren', 'shan', 're', 'himself', 'be', 'have', 'being', 'hadn', 'any', "wouldn't", 'of', 'under', 'why', 'which', 'after', 'has', 'between', 'again', 'further', 'me', 'do', 'all', 'you', 'and', 'same', 'so', 'than', "you've", 'down', 'weren', 'an', 'most', 'couldn', 'o', 'are', "wasn't", 'who', 'because', 'her', 'before', 'wouldn

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Toy corpus used to illustrate bag-of-words counting.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Learn the vocabulary and count how often each term occurs in each document.
cv = CountVectorizer()
result = cv.fit_transform(corpus)

# scikit-learn replaced get_feature_names() with get_feature_names_out();
# use whichever this installation provides so the cell runs on old and new
# versions alike.
if hasattr(cv, 'get_feature_names_out'):
    count_terms = cv.get_feature_names_out()
else:
    count_terms = cv.get_feature_names()

# Keep the demo table under its own name. Rebinding `df` here would replace the
# IMDB frame loaded at the top of the notebook, which a later cell still
# queries for the 'unsup' rows. toarray() yields a plain ndarray instead of the
# np.matrix that todense() returns.
df_counts = pd.DataFrame(result.toarray(), columns=count_terms)

# One row per document: term counts side by side with the original text.
pd.concat(
    [df_counts, pd.DataFrame(corpus, index=df_counts.index, columns=['text'])],
    axis=1,
)

Unnamed: 0,and,document,first,is,one,second,the,third,this,text
0,0,1,1,1,0,0,1,0,1,This is the first document.
1,0,2,0,1,0,1,1,0,1,This document is the second document.
2,1,0,0,1,1,0,1,1,1,And this is the third one.
3,0,1,1,1,0,0,1,0,1,Is this the first document?


In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Same toy corpus as the count-based demo, now weighted with TF-IDF.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Learn the vocabulary and IDF weights, then produce the TF-IDF matrix.
tv = TfidfVectorizer()
result = tv.fit_transform(corpus)

# Read the column names from `tv`, the vectorizer fitted in this cell. The
# previous version asked `cv`, which only exists if the CountVectorizer cell
# ran first. get_feature_names() was replaced by get_feature_names_out() in
# newer scikit-learn, so use whichever is available.
if hasattr(tv, 'get_feature_names_out'):
    tfidf_terms = tv.get_feature_names_out()
else:
    tfidf_terms = tv.get_feature_names()

# Keep the demo table under its own name so the IMDB frame bound to `df` at the
# top of the notebook stays intact for the cells that still use it.
df_tfidf = pd.DataFrame(result.toarray(), columns=tfidf_terms)

# One row per document: TF-IDF weights side by side with the original text.
pd.concat(
    [df_tfidf, pd.DataFrame(corpus, index=df_tfidf.index, columns=['text'])],
    axis=1,
)

Unnamed: 0,and,document,first,is,one,second,the,third,this,text
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085,This is the first document.
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089,This document is the second document.
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104,And this is the third one.
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085,Is this the first document?


In [46]:
from nltk import word_tokenize # tokenization
from nltk.stem import WordNetLemmatizer # lemmatizes
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer # computes tfidf

class LemmaTokenizer(object):
    """Callable tokenizer: split a document into words, filter, lemmatize.

    An instance is passed as the ``tokenizer`` argument of ``TfidfVectorizer``
    in this notebook, which calls it once per document.
    """

    def __init__(self):
        # Lemmatizer used on every token that survives filtering.
        self.wnl = WordNetLemmatizer()
        # Build the stopword set once; it is consulted for every token.
        self.stop = set(stopwords.words('english'))

    def __call__(self, doc):
        """Return the lemmas of the alphabetic, non-stopword tokens in ``doc``.

        Args:
            doc: The text of one document.

        Returns:
            A list of lemmatized tokens, in their original order.
        """
        # tokenize text into tokens
        tokens = word_tokenize(doc)

        # Single pass over the tokens:
        #  * str.isalpha() drops punctuation and any token containing digits
        #    or symbols;
        #  * the stopword test compares the lower-cased token, because the
        #    stopword list is lower-case and "The" / "And" would otherwise slip
        #    through when the tokenizer is called on text that has not been
        #    lower-cased first;
        #  * whatever remains is lemmatized.
        return [
            self.wnl.lemmatize(t)
            for t in tokens
            if t.isalpha() and t.lower() not in self.stop
        ]


# Word-level TF-IDF, tokenized by the custom lemmatizing tokenizer.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=LemmaTokenizer(),
)

# Learn vocabulary and IDF weights from the training reviews and vectorize them.
V_train = tfidf.fit_transform(df_train['review'])
V_train

<2000x21645 sparse matrix of type '<class 'numpy.float64'>'
	with 184219 stored elements in Compressed Sparse Row format>

In [10]:
# Training targets: the 'label' column of the sampled training reviews.
y_train = df_train['label']

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# visualize only the first n rows
n = 300

# Fixed seed so the embedding is the same on every run. Densify with toarray()
# (a plain ndarray): todense() returns np.matrix, which recent scikit-learn
# versions refuse as estimator input.
tsne = TSNE(n_components=2, random_state=42)
Z_train_2d = tsne.fit_transform(V_train[:n].toarray())

# Keep the label subset under its own name. Overwriting y_train with only n
# rows would leave it out of step with the full training matrix used by the
# model-fitting cells. `.values` gives a plain array for boolean masking.
y_train_vis = df_train.label[:n].values

fig, ax = plt.subplots()

labels = ['pos', 'neg']

# One scatter call per class so each gets its own colour and legend entry.
for l in labels:
    mask = y_train_vis == l
    ax.scatter(Z_train_2d[mask, 0],
               Z_train_2d[mask, 1],
               label=l, alpha=.2) # alpha = transparency

ax.set_title('t-SNE of TF-IDF vectors (first %d training reviews)' % n)
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
ax.legend()
plt.show()

```
Stage 1: Setup
scaler = StandardScaler()
scaler.fit(X_train) # setup scaler with X_train's mean, std

Stage 2: Apply
scaler.transform(X_train) # scaler is unchanged, just performs transformation
scaler.transform(X_test) # scaler is unchanged, just performs transformation
scaler.transform(new_data) # scaler is unchanged, just performs transformation
...

Stage 1: Setup
tfidf = TfidfVectorizer(..)
tfidf.fit(X_train) # setup tfidf with X_train's terms, documents

Stage 2: Apply
tfidf.transform(X_train) # tfidf is unchanged, just performs transformation
tfidf.transform(X_test) # tfidf is unchanged, just performs transformation
tfidf.transform(new_text) # tfidf is unchanged, just performs transformation
```


In [20]:
# transform V_test from text to vectors

# Vectorize the test reviews with the vocabulary / IDF weights learned from the
# training set (transform only, the vectorizer is not refitted).
V_test = tfidf.transform(df_test['review'])

# Targets for both splits, taken from the sampled frames.
y_train, y_test = df_train['label'], df_test['label']

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Scale the TF-IDF features. with_mean=False skips centring, so only a
# per-feature scaling is applied to the sparse matrices.
scaler = StandardScaler(with_mean=False)
V_train_sc = scaler.fit_transform(V_train)  # statistics come from the training set only
V_test_sc = scaler.transform(V_test)

# Linear model trained with stochastic gradient descent; report test accuracy.
sgd = SGDClassifier(random_state=42)
sgd.fit(V_train_sc, y_train)
sgd_accuracy = sgd.score(V_test_sc, y_test)
print(sgd_accuracy)

# Support vector classifier: grid-search C and gamma over the same log-spaced
# candidates, then report the winning combination and its test accuracy.
Cs = [.0001, .001, .01, .1, 1, 10]
gammas = [.0001, .001, .01, .1, 1, 10]
svc_grid = dict(C=Cs, gamma=gammas)
gs_svc = GridSearchCV(SVC(random_state=42), param_grid=svc_grid)
gs_svc.fit(V_train_sc, y_train)
print(gs_svc.best_params_)
svc_accuracy = gs_svc.score(V_test_sc, y_test)
print(svc_accuracy)



0.6495
{'C': 10, 'gamma': 0.0001}
0.6325


In [25]:
# metrics
from sklearn.metrics import classification_report, confusion_matrix

# Predict once per model, then print the same two reports for each of them.
pred_sgd = sgd.predict(V_test_sc)
pred_svc = gs_svc.predict(V_test_sc)

for title, pred in (('SGD', pred_sgd), ('SVC', pred_svc)):
    print(title)
    print(classification_report(y_test, pred))
    print(confusion_matrix(y_test, pred))

SGD
             precision    recall  f1-score   support

        neg       0.67      0.65      0.66      1040
        pos       0.63      0.65      0.64       960

avg / total       0.65      0.65      0.65      2000

[[677 363]
 [338 622]]
SVC
             precision    recall  f1-score   support

        neg       0.59      0.97      0.73      1040
        pos       0.88      0.27      0.42       960

avg / total       0.73      0.63      0.58      2000

[[1004   36]
 [ 699  261]]


In [26]:
# Unlabelled reviews: rows of the full IMDB frame whose label is 'unsup'.
# NOTE: this needs `df` to still be the frame loaded from the CSV.
df_unsup = df[df['label'] == 'unsup']

# Take three reviews (positions 1 to 3 of the unlabelled subset).
test = df_unsup['review'].iloc[1:4]
print(test)

# Same pipeline as for the training data: TF-IDF first ...
V_test_doc = tfidf.transform(test)

# ... then the fitted scaler.
V_test_doc_sc = scaler.transform(V_test_doc)

# Predictions of both classifiers for the three reviews.
for model_name, clf in (('SGD', sgd), ('SVC', gs_svc)):
    print(model_name, clf.predict(V_test_doc_sc))

50001    Take a low budget, inexperienced actors doubli...
50002    Everybody has seen 'Back To The Future,' right...
50003    Doris Day was an icon of beauty in singing and...
Name: review, dtype: object
SGD ['neg' 'neg' 'neg']
SVC ['neg' 'neg' 'pos']


In [27]:
# Full text of the third selected review (the one the two models disagree on).
test.iloc[2]

'Doris Day was an icon of beauty in singing and acting by her warm voice and genius acting in different movies obtained this film by her legend songs as (Iwill never stop loving you) with soft melody and warm lyrics by magic voice of Day.<br /><br />James Cagney was a villain of Hollywood by shark eyes and voice to send for audience the core of badness and evil characters as his profile in cinema.The producer choose previously Ava Gardener to be the hero of this film in-front Cagney but Cagney refused this choose because he said that Gardner not familiar with his role and he cheesed Day in it because of her fantastic abilities between singing and acting and she succeeded in it with Cagney.'

In [None]:
# Example
# doc = df_train.iloc[0].review

# tokens = word_tokenize(doc) # needs nltk.download('punkt')
# print(tokens[:20])

# lem = WordNetLemmatizer() # needs nltk.download('wordnet')

# lemmatized = [lem.lemmatize(t) for t in tokens]
# print(lemmatized[:20])

In [18]:
# Keras embedding layer
# https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks

# Without GPU: conda install keras
# With GPU: conda install keras-gpu

In [33]:
# (rows, columns) of the scaled training matrix, i.e. (documents, vocabulary terms).
V_train_sc.shape

(2000, 21460)

In [65]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense, Conv1D, MaxPool1D

# Cap on the number of TF-IDF features; also reused as the Embedding size below.
n_vocab = 5000

# NOTE(review): this rebinds `tfidf` (and `scaler` further down), replacing the
# objects the SGD/SVC cells were fitted with. Those models can no longer be fed
# through `tfidf`/`scaler` after this cell has run.
# NOTE(review): LemmaTokenizer already removes stopwords; stop_words='english'
# adds scikit-learn's own list on top - confirm that both are intended.
tfidf = TfidfVectorizer(
    stop_words='english',
    analyzer='word',
    tokenizer=LemmaTokenizer(),
    max_features=n_vocab,
)

# Learn the vocabulary from the training reviews, then vectorize both splits.
tfidf.fit(df_train.review)
V_train_keras, V_test_keras = (
    tfidf.transform(frame.review) for frame in (df_train, df_test)
)

# Scale without centring; the statistics come from the training matrix only.
scaler = StandardScaler(with_mean=False)
scaler.fit(V_train_keras)
V_train_keras_sc, V_test_keras_sc = (
    scaler.transform(matrix) for matrix in (V_train_keras, V_test_keras)
)

# Encode the string labels as 1 (pos) / 0 (neg) for the sigmoid output.
label_to_int = {'pos': 1, 'neg': 0}
y_train_keras = y_train.map(label_to_int)
y_test_keras = y_test.map(label_to_int)

In [68]:
# Size of the dense vector learned for each embedding index.
embedding_vector_length = 50

# NOTE(review): Embedding looks up integer indices, yet the training cell feeds
# this model scaled TF-IDF weights (one float per vocabulary term). That is a
# likely reason for the near-chance accuracy and almost constant predictions
# seen below - confirm, and consider feeding padded token-id sequences instead.
model = Sequential([
    # n_vocab inputs per sample, each mapped to a 50-dimensional vector.
    Embedding(n_vocab, embedding_vector_length, input_length=n_vocab),
    # 100 convolution filters over windows of 3 consecutive positions.
    Conv1D(100, kernel_size=3, activation='relu', padding='valid'),
    # Default pooling halves the sequence length.
    MaxPool1D(),
    # Recurrent summary of the pooled sequence.
    GRU(35),
    # Single sigmoid unit: probability of the positive class.
    Dense(1, activation='sigmoid'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 5000, 50)          250000    
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 4998, 100)         15100     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2499, 100)         0         
_________________________________________________________________
gru_3 (GRU)                  (None, 35)                14280     
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 36        
Total params: 279,416
Trainable params: 279,416
Non-trainable params: 0
_________________________________________________________________


In [76]:
from keras.callbacks import EarlyStopping, TensorBoard
import time

# Binary cross-entropy matches the single sigmoid output; track accuracy too.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Stop once the monitored metric has not improved for 2 consecutive epochs, and
# log every run into its own timestamped TensorBoard directory.
earlystopping = EarlyStopping(patience=2)
run_log_dir = './logs/text_gru/%d' % time.time()
tensorboard = TensorBoard(log_dir=run_log_dir, write_graph=False)

# Train on the first n rows only, validating on the first n test rows.
n = 1000
batch_size = 16
model.fit(
    V_train_keras_sc[:n],
    y_train_keras[:n],
    validation_data=(V_test_keras_sc[:n], y_test_keras[:n]),
    epochs=20,
    batch_size=batch_size,
    callbacks=[earlystopping, tensorboard],
)

Train on 1000 samples, validate on 1000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.callbacks.History at 0x23714f7fdd8>

In [49]:
# Loss and accuracy on the full scaled test matrix (training validated on the
# first 1000 rows only). Returns [loss, accuracy] as configured in compile().
model.evaluate(V_test_keras_sc, y_test_keras)



[0.69238569688797, 0.52]

In [50]:
# Take nine unlabelled reviews (positions 1 to 9 of the unlabelled subset).
test = df_unsup['review'].iloc[1:10]
print(test)

# Vectorize with the TF-IDF vectorizer currently bound to `tfidf` ...
V_test_doc = tfidf.transform(test)

# ... and scale with the scaler currently bound to `scaler`.
V_test_doc_sc = scaler.transform(V_test_doc)

# Sigmoid outputs of the Keras model, one value per review.
keras_scores = model.predict(V_test_doc_sc)
print('Keras Embedding layer', keras_scores)

50001    Take a low budget, inexperienced actors doubli...
50002    Everybody has seen 'Back To The Future,' right...
50003    Doris Day was an icon of beauty in singing and...
50004    After a series of silly, fun-loving movies, 19...
50005    This isn't exactly a musical, but it almost se...
50006    After seven years and seventeen pictures at Wa...
50007    In the 1950's there were many film boigraphies...
50008    MY RATING- 7.3<br /><br />This one is a curiou...
50009    Doris Day and James Cagney are excellent in th...
Name: review, dtype: object
Keras Embedding layer [[0.48327655]
 [0.48329026]
 [0.48327655]
 [0.4832878 ]
 [0.4832897 ]
 [0.48354024]
 [0.48327655]
 [0.48327655]
 [0.48327655]]
