In [0]:
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [0]:
URL_Tr ='https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/train.tsv'
URL_Te ='https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/test.tsv'

In [0]:
train = pd.read_csv(URL_Tr,sep='\t')
test = pd.read_csv(URL_Te,sep='\t')


In [238]:
train.head()
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [239]:
print(train.shape,"\n",test.shape)

(156060, 4) 
 (66292, 3)


In [240]:
print("\t",train.isnull().values.any(), "\n\t",
      test.isnull().values.any()
     )

	 False 
	 False


In [241]:
#sanitization
fullSent = train.loc[train.groupby('SentenceId')['PhraseId'].idxmin()]

fullSent.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
63,64,2,"This quiet , introspective and entertaining in...",4
81,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1
116,117,4,A positively thrilling combination of ethnogra...,3
156,157,5,Aggressive self-glorification and a manipulati...,1


In [242]:
print (len(train.groupby('SentenceId').nunique()),
      len(test.groupby('SentenceId').nunique())
      )

8529 3310


In [243]:
StopWords = ENGLISH_STOP_WORDS
print(StopWords)

frozenset({'my', 'beyond', 'ever', 'often', 'sometimes', 'amongst', 'inc', 'five', 'serious', 'the', 'thereby', 'no', 'thence', 'its', 'nobody', 'name', 'done', 'himself', 'interest', 'me', 'once', 'could', 'over', 'yet', 'ltd', 'during', 'being', 'would', 'go', 'by', 'herself', 'among', 'everything', 'hereafter', 'show', 'whoever', 'otherwise', 'become', 'all', 'also', 'off', 'yours', 'these', 'de', 'both', 'anyway', 'it', 'other', 'which', 'might', 'seemed', 'mine', 'he', 'myself', 'below', 'such', 'system', 'their', 'they', 'when', 'forty', 'con', 'therein', 'wherein', 'hers', 'however', 'why', 'many', 'nowhere', 'be', 'per', 'us', 'same', 're', 'couldnt', 'seem', 'take', 'out', 'though', 'throughout', 'something', 'upon', 'now', 'an', 'until', 'on', 'one', 'must', 'through', 'anywhere', 'hundred', 'where', 'next', 'six', 'as', 'is', 'more', 'therefore', 'cannot', 'fill', 'above', 'find', 'amoungst', 'bill', 'down', 'whenever', 'thus', 'from', 'anyone', 'third', 'too', 'whom', 'part

In [244]:
BOW_Vectorizer = CountVectorizer(strip_accents='unicode',
                                 stop_words=StopWords,
                                 ngram_range=(1,3),
                                 analyzer='word',
                                 min_df=5,
                                 max_df=0.5)

BOW_Vectorizer.fit(list(fullSent['Phrase']))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.5, max_features=None, min_df=5,
                ngram_range=(1, 3), preprocessor=None,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [245]:
#create tfidf vectorizer 
tfidf_vectorizer = TfidfVectorizer(min_df=5,
                                 max_df=5,
                                  analyzer='word',
                                  strip_accents='unicode',
                                  ngram_range=(1,3),
                                  sublinear_tf=True,
                                  smooth_idf=True,
                                  use_idf=True,
                                  stop_words=StopWords)

tfidf_vectorizer.fit(list(fullSent['Phrase']))


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=5, max_features=None,
                min_df=5, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents='unicode', sublinear_tf=True,
                token_

In [246]:
#tfid
#build train and test datasets
phrase = fullSent['Phrase']
sentiment = fullSent['Sentiment']
phrase[0], sentiment[0]

('A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
 1)

In [247]:
X_train,X_test,Y_train,Y_test = train_test_split(phrase,sentiment,test_size=0.2,random_state=4)

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((6823,), (6823,), (1706,), (1706,))

In [248]:
#calling both the methods
#method 1-BOW
train_bow=BOW_Vectorizer.transform(X_train)
test_bow=BOW_Vectorizer.transform(X_test)
train_bow.shape[1]


3548

In [249]:
bow_feature_vec = pd.DataFrame(train_bow.toarray(), columns = BOW_Vectorizer.get_feature_names())
bow_feature_vec.head(15)

bow_feature_vec_test = pd.DataFrame(test_bow.toarray(), columns = BOW_Vectorizer.get_feature_names())
bow_feature_vec_test.head(15)

Unnamed: 0,10,10 minutes,100,101,11,12,13,15,19,20,20 years,2002,20th,21st,30,50,51,60s,70s,80,90,90 minute,90 minutes,abandon,ability,able,absolutely,absorbing,abstract,absurd,absurdity,abuse,academy,accents,acceptable,accessible,accomplished,account,accurate,achievement,...,worth,worth look,worth price,worth seeing,worth watching,worthwhile,worthy,wow,wrapped,wrenching,writer,writer director,writers,writing,written,wrong,wrote,wry,xxx,ya,ya ya,yarn,year,year best,year old,yearning,years,years ago,yes,yiddish,york,young,young men,young woman,younger,youth,yu,zero,zhang,zone
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
from keras import backend as K
def recall_m(y_true, y_pred):
  true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
  recall = true_positives / (possible_positives + K.epsilon())
  return recall

def precision_m(y_true, y_pred):
  true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  predicted_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
  precision = true_positives / (predicted_positives + K.epsilon())
  return precision

def f1_m(y_true, y_pred):
  precision = precision_m(y_true, y_pred)
  recall = recall_m(y_true, y_pred)
  return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [0]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Flatten
from keras.layers import Activation, Conv1D, GlobalMaxPooling1D
from keras import optimizers

In [252]:
fea_vec_dim = bow_feature_vec.shape[1]
print(fea_vec_dim, n_class)

X_train = bow_feature_vec.values.reshape((bow_feature_vec.shape[0], bow_feature_vec.shape[1], 1))
X_train.shape



fea_vec_test_dim = bow_feature_vec_test.shape[1]
print(fea_vec_test_dim, n_class)

X_test = bow_feature_vec_test.values.reshape((bow_feature_vec_test.shape[0], bow_feature_vec_test.shape[1], 1))
X_test.shape



3548 5
3548 5


(1706, 3548, 1)

In [0]:
def baseline_cnn_model(fea_matrix, n_class, mode, compiler):
  #create model
  model = Sequential()
  model.add(Conv1D(filters=64, kernel_size = 3, activation = 'relu',
                  input_shape=(fea_matrix.shape[1], fea_matrix.shape[2])))
  model.add(MaxPooling1D(pool_size = 2))
  model.add(Conv1D(filters=128, kernel_size = 3, activation = 'relu'))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Flatten())
  model.add(Activation('relu'))
  model.add(Dense(n_class))
  if n_class==1 and mode == "cla":
    model.add(Activation('sigmoid'))
    # compile the model
    model.compile(optimizer=compiler, loss = 'binary_crossentropy',
                 metrics=['acc', f1_m, precision_m, recall_m])
  else:
    model.add(Activation('softmax'))  
    #comoile the model
    model.compile(optimizer=compiler, loss = 'sparse_categorical_crossentropy',
                 metrics=['acc', f1_m, precision_m, recall_m])
  return model
  

In [0]:
lr = 1e-3
batch_size = 128
num_epochs = 5
decay = 1e-4
mode = "reg"
n_class = 5 #5

adm = optimizers.Adam(lr = lr, decay = decay)
sgd = optimizers.SGD(lr = lr, nesterov = True, momentum = 0.7, decay = decay)
Nadam = optimizers.Nadam(lr = lr, beta_1=0.9, beta_2=0.999, epsilon = 1e-08)
model = baseline_cnn_model(X_train, n_class, mode, Nadam)

In [255]:
model.fit(X_train, Y_train, batch_size = batch_size, 
          epochs = num_epochs, verbose=1, validation_split = 0.2)

Train on 5458 samples, validate on 1365 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff3025c8ef0>

In [0]:
def print_metrics(accuracy, f1_score, precision, recall):
  print('SIMPLE CNN MODEL PERFORMANCE')
  print('Accuracy: ', np.round(accuracy, 4))
  print('Precision: ', np.round(precision, 4))
  print('Recall: ', np.round(recall, 4))
  print('F1 Score: ', np.round(f1_score, 4))
  print('\n')

In [257]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, Y_test)
print_metrics(accuracy, f1_score, precision, recall)

SIMPLE CNN MODEL PERFORMANCE
Accuracy:  0.3247
Precision:  1.464
Recall:  1.464
F1 Score:  1.464


