In [1]:
#IMPORTING REQUIRED LIBRARIES
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from keras.utils import to_categorical
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
from sklearn.model_selection import train_test_split
from keras import backend as K

Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
#IPORTING THE DATASET
dataset = pd.read_csv("https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/train.tsv", sep = '\t')
#PRINTING THE DATASET
dataset.head(7)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2


In [0]:
dataset = dataset.loc[:, 'Phrase':'Sentiment']
# dataset = dataset.drop(columns = ['PhraseId', 'SentenceId'])
documents = dataset.values.tolist()

In [0]:
#DATA PRE-PROCESSING FOR DATASET
porter = PorterStemmer()
lancaster = LancasterStemmer()
#REMOVAL OF STOP WORDS
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punctuations = '-\'?:!.,;\"()'

remove_stopwords = True
useStemming = False
useLemma = True
removePuncs = True

for l in range(len(documents)):
  label = documents[l][1]
  tempReview = []
  for w in documents[l][0].split(' '):
    newWord = w
    if remove_stopwords and (w in stop_words):
      continue
    if removePuncs and (w in punctuations):
      continue
    if useStemming:
      newWord = porter.stem(newWord)
    if useLemma:
      newWord = wordnet_lemmatizer.lemmatize(newWord)
    tempReview.append(newWord)
    
    documents[l] = (' '.join(tempReview), label)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = TfidfVectorizer(stop_words = 'english', ngram_range= (1,1), max_features=1500)

In [0]:
#SPLITTING THE DATASET FOR TESTING AND TRAINING
all_data = pd.DataFrame(documents, columns=['Phrase', 'Sentiment'])
x_train, x_test, y_train, y_test = train_test_split(all_data['Phrase'], all_data['Sentiment'], train_size = 0.7, shuffle = True, random_state = 2003)

In [0]:
X = vectorizer.fit_transform(all_data['Phrase'])
x_train = vectorizer.transform(x_train)
x_test = vectorizer.transform(x_test)

In [0]:
x_train_np = x_train.toarray()
y_train_np = to_categorical(y_train)
x_test_np = x_test.toarray()
y_test_np = to_categorical(y_test)
x_train = np.expand_dims(x_train_np, axis=2)
x_test = np.expand_dims(x_test_np, axis=2)

In [9]:
#BUILDING THE MODEL
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D

batch_size = 128
# inputs = inputs
# outputs =outputs
#create model
model = Sequential()
#DEFINING THE MODEL LAYERS
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu', input_shape=(x_train_np.shape[1],1)))
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu'))
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu'))
model.add(MaxPooling1D(pool_size =2))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(5, activation='softmax'))







In [10]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])





In [11]:
#TRAINING THE MODEL
#SET EPOCH RATE TO 5
model.fit(x_train, y_train_np, epochs = 5, batch_size = 128)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/5





Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f37ff4647b8>

In [0]:
model.save("1098167_1dconv_reg.h5")

In [0]:
from keras.models import load_model
model = load_model('1098167_1dconv_reg.h5')

In [14]:

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 1500, 128)         256       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1500, 128)         16512     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1500, 128)         16512     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 750, 128)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 96000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               9600100   
_________________________________________________________________
dense_2 (Dense)              (None, 5)                

In [15]:
#PRINTING THE EVALUATION METRICS RESULTS FOR THE MODEL
loss, acc = model.evaluate(x_test, y_test_np)
print('Loss: ', np.round(loss, 4))
print('Accuracy: ', np.round(acc, 4))


Loss:  1.0267
Accuracy:  0.604


In [16]:
r_pred = model.predict_classes(x_test, batch_size=128, verbose=0)
r_pred[1]
round_label=np.argmax(y_test_np, axis=1)
round_label[1]
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# precision tp / (tp + fp)
precision = precision_score(round_label, r_pred, average='weighted')
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(round_label, r_pred, average='weighted')
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(round_label, r_pred, average='weighted')
print('F1 score: %f' % f1)

Precision: 0.580497
Recall: 0.603998
F1 score: 0.575615
