# ICP 11 - Sentiment Analysis
Keenan Flynn and Jasmine Thai

Import packages and dependencies

In [None]:
import pandas as pd
import nltk
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, GlobalMaxPool1D, MaxPooling1D, Conv1D, Flatten
from keras.losses import BinaryCrossentropy
import re
import string
import nltk
from nltk import word_tokenize
nltk.download('punkt')
import numpy as np
from nltk import WordNetLemmatizer  
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Pre-Processing

Some of the reviews had commas within the text of the review.
This read in the columns incorrectly so we added a quotechar parameter. We also dropped the unsupervised labels

In [None]:
df = pd.read_csv('imdb_master.csv',encoding='latin-1', quotechar='"')
df = df[df['label'] != 'unsup']

Get the columns as a numpy array

In [None]:
sentences = df['review'].values
y = df['label'].values

In [None]:
df

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt
...,...,...,...,...,...
49995,49995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt
49996,49996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt
49997,49997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt
49998,49998,train,A Christmas Together actually came before my t...,pos,99_8.txt


### Sentence Cleaning & Pre-Processing

Show what the sentence attribute looks like before pre-proccessing

In [None]:
sentences

array(["Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in.",
       "This is an example of why the majority of action films are the same. Generic and 

This loop removes any punctuation, changes any capitol letters to lowercase, and cleans the text so that it is a better input for the Embedding layer of the NN. We use the Regex library to do this.

In [None]:
for i in range(len(sentences)):
  sentences[i]=sentences[i].translate(str.maketrans("","",string.punctuation))
  sentences[i]=''.join(i for i in sentences[i] if not i.isdigit())
  sentences[i]=re.sub(r'\s+',' ',sentences[i],flags=re.I)
  sentences[i]=re.sub(r'[!@#$%^&*()_+|\}{;:/><.}]','',sentences[i],flags=re.I)
  sentences[i]=re.sub(r'\s+[a-zA-Z]\s+', ' ',sentences[i])
  sentences[i]=re.sub(r"[a-zA-Z]", lambda x :  x.group(0).lower(), sentences[i])
  sentences=df.iloc[:,2].values

Show the sentence attribute after Regex modifications

In [None]:
sentences

array(['once again mr costner has dragged out movie for far longer than necessary aside from the terrific sea rescue sequences of which there are very few just did not care about any of the characters most of us have ghosts in the closet and costners character are realized early on and then forgotten until much later by which time did not care the character we should really care about is very cocky overconfident ashton kutcher the problem is he comes off as kid who thinks hes better than anyone else around him and shows no signs of cluttered closet his only obstacle appears to be winning over costner finally when we are well past the half way point of this stinker costner tells us all about kutchers ghosts we are told why kutcher is driven to be the best with no prior inkling or foreshadowing no magic here it was all could do to keep from turning it off an hour in',
       'this is an example of why the majority of action films are the same generic and boring theres really nothing wort

We use nltk word_tokenize() to sepearte each review into a list of words.

In [None]:
sentences = np.array([word_tokenize(x) for x in sentences])

  """Entry point for launching an IPython kernel.


This is what the sentences attribute looks like after tokenization.

In [None]:
sentences

array([list(['once', 'again', 'mr', 'costner', 'has', 'dragged', 'out', 'movie', 'for', 'far', 'longer', 'than', 'necessary', 'aside', 'from', 'the', 'terrific', 'sea', 'rescue', 'sequences', 'of', 'which', 'there', 'are', 'very', 'few', 'just', 'did', 'not', 'care', 'about', 'any', 'of', 'the', 'characters', 'most', 'of', 'us', 'have', 'ghosts', 'in', 'the', 'closet', 'and', 'costners', 'character', 'are', 'realized', 'early', 'on', 'and', 'then', 'forgotten', 'until', 'much', 'later', 'by', 'which', 'time', 'did', 'not', 'care', 'the', 'character', 'we', 'should', 'really', 'care', 'about', 'is', 'very', 'cocky', 'overconfident', 'ashton', 'kutcher', 'the', 'problem', 'is', 'he', 'comes', 'off', 'as', 'kid', 'who', 'thinks', 'hes', 'better', 'than', 'anyone', 'else', 'around', 'him', 'and', 'shows', 'no', 'signs', 'of', 'cluttered', 'closet', 'his', 'only', 'obstacle', 'appears', 'to', 'be', 'winning', 'over', 'costner', 'finally', 'when', 'we', 'are', 'well', 'past', 'the', 'half', 

We can use a Word Net Lemmatizer to get the lemma of each word so that our feature space is reduced.

In [None]:
lemma = WordNetLemmatizer() 

for sent in sentences:
  for word in sent:
    lemma.lemmatize(word)

Sentences after Lemmatization

In [None]:
sentences

array([list(['once', 'again', 'mr', 'costner', 'has', 'dragged', 'out', 'movie', 'for', 'far', 'longer', 'than', 'necessary', 'aside', 'from', 'the', 'terrific', 'sea', 'rescue', 'sequences', 'of', 'which', 'there', 'are', 'very', 'few', 'just', 'did', 'not', 'care', 'about', 'any', 'of', 'the', 'characters', 'most', 'of', 'us', 'have', 'ghosts', 'in', 'the', 'closet', 'and', 'costners', 'character', 'are', 'realized', 'early', 'on', 'and', 'then', 'forgotten', 'until', 'much', 'later', 'by', 'which', 'time', 'did', 'not', 'care', 'the', 'character', 'we', 'should', 'really', 'care', 'about', 'is', 'very', 'cocky', 'overconfident', 'ashton', 'kutcher', 'the', 'problem', 'is', 'he', 'comes', 'off', 'as', 'kid', 'who', 'thinks', 'hes', 'better', 'than', 'anyone', 'else', 'around', 'him', 'and', 'shows', 'no', 'signs', 'of', 'cluttered', 'closet', 'his', 'only', 'obstacle', 'appears', 'to', 'be', 'winning', 'over', 'costner', 'finally', 'when', 'we', 'are', 'well', 'past', 'the', 'half', 

After Lemmatizing, we need to rejoin the tokenized words into a sentence

In [None]:
#Create a temp list to store the joined sentences
sent_list = []
for sent in sentences:
  sent_list.append(' '.join(sent))
#Overwrite sentences with the temp array
sentences = np.asarray(sent_list)

This is what the sentence attribute looks like after preprocessing

In [None]:
sentences

array(['once again mr costner has dragged out movie for far longer than necessary aside from the terrific sea rescue sequences of which there are very few just did not care about any of the characters most of us have ghosts in the closet and costners character are realized early on and then forgotten until much later by which time did not care the character we should really care about is very cocky overconfident ashton kutcher the problem is he comes off as kid who thinks hes better than anyone else around him and shows no signs of cluttered closet his only obstacle appears to be winning over costner finally when we are well past the half way point of this stinker costner tells us all about kutchers ghosts we are told why kutcher is driven to be the best with no prior inkling or foreshadowing no magic here it was all could do to keep from turning it off an hour in',
       'this is an example of why the majority of action films are the same generic and boring theres really nothing wort

### Pre-processing inputs before Neural Network

We can now preprocess our labels. Our labels are not enumerated so we can encode them with a Label Encoder so each label has a discrete numerical value (0 or 1).

In [None]:
le = LabelEncoder()
y = le.fit_transform(y)

The keras Tokenizer takes each sentence and transforms it into a numerical representation of that sentence

In [None]:
tokenizer = Tokenizer(num_words = 2000)
tokenizer.fit_on_texts(sentences)

We need to capture input lengths for the NN and further transform our input.

In [None]:
#Get a max length so we can define our NN input
max_review_len = max([len(s.split()) for s in sentences])

#vocab size is the number of distinct words in our sentence input. Ideally we want to minimize this variable (which we do through pre-processing)
vocab_size = len(tokenizer.word_index)+1

# Transforms each text in texts to a sequence of integers. 
sentences = tokenizer.texts_to_sequences(sentences)   

# Add extra characters to each sentence so they are uniform length
padded_docs = pad_sequences(sentences,maxlen=max_review_len)

Split our input into training and testing subsets

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, y, test_size=0.30, random_state=42)

###Callbacks for overfitting

Reduce Learning Rate callback

In [None]:
reduce_lr = keras.callbacks.ReduceLROnPlateau(
  monitor='val_loss', 
  factor=0.2,
  patience=5, 
  min_lr=0.001
)

Early Stopping callback

In [None]:
earlyStop = keras.callbacks.EarlyStopping(
  monitor='val_loss',
  patience=5, 
  verbose=1,
  mode='auto', 
  restore_best_weights=True
)

##Embedded Model

Accuracy: 86%

In [None]:
def embedded_model():
  model = Sequential()
  #added embedding layer
  model.add(Embedding(vocab_size, 50, input_length=max_review_len))
  model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Flatten())
  #model.add(GlobalMaxPool1D())
  model.add(Dense(300, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  #Compile model
  model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
  return model

In [None]:
model = embedded_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2370, 50)          8893950   
                                                                 
 conv1d (Conv1D)             (None, 2363, 32)          12832     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 1181, 32)         0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 37792)             0         
                                                                 
 dense (Dense)               (None, 300)               11337900  
                                                                 
 dense_1 (Dense)             (None, 1)                 301       
                                                        

In [None]:
history = model.fit(X_train,y_train, epochs=10, verbose=True, validation_data=(X_test,y_test), batch_size=256, callbacks=[reduce_lr, earlyStop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 8: early stopping


In [None]:
# Model evaluation
scores = model.evaluate(X_test, y_test, verbose=0)
print("Embedded Model Accuracy: %.2f%%" % (scores[1]*100))

Embedded Model Accuracy: 88.41%


##Baseline model

Accuracy: 50%

In [None]:
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(300, input_dim=max_review_len, kernel_initializer='normal', activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model 

In [None]:
model = baseline_model()
history = model.fit(X_train,y_train, epochs=10, verbose=True, validation_data=(X_test,y_test), batch_size=256,callbacks=[reduce_lr, earlyStop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 7: early stopping


In [None]:
# Model evaluation
scores = model.evaluate(X_test, y_test, verbose=0)
print("Base Model Accuracy: %.2f%%" % (scores[1]*100))

Base Model Accuracy: 50.39%


##Training Model on 20_newsgroups

Accuracy: 47%

In [None]:
#Import dataset
from sklearn.datasets import fetch_20newsgroups
df = fetch_20newsgroups(subset='all',
                             shuffle=False, remove=('headers', 'footers', 'quotes')) ##remove unneccessary information

In [None]:
sentences = df.data ##will undergo preprocessing to make our padded_doc for test train split
y = df.target ##df target is our labels 

In [None]:
#get the target shape
df.target.shape

(18846,)

In [None]:
#show sentences type, we'll have to turn this into a np.array later
print(type(sentences))

<class 'list'>


###Preprocessing

In [None]:
#Initialize a temp list
temp = []
for i in range(len(sentences)):
  #Iterate through the list and filter the strings
  sentences[i]=sentences[i].translate(str.maketrans("","",string.punctuation))
  sentences[i]=''.join(i for i in sentences[i] if not i.isdigit())
  sentences[i]=re.sub(r'\s+',' ',sentences[i],flags=re.I)
  sentences[i]=re.sub(r'[!@#$%^&*()_+|\}{;:/><.}]','',sentences[i],flags=re.I) ##remove punctuation characters
  sentences[i]=re.sub(r'\s+[a-zA-Z]\s+', ' ',sentences[i])
  sentences[i]=re.sub(r"[a-zA-Z]", lambda x :  x.group(0).lower(), sentences[i]) ##lower case words
  temp.append(sentences[i])
sentences = temp ##set sentences to be equal to temp

In [None]:
#word tokenize for each string in the dataset
sentences = np.array([word_tokenize(x) for x in sentences])

  


In [None]:
##Lemmatize each word via two for loops
lemma = WordNetLemmatizer() 

for sent in sentences:
  for word in sent:
    lemma.lemmatize(word)

In [None]:
#Create a temp list to store the joined sentences
sent_list = []
for sent in sentences:
  sent_list.append(' '.join(sent))
#Overwrite sentences with the temp array
sentences = np.asarray(sent_list)

In [None]:
#Encoding the target Column to be ints
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
#Updates internal vocabulary based on a list of texts.  
tokenizer = Tokenizer(num_words = 2000)
tokenizer.fit_on_texts(sentences) #This method creates the vocabulary index based on word frequency.

In [None]:
#Prepping data for embedding layer
max_news_len = max([len(s.split()) for s in sentences])
vocab_size = len(tokenizer.word_index)+1
X = tokenizer.texts_to_sequences(sentences) #Transforms each text in texts to a sequence of integers
padded_docs = pad_sequences(X, maxlen = max_news_len)

In [None]:
#Test train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, y, test_size=0.30, random_state=42)

###Embedded Model

Accuracy: 47%

In [None]:
#Create our model
def embedded_model(vocab_size, max_len):
  model = Sequential()
  #added embedding layer
  model.add(Embedding(vocab_size, 50, input_length=max_len)) 
  model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Flatten())
  #model.add(GlobalMaxPool1D())
  model.add(Dense(300, activation='relu'))
  model.add(Dense(20, activation='softmax'))
  #Compile model
  model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
  return model

In [None]:
#initialize model
model = embedded_model(vocab_size, max_news_len)
model.summary() #show summary

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 11092, 50)         6074650   
                                                                 
 conv1d (Conv1D)             (None, 11085, 32)         12832     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 5542, 32)         0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 177344)            0         
                                                                 
 dense (Dense)               (None, 300)               53203500  
                                                                 
 dense_1 (Dense)             (None, 20)                6020      
                                                        

In [None]:
#fit the model with reduce_lr and earlyStop callbacks
history = model.fit(X_train,y_train, epochs=10, verbose=True, validation_data=(X_test,y_test), batch_size=256, callbacks=[reduce_lr, earlyStop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Model evaluation
scores = model.evaluate(X_test, y_test, verbose=0)
print("Embedded Model Accuracy: %.2f%%" % (scores[1]*100))

Embedded Model Accuracy: 47.63%
