# Sentiment Analysis

In [1]:
import pandas as pd
import re 
import numpy as np
import spacy
import time

from numpy import asarray
from numpy import zeros
from nltk.corpus import stopwords
from nltk import word_tokenize

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.recurrent import SimpleRNN, LSTM
from keras.layers import Dense, Flatten, Embedding, Masking, Bidirectional
from keras_self_attention import SeqSelfAttention

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

from gensim.models import Word2Vec



In [2]:
# Taken from Spring 2021 Class Notes
from typing import List
def integer_encode_documents(docs: List[str], tokenizer: Tokenizer)-> List[List[int]]:
    documents = []
    for d in docs:
        doc_integers = []
        for i in text_to_word_sequence(d):
            doc_integers.append(tokenizer.word_index[i])
        documents.append(doc_integers)
    return documents

def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

In [3]:
def get_max_token_length_per_doc(docs: List[List[str]])-> int:
    return max(list(map(lambda x: len(x.split()), docs)))

In [4]:
reviews_df = pd.read_csv('IMDB Dataset.csv')

In [5]:
reviews_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
# convert sentiment into binary labels
reviews_df['label'] = pd.get_dummies(reviews_df["sentiment"])['positive']
reviews_df.head()

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [7]:
# check class balance
reviews_df['label'].mean()

0.5

In [8]:
# Function that removes characters
def replace(replacements):
    for replace in replacements: 
        reviews_df['review'] = reviews_df['review'].str.replace(replace, '', case = False)

In [9]:
replace(['/><br', '<br', "\"", "\'", "/", '=',
         '<', '>', ',', '_', '\n', '\.', '-', '\n', '\(', '\)'])

In [10]:
potential_duplicates = 'abcdefghijklmnopqrstuvwxyz!?'

# Loops through the alphabet and replaces charaters that appear 3+ times in a row with one occurence
for i in potential_duplicates: 
    if i == '?':
        i = '\?'
    reviews_df['review'] = reviews_df.review.apply(lambda x: re.sub(i + i + i + '+', i, x))

In [11]:
# Preprocessing

# remove unnecessary characters/punctuations
reviews_df['review'] = reviews_df['review'].str.replace(r'(\n)|(\r)|(\t)|(<br>)|(<br />)|(<br/>)',"")

# group similar concepts
reviews_df['review'] = reviews_df['review'].str.replace(r'movie|film',"movie_token",case = False)

In [12]:
import en_core_web_md
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])

In [13]:
# Apply Lemmatization
reviews_df['review'] = reviews_df['review'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))

In [14]:
stop_words = list(stopwords.words('english'))

In [15]:
# Remove stop words
reviews_df['review'] = reviews_df['review'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop_words]))

In [16]:
# Tokenize text
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(reviews_df['review'])

In [17]:
# Integer encode and pad doc sequences
docs = integer_encode_documents(reviews_df['review'], tokenizer)
max_length = get_max_token_length_per_doc(reviews_df['review'])
padded_docs = pad_sequences(docs, maxlen=300, padding='post')

In [18]:
X_train, X_test, y_train, y_test = train_test_split(padded_docs, reviews_df['label'], test_size = 0.1)

## RNN & LSTM

### using self-trained embeddings

# Using Word2vec

In [19]:
from gensim.models import Word2Vec

# Fit a Word2Vec model to create customer embeddings
docs_w2v = [word_tokenize(single) for single in reviews_df['review']]
model_word2vec = Word2Vec(docs_w2v, vector_size=300, window=10, min_count=1, workers=4)
model_word2vec.wv.save_word2vec_format('imdb_embeddings.txt', binary=False)

In [20]:
VOCAB_SIZE = 10000

# Load the embedding vectors

def load_embedding_vectors():
    embeddings_index = {}
    with open('imdb_embeddings.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index
embeddings_index = load_embedding_vectors()

embedding_matrix_imdb = zeros((VOCAB_SIZE, 300))
for word, i in tokenizer.word_index.items():
    if i == 10000:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: # check that it is an actual word that we have embeddings for
        embedding_matrix_imdb[i] = embedding_vector

Loaded 179333 word vectors.


In [21]:
from keras.layers import Flatten, Masking

# Compile an LSTM model

model = Sequential()
model.add(Embedding(VOCAB_SIZE, 300, weights = [embedding_matrix_imdb], input_length = 300, trainable = False))
model.add(Masking(mask_value = 0.0))
model.add(LSTM(units=64, input_shape = (1, 300)))
model.add(Dense(16))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          3000000   
_________________________________________________________________
masking (Masking)            (None, 300, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                93440     
_________________________________________________________________
dense (Dense)                (None, 16)                1040      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 3,094,497
Trainable params: 94,497
Non-trainable params: 3,000,000
_________________________________________________________________


In [22]:
# fit the model
model.fit(X_train, y_train, epochs=5, verbose=1)

# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 89.700001


In [23]:
predictions = [i[0] for i in (model.predict(X_test) > 0.5).astype("int32")]
# Prints evaluation metrics
print('Accuracy: ', round(np.mean(predictions == y_test) * 100, 3))
print('ROC_AUC: ', round(roc_auc_score(y_test, predictions) * 100, 3))
print()
confusion_matrix(y_test, predictions)

Accuracy:  89.7
ROC_AUC:  89.707



array([[2175,  331],
       [ 184, 2310]])

In [24]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89      2506
           1       0.87      0.93      0.90      2494

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000



### Model Output

In [25]:
# Times how long it takes to score all documents
start = time.time()
all_predictions = model.predict(padded_docs)
end = time.time()
print(end - start)

40.683452129364014


In [26]:
original = pd.read_csv('IMDB Dataset.csv')

In [27]:
original['prediction'] = all_predictions

In [28]:
original['PredictedClass'] = np.where(reviews_df['prediction'] > 0.5, 1, 0)

KeyError: 'prediction'

In [None]:
original['label'] = pd.get_dummies(original["sentiment"])['positive']

In [None]:
def get_token_length_per_doc(docs: List[List[str]])-> int:
    return (list(map(lambda x: len(x.split()), docs)))

In [None]:
lengths = get_token_length_per_doc(reviews_df['review'])

In [None]:
import matplotlib.pyplot as plt

In [None]:
# View historgram of document lengths
plt.hist(lengths)
plt.title('Histogram of Document Lengths')
plt.xlabel('Document Length')

In [None]:
original.iloc[14]

In [None]:
original['review'][14]

In [None]:
original.iloc[10]

In [None]:
original['review'][10]