In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Libraries for classical machine learning
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Libraries for deep learning
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten, Embedding, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint

# Libraries for topic modeling
from pprint import pprint
import gensim, spacy
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
from nltk.corpus import stopwords
import pyLDAvis.gensim

%matplotlib inline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
comment = pd.read_csv('./datasets/clean_train.csv')

In [3]:
comment.head()

Unnamed: 0.1,Unnamed: 0,username,rating,comment,comment_clean,number_of_words,target
0,132,IluvatarIrmo,7.0,beautiful miniatures. Gameplay is random due t...,beautiful miniature gameplay random due dice t...,9,1
1,1773,Rabid,9.0,"Freakin' awesome!\nSimple rules, smooth gamepl...",freakin awesome simple rule smooth gameplay ni...,24,1
2,2492,LouieSTFU,8.0,Expansions:\n[thing=174506][/thing],expansion thing thing,3,1
3,1962,terp8in,9.0,"After single play, I like this very much. How...",single play like much however game play need p...,25,1
4,1769,Quertzacoalt,6.0,"You probably heard that before, but this game ...",probably heard game broken player way easy sur...,18,0


In [4]:
comment.shape #13,259 documents

(13268, 7)

In [5]:
# baseline score
comment['target'].value_counts(normalize=True)

1    0.755803
0    0.244197
Name: target, dtype: float64

# Pre-Modeling

In [6]:
X = comment['comment_clean']
y = comment['target']

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
X_train.shape

(10614,)

In [9]:
X_val.shape

(2654,)

In [10]:
y_train.value_counts(normalize=True)

1    0.755794
0    0.244206
Name: target, dtype: float64

In [11]:
y_val.value_counts(normalize=True)

1    0.75584
0    0.24416
Name: target, dtype: float64

In [12]:
# Use count vectorizer to check how many unique words there are
cvec = CountVectorizer(stop_words='english') 
cvec_df = pd.DataFrame(cvec.fit_transform(X_train).todense(), columns=cvec.get_feature_names())
cvec_df.shape #17878 unique words

(10614, 17082)

In [13]:
# Write a function that takes in the actual y value and model predictions, 
# and prints out the confusion matrix and classification report
# Dataset: Validation or test set

def cmat(actual_y, predictions, dataset):
    
    # Create a classification report
    print('Classification report for', dataset)
    print(classification_report(actual_y, predictions))
    print('')
    
    # Create a confusion matrix
    cm = confusion_matrix(actual_y, predictions)
    cm_df = pd.DataFrame(cm, columns=['Predicted Positive Review','Predicted Negative Review'], index=['Actual Positive Review', 'Actual Negative Review'])
    print('Confusion matrix for', dataset)
    print(cm_df)

## count vectorizer & logistic regression

In [14]:
# Create a pipeline with Count Vectorizer and Logistic Regression
pipe_cvec_lr = Pipeline([
    ('cvec', CountVectorizer(stop_words='english')), 
    ('lr', LogisticRegression(random_state=42))
])

# Search over the following values of hyperparameters:
pipe_cvec_lr_params = {
    'cvec__max_features': [300], #100,200
    'cvec__min_df': [2,3], 
    'cvec__max_df': [.9,.95], 
#     'cvec__ngram_range':[(1,1),(1,2)],  
    'lr__penalty': ['l2'],
    'lr__C': [.01,.1]
}

# Instantiate GridSearchCV
gs_cvec_lr = GridSearchCV(pipe_cvec_lr, # Objects to optimise
                          param_grid = pipe_cvec_lr_params, # Hyperparameters for tuning
                          cv=10) # 10-fold cross validation

# Fit model on to training data
gs_cvec_lr.fit(X_train, y_train)

# Generate predictions on validation set
cvec_lr_pred = gs_cvec_lr.predict(X_val)

# Print best parameters
print('Best parameters: ', gs_cvec_lr.best_params_)

# Print accuracy scores
print('Best CV score: ', gs_cvec_lr.best_score_)
print('Training score:', gs_cvec_lr.score(X_train, y_train))
print('Validation score:', gs_cvec_lr.score(X_val, y_val))
print('')

# Print classification report and confusion matrix
cmat(y_val, cvec_lr_pred, 'validation set')

Best parameters:  {'cvec__max_df': 0.9, 'cvec__max_features': 300, 'cvec__min_df': 2, 'lr__C': 0.1, 'lr__penalty': 'l2'}
Best CV score:  0.7724734686922581
Training score: 0.7829282080271339
Validation score: 0.7709118311981914

Classification report for validation set
              precision    recall  f1-score   support

           0       0.62      0.16      0.25       648
           1       0.78      0.97      0.86      2006

    accuracy                           0.77      2654
   macro avg       0.70      0.56      0.56      2654
weighted avg       0.74      0.77      0.71      2654


Confusion matrix for validation set
                        Predicted Positive Review  Predicted Negative Review
Actual Positive Review                        101                        547
Actual Negative Review                         61                       1945


## tf-idf & logistic regression

In [15]:
# Create a pipeline with TF-IDF and Logistic Regression
pipe_tvec_lr = Pipeline([
    ('tvec', TfidfVectorizer(stop_words='english')),
    ('lr', LogisticRegression(random_state=42))
])

# Search over the following values of hyperparameters:
pipe_tvec_lr_params = {
    'tvec__max_features': [300], #100,200
    'tvec__min_df': [2,3], #2,3 
    'tvec__max_df': [.9,.95], 
#     'tvec__ngram_range':[(1,1),(1,2)],  
    'lr__penalty': ['l2'],
    'lr__C': [.1, 1] #.1, .01
}

# Instantiate GridSearchCV
gs_tvec_lr = GridSearchCV(pipe_tvec_lr, # Objects to optimise
                          param_grid = pipe_tvec_lr_params, # Hyperparameters for tuning
                          cv=10) # 10-fold cross validation

# Fit model on to training data
gs_tvec_lr.fit(X_train, y_train)

# Generate predictions on validation set
tvec_lr_pred = gs_tvec_lr.predict(X_val)

# Print best parameters
print('Best parameters: ', gs_tvec_lr.best_params_)

# Print accuracy scores
print('Best CV score: ', gs_tvec_lr.best_score_)
print('Training score:', gs_tvec_lr.score(X_train, y_train))
print('Validation score:', gs_tvec_lr.score(X_val, y_val))
print('')

# Print classification report and confusion matrix
cmat(y_val, tvec_lr_pred, 'validation set')

Best parameters:  {'lr__C': 1, 'lr__penalty': 'l2', 'tvec__max_df': 0.9, 'tvec__max_features': 300, 'tvec__min_df': 2}
Best CV score:  0.7755803695834687
Training score: 0.7860373092142453
Validation score: 0.7720422004521477

Classification report for validation set
              precision    recall  f1-score   support

           0       0.62      0.17      0.26       648
           1       0.78      0.97      0.87      2006

    accuracy                           0.77      2654
   macro avg       0.70      0.57      0.56      2654
weighted avg       0.74      0.77      0.72      2654


Confusion matrix for validation set
                        Predicted Positive Review  Predicted Negative Review
Actual Positive Review                        108                        540
Actual Negative Review                         65                       1941


## count vectorizer & naive bayes

In [16]:
# Create a pipeline with Count Vectorizer and Naive Bayes
pipe_cvec_nb = Pipeline([
    ('cvec', CountVectorizer(stop_words='english')),
    ('nb', MultinomialNB())
])

# Search over the following values of hyperparameters:
pipe_cvec_nb_params = {
    'cvec__max_features': [500], #200
    'cvec__min_df': [2,3],
    'cvec__max_df': [.9,.95], 
#     'cvec__ngram_range':[(1,1),(1,2)],  
}

# Instantiate GridSearchCV
gs_cvec_nb = GridSearchCV(pipe_cvec_nb, # Objects to optimise
                          param_grid = pipe_cvec_nb_params, # Hyperparameters for tuning
                          cv=10) # 10-fold cross validation

# Fit model on to training data
gs_cvec_nb.fit(X_train, y_train)

# Generate predictions on validation set
cvec_nb_pred = gs_cvec_nb.predict(X_val)

# Print best parameters
print('Best parameters: ', gs_cvec_nb.best_params_)

# Print accuracy scores
print('Best CV score: ', gs_cvec_nb.best_score_)
print('Training score:', gs_cvec_nb.score(X_train, y_train))
print('Validation score:', gs_cvec_nb.score(X_val, y_val))
print('')

# Print classification report and confusion matrix
cmat(y_val, cvec_nb_pred, 'validation set')

Best parameters:  {'cvec__max_df': 0.9, 'cvec__max_features': 500, 'cvec__min_df': 2}
Best CV score:  0.7689853938028829
Training score: 0.7836819295270397
Validation score: 0.7607385079125848

Classification report for validation set
              precision    recall  f1-score   support

           0       0.52      0.27      0.36       648
           1       0.80      0.92      0.85      2006

    accuracy                           0.76      2654
   macro avg       0.66      0.60      0.61      2654
weighted avg       0.73      0.76      0.73      2654


Confusion matrix for validation set
                        Predicted Positive Review  Predicted Negative Review
Actual Positive Review                        178                        470
Actual Negative Review                        165                       1841


## tf-idf & naive bayes

In [17]:
# Create a pipeline with TF-IDF and Naive Bayes
pipe_tvec_nb = Pipeline([
    ('tvec', TfidfVectorizer(stop_words='english')),
    ('nb', MultinomialNB())
])

# Search over the following values of hyperparameters:
pipe_tvec_nb_params = {
    'tvec__max_features': [500], #200
    'tvec__min_df': [2,3], #
    'tvec__max_df': [.9,.95], 
#     'tvec__ngram_range':[(1,1),(1,2)],  
}

# Instantiate GridSearchCV
gs_tvec_nb = GridSearchCV(pipe_tvec_nb, # Objects to optimise
                          param_grid = pipe_tvec_nb_params, # Hyperparameters for tuning
                          cv=10) # 10-fold cross validation

# Fit model on to training data
gs_tvec_nb.fit(X_train, y_train)

# Generate predictions on validation set
tvec_nb_pred = gs_tvec_nb.predict(X_val)

# Print best parameters
print('Best parameters: ', gs_tvec_nb.best_params_)

# Print accuracy scores
print('Best CV score: ', gs_tvec_nb.best_score_)
print('Training score:', gs_tvec_nb.score(X_train, y_train))
print('Validation score:', gs_tvec_nb.score(X_val, y_val))
print('')

# Print classification report and confusion matrix
cmat(y_val, tvec_nb_pred, 'validation set')

Best parameters:  {'tvec__max_df': 0.9, 'tvec__max_features': 500, 'tvec__min_df': 2}
Best CV score:  0.7679487247755111
Training score: 0.7704918032786885
Validation score: 0.7709118311981914

Classification report for validation set
              precision    recall  f1-score   support

           0       0.87      0.07      0.13       648
           1       0.77      1.00      0.87      2006

    accuracy                           0.77      2654
   macro avg       0.82      0.53      0.50      2654
weighted avg       0.79      0.77      0.69      2654


Confusion matrix for validation set
                        Predicted Positive Review  Predicted Negative Review
Actual Positive Review                         47                        601
Actual Negative Review                          7                       1999


## count vectorizer & svc

In [18]:
# Create a pipeline with Count Vectorizer and SVC
pipe_cvec_svc = Pipeline([
    ('cvec', CountVectorizer(stop_words='english')),
    ('svc', SVC(random_state=42))
])

# Search over the following values of hyperparameters:
pipe_cvec_svc_params = {
    'cvec__max_features': [300], #200,500
    'cvec__min_df': [2,3], 
    'cvec__max_df': [.9,.95], 
#     'cvec__ngram_range':[(1,1),(1,2)],  
    'svc__kernel': ['linear'], #'poly', 'rbf'
#     'svc__degree': [3],
    'svc__C': [.1]
}

# Instantiate GridSearchCV
gs_cvec_svc = GridSearchCV(pipe_cvec_svc, # Objects to optimise
                          param_grid = pipe_cvec_svc_params, # Hyperparameters for tuning
                          cv=10) # 10-fold cross validation

# Fit model on to training data
gs_cvec_svc.fit(X_train, y_train)

# Generate predictions on validation set
cvec_svc_pred = gs_cvec_svc.predict(X_val)

# Print best parameters
print('Best parameters: ', gs_cvec_svc.best_params_)

# Print accuracy scores
print('Best CV score: ', gs_cvec_svc.best_score_)
print('Training score:', gs_cvec_svc.score(X_train, y_train))
print('Validation score:', gs_cvec_svc.score(X_val, y_val))
print('')

# Print classification report and confusion matrix
cmat(y_val, cvec_svc_pred, 'validation set')

Best parameters:  {'cvec__max_df': 0.9, 'cvec__max_features': 300, 'cvec__min_df': 2, 'svc__C': 0.1, 'svc__kernel': 'linear'}
Best CV score:  0.7701146273192153
Training score: 0.7736951196532881
Validation score: 0.7656367746797287

Classification report for validation set
              precision    recall  f1-score   support

           0       0.63      0.10      0.17       648
           1       0.77      0.98      0.86      2006

    accuracy                           0.77      2654
   macro avg       0.70      0.54      0.52      2654
weighted avg       0.74      0.77      0.69      2654


Confusion matrix for validation set
                        Predicted Positive Review  Predicted Negative Review
Actual Positive Review                         64                        584
Actual Negative Review                         38                       1968


## tf-idf & svc

In [19]:
# Create a pipeline with TF-IDF Vectorizer and SVC
pipe_tvec_svc = Pipeline([
    ('tvec', TfidfVectorizer(stop_words='english')),
    ('svc', SVC(probability=True, random_state=42)) 
])

# Search over the following values of hyperparameters:
pipe_tvec_svc_params = {
    'tvec__max_features': [800], #200,500
    'tvec__min_df': [2,3], 
    'tvec__max_df': [.9,.95], 
#     'tvec__ngram_range':[(1,1),(1,2)],  
    'svc__kernel': ['linear'], #'poly', 'rbf'
#     'svc__degree': [3],
    'svc__C': [.1] # .01
}

# Instantiate GridSearchCV
gs_tvec_svc = GridSearchCV(pipe_tvec_svc, # Objects to optimise
                          param_grid = pipe_tvec_svc_params, # Hyperparameters for tuning
                          cv=10) # 10-fold cross validation

# Fit model on to training data
gs_tvec_svc.fit(X_train, y_train)

# Generate predictions on validation set
tvec_svc_pred = gs_tvec_svc.predict(X_val)

# Print best parameters
print('Best parameters: ', gs_tvec_svc.best_params_)

# Print accuracy scores
print('Best CV score: ', gs_tvec_svc.best_score_)
print('Training score:', gs_tvec_svc.score(X_train, y_train))
print('Validation score:', gs_tvec_svc.score(X_val, y_val))
print('')

# Print classification report and confusion matrix
cmat(y_val, tvec_svc_pred, 'validation set')

Best parameters:  {'svc__C': 0.1, 'svc__kernel': 'linear', 'tvec__max_df': 0.9, 'tvec__max_features': 800, 'tvec__min_df': 2}
Best CV score:  0.7622011178737325
Training score: 0.7631430186546071
Validation score: 0.7633760361718162

Classification report for validation set
              precision    recall  f1-score   support

           0       0.86      0.04      0.07       648
           1       0.76      1.00      0.86      2006

    accuracy                           0.76      2654
   macro avg       0.81      0.52      0.47      2654
weighted avg       0.79      0.76      0.67      2654


Confusion matrix for validation set
                        Predicted Positive Review  Predicted Negative Review
Actual Positive Review                         24                        624
Actual Negative Review                          4                       2002


## voting classifier

In [20]:
# Instantiate the Voting Classifier with TF-IDF Logistic Regression and SVC
voting_clf = VotingClassifier(
    estimators=[('tvec_lr', gs_tvec_lr),
                ('tvec_svc', gs_tvec_svc)], 
    voting='soft', 
    weights=[1,2]
)

# Fit model on to training data
voting_clf.fit(X_train, y_train)

# Generate predictions on validation set
voting_pred = voting_clf.predict(X_val)

# Print accuracy scores
print('Training score:', voting_clf.score(X_train, y_train))
print('Validation score:', voting_clf.score(X_val, y_val))
print('')

# Print classification report and confusion matrix
cmat(y_val, voting_pred, 'validation set')

Training score: 0.7964009798379499
Validation score: 0.7856066314996232

Classification report for validation set
              precision    recall  f1-score   support

           0       0.70      0.21      0.33       648
           1       0.79      0.97      0.87      2006

    accuracy                           0.79      2654
   macro avg       0.75      0.59      0.60      2654
weighted avg       0.77      0.79      0.74      2654


Confusion matrix for validation set
                        Predicted Positive Review  Predicted Negative Review
Actual Positive Review                        139                        509
Actual Negative Review                         60                       1946


## Deep Learning

In [21]:
X_train_list = X_train.values
X_val_list = X_val.values

y_train_list = y_train.values
y_val_list = y_val.values

In [22]:
max_words = 1000 # Note: Model tends to overfit when max_words is set to 2000-3000 words

# Tokenize the data
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(list(X_train_list))

# Convert the text to sequences
X_train_list = tokenizer.texts_to_sequences(X_train_list)
X_val_list = tokenizer.texts_to_sequences(X_val_list)

In [23]:
vocab_size = len(tokenizer.word_index)+1
print('Vocabulary size:', vocab_size)

Vocabulary size: 17298


In [24]:
max_length = 30 

X_train_padded = pad_sequences(X_train_list, maxlen=max_length, padding='post')
X_val_padded = pad_sequences(X_val_list, maxlen=max_length, padding='post')

In [25]:
# Instantiate model
model = Sequential()

# Add embedding layer, embed_dim:8
model.add(Embedding(vocab_size, 8, input_length=max_length)) 

# SpatialDropout1D performs variational dropout
model.add(SpatialDropout1D(0.2))

# Add LSTM
model.add(Bidirectional(LSTM(8, return_sequences=True, dropout=0.5, recurrent_dropout=0))) 

# Add a dense layer
model.add(Dense(8, activation='relu'))

# Add output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 8)             138384    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 30, 8)             0         
_________________________________________________________________
bidirectional (Bidirectional (None, 30, 16)            1088      
_________________________________________________________________
dense (Dense)                (None, 30, 8)             136       
_________________________________________________________________
dense_1 (Dense)              (None, 30, 1)             9         
Total params: 139,617
Trainable params: 139,617
Non-trainable params: 0
_________________________________________________________________


In [34]:
#Callback to save the models and weights

outputFolder = './GA Capstone CMON/output'
if not os.path.exists(outputFolder):
    os.makedirs(outputFolder)
filepath = outputFolder+"/model-{epoch:02d}-{val_accuracy:.2f}.hdf5"

checkpoint_callback = ModelCheckpoint(
    filepath, monitor='val_accuracy', verbose=1,
    save_best_only=False, save_weights_only=False,
    save_frequency=1)

In [32]:
%%time
# Fit the model and store it in the history object
history = model.fit(X_train_padded, y_train_list, epochs=10, batch_size=64, validation_data=(X_val_padded, y_val_list), callbacks=[checkpoint_callback])

ValueError: A target array with shape (10614, 1) was passed for an output of shape (None, 30, 1) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.

In [29]:
#Plot the training and validation loss by epoch

plt.figure(figsize=(10,8))
epochs = range(1,11)
plt.plot(epochs, history.history['loss'], label='Training loss')
plt.plot(epochs, history.history['val_loss'], label='Validation loss')
plt.title('Training and Validation Loss by Epoch', size=15, weight='bold')
plt.xlabel('Epochs', size=12)
plt.ylabel('Loss', size=12)
plt.xticks(np.arange(1,11,1))
plt.legend();

NameError: name 'history' is not defined

<Figure size 720x576 with 0 Axes>

In [None]:
# Plot the training and validation accuracy by epoch

plt.figure(figsize=(10,8))
epochs = range(1,11)
plt.plot(epochs, history.history['accuracy'], label='Training accuracy')
plt.plot(epochs, history.history['val_accuracy'], label='Validation accuracy')
plt.title('Training and Validation Accuracy by Epoch', size=15, weight='bold')
plt.xlabel('Epochs', size=12)
plt.ylabel('Accuracy', size=12)
plt.xticks(np.arange(1,11,1))
plt.legend();

## Loading a check-pointed model

In [None]:
# Load weights
model.load_weights('./output/model-08-0.80.hdf5')

In [None]:
# Compile the model
model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Generate predictions on validation set
class_preds = (model.predict(X_val_padded) > 0.5).astype(int)

In [None]:
val_preds = []

# Loop through the class prediction list and append the predictions to val_preds
for i in class_preds:
    val_preds.append(i[0][0])

# Convert val_preds to a series
val_preds = pd.Series(val_preds)

In [None]:
# Accuracy scores
print('Accuracy score on training set: ', model.evaluate(X_train_padded, y_train_list, verbose=0)[1])
print('Accuracy score on validation set: ', model.evaluate(X_val_padded, y_val_list, verbose=0)[1])
print('')

# Print classification report and confusion matrix for validation set
cmat(y_val, val_preds, 'validation set')