# Here we build our dictionary and train our model

We use the preprocessed tweets from candidates and parties as our labeled dataset.

In [None]:
# Libs
import csv
import string
import numpy as np
import time
import sys
import nltk
import random
import matplotlib.pyplot as plt
from joblib import dump, load

from tqdm import tqdm_notebook as tqdm

from nltk.tokenize import word_tokenize

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import libs.bag_of_worder as bag_of_worder
import libs.preprocessor as tweet_preproc

# Init Preprocessor
# NOTE(review): `twitterPreprocessor` is not referenced again in the visible
# part of this notebook - confirm whether it is still needed.
twitterPreprocessor = tweet_preproc.TwitterPreprocessor()

# Dataset

IMPORTANT: the label must take exactly two values, 0 (Democrat) and 1 (Republican).

In [None]:
def load_dataset(path):
    """Read tweets and their party labels from a CSV file.

    The file must start with a header row containing a ``label`` and a
    ``text`` column. Rows labelled ``"democrat"`` are encoded as 0, rows
    labelled ``"republican"`` as 1; rows with any other label are skipped.

    Returns a ``(texts, labels)`` pair of equally long lists.
    """
    label_codes = {"democrat": 0, "republican": 1}

    texts = []
    codes = []

    with open(path, 'r', newline='', encoding="utf-8") as csvfile:
        rows = csv.reader(csvfile, quotechar='"', delimiter=',')

        # The header row tells us where the useful columns are.
        columns = next(rows)
        label_col = columns.index('label')
        text_col = columns.index('text')

        for record in rows:
            code = label_codes.get(record[label_col])
            if code is None:
                # Unknown party label: ignore the whole row.
                continue

            codes.append(code)
            texts.append(record[text_col])

    assert len(texts) == len(codes)

    return texts, codes

In [None]:
def checkBalanced(labels):
    """Return the fraction of non-zero entries in ``labels``.

    With 0/1 labels this is the proportion of samples labelled 1.
    """
    positives = np.count_nonzero(labels)
    total = len(labels)
    return positives / total


def balance_dataset(features, labels):
    """Randomly undersample the majority class so both labels are equally frequent.

    The samples are shuffled, then just enough samples of the more frequent
    label are dropped for both labels to occur the same number of times.
    The inputs are left untouched; new lists are returned.

    Args:
        features: sequence of samples.
        labels: sequence of 0/1 labels, aligned with ``features``.

    Returns:
        A ``(X, y)`` pair of lists holding the kept samples and their labels,
        in shuffled order.

    Raises:
        ValueError: if ``features`` and ``labels`` differ in length.
    """
    if len(features) != len(labels):
        raise ValueError("features and labels must have the same length")

    # Combine the features with the labels and shuffle them together, so the
    # samples that get dropped below are picked at random.
    combined = list(zip(features, labels))
    random.shuffle(combined)

    # Count each class and work out how many majority samples must go.
    nbr_of_ones = sum(1 for _, label in combined if label != 0)
    nbr_of_zeros = len(combined) - nbr_of_ones
    excess = abs(nbr_of_ones - nbr_of_zeros)
    drop_ones = nbr_of_ones > nbr_of_zeros

    # Build fresh lists instead of popping while iterating: popping shifts the
    # remaining indices, which skips elements and can run past the list end.
    X = []
    y = []
    for feature, label in combined:
        if excess > 0 and (label != 0) == drop_ones:
            excess -= 1
            continue
        X.append(feature)
        y.append(label)

    return X, y

In [None]:
# Path of the labeled dataset
path = "data/parties_candidates/sources.csv"

# Load dataset from path
X, y = load_dataset(path)

# Make sure there is 50/50 of both labels
X, y = balance_dataset(X,y)

# Split the data: 10% is held out for validation, stratified on the label so
# both splits keep the same label proportions.
train_X, valid_X, train_Y, valid_Y = train_test_split(X, y, test_size=0.1, random_state=12, shuffle=True, stratify=y)

# The split returns plain Python lists here. Convert the targets to NumPy
# arrays: scikit-learn accepts either, but Keras' fit/evaluate expect
# array-like targets and newer releases reject a list of ints.
train_Y = np.asarray(train_Y)
valid_Y = np.asarray(valid_Y)

print("Length of training set : ", len(train_X))
print("Length of validation set : ", len(valid_X))

In [None]:
# Make sure the training data is balanced
# checkBalanced returns the fraction of non-zero labels; label 1 is
# "republican", so a value of 0.5 means the two parties are equally frequent.
print("--- Proportion of republican label ---")
print(checkBalanced(train_Y))

# Review Length in Words
Inspired by: https://machinelearningmastery.com/predict-sentiment-movie-reviews-using-deep-learning/

In [None]:
# Summarize number of classes
# Prints the distinct label values present in the training targets.
print("Classes: ")
print(np.unique(train_Y))

In [None]:
# Summarize number of words
# train_X holds raw tweet strings, so the distinct words are collected from
# the whitespace-separated tokens of every tweet. (np.unique over the strings
# themselves would count distinct tweets, not distinct words.)
print("Number of words: ")
print(len({word for tweet in train_X for word in tweet.split()}))

In [None]:
# Summarize review length
# Each sample is a raw string, so len(x) would be its character count; split
# on whitespace to get the number of words the message below reports.
print("Review length: ")
result = [len(x.split()) for x in train_X]
print("Mean %.2f words (%f)" % (np.mean(result), np.std(result)))
# plot review length
fig = plt.figure()
plt.boxplot(result)
plt.show()

## Training and testing models
Code copied from: https://stackabuse.com/python-for-nlp-movie-sentiment-analysis-using-deep-learning-in-keras/

### Transforming data

In [None]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the input
# dimension of the Embedding layers in the neural-network models below.
top_words = 5000

In [None]:
# Build the word index from the training texts only, then encode both the
# training and the validation texts with that same tokenizer.
tokenizer = Tokenizer(num_words=top_words)
tokenizer.fit_on_texts(train_X)

train_X_t = tokenizer.texts_to_sequences(train_X)
valid_X_t = tokenizer.texts_to_sequences(valid_X)

In [None]:
# Adding 1 because of reserved 0 index
# NOTE(review): vocab_size is only printed below; the models size their
# Embedding layers with top_words instead.
vocab_size = len(tokenizer.word_index) + 1

# Fixed sequence length fed to every model.
maxlen = 100

# padding='post' adds the padding at the end of sequences shorter than maxlen.
train_X_t = pad_sequences(train_X_t, padding='post', maxlen=maxlen)
valid_X_t = pad_sequences(valid_X_t, padding='post', maxlen=maxlen)

print('vocab_size: {}'.format(vocab_size))

In [None]:
# Display one encoded and padded training sample as a sanity check.
train_X_t[3]

Let's save our tokenizer.

In [None]:
# saving
# Persist the fitted tokenizer with joblib so the same word index can be
# loaded again later.
with open('model/tokenizer.joblib', 'wb') as handle:
    dump(tokenizer, handle) 

### Random Forest algorithm
Here, we don't need any normalization.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 200-tree random forest directly on the padded token-id sequences;
# random_state is fixed so the forest is reproducible.
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(train_X_t, train_Y)

In [None]:
# Predict the labels of the held-out validation samples.
predictions = text_classifier.predict(valid_X_t)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compare the predictions against the validation labels: confusion matrix,
# per-class report, then overall accuracy.
print(confusion_matrix(valid_Y, predictions))
print(classification_report(valid_Y, predictions))
print(accuracy_score(valid_Y, predictions))

In [None]:
# saving
# Persist the trained random forest with joblib.
with open('model/rand_forest.joblib', 'wb') as handle:
    dump(text_classifier, handle) 

### Logistic Regression algorithm

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Normalize data
# The scaler is fitted on the training sequences only and then applied
# unchanged to the validation sequences.
scaler = MinMaxScaler()
train_X_t_n = scaler.fit_transform(train_X_t)
valid_X_t_n = scaler.transform(valid_X_t)

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression with default hyper-parameters.
# NOTE(review): this fits on the unscaled train_X_t although the scaled
# train_X_t_n is computed just before this cell - confirm which one is
# intended (the prediction cell for this model also uses the unscaled data).
text_classifier = LogisticRegression()
text_classifier.fit(train_X_t, train_Y)

In [None]:
# Predict on the validation sequences (unscaled, matching how the model was fitted).
predictions = text_classifier.predict(valid_X_t)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Confusion matrix, per-class report and overall accuracy on the validation set.
print(confusion_matrix(valid_Y, predictions))
print(classification_report(valid_Y, predictions))
print(accuracy_score(valid_Y, predictions))

In [None]:
# saving
# Persist the trained logistic regression with joblib.
with open('model/logistic.joblib', 'wb') as handle:
    dump(text_classifier, handle) 

### SVM algorithm

In [None]:
from sklearn import svm

# Train a support-vector classifier on the min-max scaled sequences.
text_classifier = svm.SVC(gamma='auto')
text_classifier.fit(train_X_t_n, train_Y)

In [None]:
# Predict on the scaled validation sequences (the SVM was fitted on scaled data).
predictions = text_classifier.predict(valid_X_t_n)

In [None]:
# Confusion matrix, per-class report and overall accuracy on the validation set.
print(confusion_matrix(valid_Y, predictions))
print(classification_report(valid_Y, predictions))
print(accuracy_score(valid_Y, predictions))

In [None]:
# Persist the trained SVM with joblib (here by path rather than file handle).
dump(text_classifier, 'model/svm_svc.joblib')

### Simple Neural Network
Inspired by: https://machinelearningmastery.com/predict-sentiment-movie-reviews-using-deep-learning/

In [None]:
# create the model
# Embedding over the top_words vocabulary (32-dim vectors, sequences of
# length maxlen), flattened into one dense hidden layer and a single sigmoid
# unit for the binary party label.
model = Sequential()
model.add(Embedding(top_words, 32, input_length=maxlen))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is called
# directly instead of being wrapped in print().
model.summary()

In [None]:
# Train for 5 epochs; validation_split=0.2 sets aside a fifth of the training
# data for the per-epoch validation metrics stored in `history`.
history = model.fit(train_X_t, train_Y, batch_size=128, epochs=5, verbose=2, validation_split=0.2)

In [None]:
# Persist the trained network to an HDF5 file.
model.save('model/dl_snn.h5')

In [None]:
# Evaluate on the held-out validation split. The model was compiled with a
# loss and one metric, so score[0] is the loss and score[1] the accuracy.
score = model.evaluate(valid_X_t, valid_Y, verbose=0)

print('Test Score: {}'.format(score[0]))
print('Test Accuracy: {}'.format(score[1]))

In [None]:
# Plot the training curves recorded by model.fit.
# Older Keras releases log accuracy as 'acc'/'val_acc', newer ones as
# 'accuracy'/'val_accuracy'; use whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

fig = plt.figure()
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()

fig = plt.figure()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()

In [None]:
# Sigmoid outputs for the validation samples (rounded to 0/1 when scored).
predictions = model.predict(valid_X_t)

In [None]:
# Round the sigmoid outputs to hard 0/1 labels before scoring them.
print(confusion_matrix(valid_Y, predictions.round()))
print(classification_report(valid_Y, predictions.round()))
print(accuracy_score(valid_Y, predictions.round()))

### One-Dimensional Convolutional Neural Network Model
Inspired by: https://machinelearningmastery.com/predict-sentiment-movie-reviews-using-deep-learning/

In [None]:
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [None]:
# create the model
# Embedding (256-dim) -> 1D convolution (256 filters, kernel size 3) ->
# max pooling -> dense hidden layer -> single sigmoid unit for the binary label.
model = Sequential()
model.add(Embedding(top_words, 256, input_length=maxlen))
model.add(Conv1D(filters=256, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 10 epochs; validation_split=0.2 sets aside a fifth of the training
# data for the per-epoch validation metrics stored in `history`.
history = model.fit(train_X_t, train_Y, batch_size=128, epochs=10, verbose=2, validation_split=0.2)

In [None]:
# Persist the trained CNN to an HDF5 file.
model.save('model/dl_cnn.h5')

In [None]:
# Evaluate on the held-out validation split. The model was compiled with a
# loss and one metric, so score[0] is the loss and score[1] the accuracy.
score = model.evaluate(valid_X_t, valid_Y, verbose=0)

print('Test Score: {}'.format(score[0]))
print('Test Accuracy: {}'.format(score[1]))

In [None]:
# Plot the training curves recorded by model.fit and save them as PDFs.
# Older Keras releases log accuracy as 'acc'/'val_acc', newer ones as
# 'accuracy'/'val_accuracy'; use whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

fig = plt.figure()
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
# Save before show(): some backends release the figure once it has been shown.
fig.savefig('results/dl_cnn_train_curve_acc.pdf', bbox_inches='tight')
plt.show()

fig = plt.figure()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
fig.savefig('results/dl_cnn_train_curve_loss.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Sigmoid outputs for the validation samples (rounded to 0/1 when scored).
predictions = model.predict(valid_X_t)

In [None]:
# Round the sigmoid outputs to hard 0/1 labels before scoring them.
print(confusion_matrix(valid_Y, predictions.round()))
print(classification_report(valid_Y, predictions.round()))
print(accuracy_score(valid_Y, predictions.round()))