# Here we build our dictionary and train our model

We use the preprocessed tweets from candidates and parties as our labeled dataset.

In [None]:
# Libs
import csv
import string
import numpy as np
import time
import sys
import nltk
import random

from tqdm import tqdm_notebook as tqdm

from nltk.tokenize import word_tokenize

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import libs.bag_of_worder as bag_of_worder
import libs.preprocessor as tweet_preproc

# Instantiate the project's TwitterPreprocessor (from libs.preprocessor) once at module level.
# NOTE(review): this object is not referenced again in the visible part of the file — confirm it is still needed.
twitterPreprocessor = tweet_preproc.TwitterPreprocessor()

# Dataset

IMPORTANT: the label must take only two values — 0: democrat, 1: republican.

In [None]:
def load_dataset(path):
    """
    path: location of a CSV file whose header contains 'label' and 'text' columns

    :return: (texts, labels) where labels are 0 for "democrat" rows and
             1 for "republican" rows; rows with any other label are skipped.
    """
    # Mapping from the textual label found in the file to its numeric class
    label_codes = {"democrat": 0, "republican": 1}

    texts = []
    codes = []

    with open(path, 'r', newline='', encoding="utf-8") as csvfile:
        rows = csv.reader(csvfile, quotechar='"', delimiter=',')

        # The first row is the header; locate the two columns we need
        columns = next(rows)
        label_col = columns.index('label')
        text_col = columns.index('text')

        for row in rows:
            code = label_codes.get(row[label_col])

            # Unknown label: ignore the whole row
            if code is None:
                continue

            codes.append(code)
            texts.append(row[text_col])

        assert len(texts) == len(codes)

        return texts, codes

In [None]:
def checkBalanced(labels):
    """
    labels: a sequence of numeric labels

    :return: the fraction of entries in labels that are non-zero.
    """
    nonzero_total = np.count_nonzero(labels)
    sample_total = len(labels)
    return nonzero_total / sample_total


def balance_dataset(features, labels):
    """
    Randomly undersample the majority class so both labels are equally frequent.

    features: the samples (e.g. tweets)
    labels: the label of each sample; 1 is one class, anything else the other

    :return: (X, y) as two new lists in shuffled order, holding the same number
             of samples for each class. If one class is absent (or the input is
             empty) both lists are empty, since no balanced subset exists.
    :raises ValueError: if features and labels do not have the same length.
    """
    # Work on local copies so the caller's sequences are left untouched
    features = list(features)
    labels = list(labels)
    if len(features) != len(labels):
        raise ValueError("features and labels must have the same length")

    # Shuffle the (feature, label) pairs together so the dropped samples are random
    combined = list(zip(features, labels))
    random.shuffle(combined)

    # Every class is capped at the size of the smaller one
    nbr_of_ones = sum(1 for _, label in combined if label == 1)
    nbr_of_zeros = len(combined) - nbr_of_ones
    per_class = min(nbr_of_ones, nbr_of_zeros)

    # Build new lists rather than popping by index while iterating, which
    # shifts positions and ends up skipping elements or running out of range
    balanced_X = []
    balanced_y = []
    kept_ones = 0
    kept_zeros = 0
    for feature, label in combined:
        if label == 1:
            if kept_ones >= per_class:
                continue
            kept_ones += 1
        else:
            if kept_zeros >= per_class:
                continue
            kept_zeros += 1
        balanced_X.append(feature)
        balanced_y.append(label)

    return balanced_X, balanced_y

In [None]:
# Path of the labeled dataset (CSV with 'text' and 'label' columns, see load_dataset)
path = "data/parties_candidates/sources.csv"

# Load the tweets (X) and their 0/1 labels (y) from path
X, y = load_dataset(path)

# Make sure there is 50/50 of both labels
X, y = balance_dataset(X,y)

# Split the data: 10% is held out for validation, with a fixed seed so the split
# is reproducible and stratified on y so both splits keep the label proportions
train_X, valid_X, train_Y, valid_Y = train_test_split(X, y, test_size=0.1, random_state=12, shuffle=True, stratify=y)

# Report the size of each split
print("Length of training set : ", len(train_X))
print("Length of validation set : ", len(valid_X))

In [None]:
# Print the share of label 1 (republican) among the training labels;
# a value close to 0.5 means the training data is balanced
print("--- Proportion of republican label ---")
print(checkBalanced(train_Y))

# Dictionary

In [None]:
def bigram(tokens):
    """
    tokens: a list of strings

    :return: the list of bigrams, each one being two consecutive tokens
             joined by a single space (empty when there are fewer than two tokens).
    """
    # Pair every token with its successor
    return [" ".join(pair) for pair in zip(tokens, tokens[1:])]


# Returns unique words
def buildDict(tweets, addBigram=False):
    """
    tweets: an iterable of tweet strings
    addBigram: when True, the bigrams of each tweet are added as well

    :return: a list of the distinct tokens found in tweets (order is not specified).
    """
    vocabulary = set()

    for text in tweets:

        # Split the tweet into tokens
        tokens = word_tokenize(text)

        # Optionally extend the tokens with their bigrams
        if addBigram:
            tokens = tokens + bigram(tokens)

        # The set takes care of ignoring tokens it already holds
        vocabulary.update(tokens)

    # Get the stats
    print("Dict Dimension: " + str(len(vocabulary)))

    return list(vocabulary)


def loadDict(path="model/dictionary.txt"):
    """
    path: text file holding one dictionary entry per line
          (defaults to the dictionary stored in the model folder)

    :return: the list of entries in file order, each one stripped of
             leading/trailing whitespace.
    """
    with open(path, 'r', newline='', encoding="utf-8") as input_file:
        # strip() also removes the line terminator kept by newline=''
        wordDict = [row.strip() for row in input_file]

    # Get the stats
    print("Dict Dimension: " + str(len(wordDict)))

    return wordDict

In [None]:
# Load the word dictionary from disk (one entry per line, see loadDict)
wordDict = loadDict()

## Dictionary Reduction

In [None]:
def reduceDict(wordDict, countBoW, tweets, path="model/dictionary_new.txt"):
    """
    Write a reduced dictionary keeping only the entries counted more than once.

    wordDict: list of dictionary entries
    countBoW: Bag-of-word object exposing computeLine(tweet)
              (presumably returns one count per wordDict entry — confirm in libs.bag_of_worder)
    tweets: tweets over which the counts are accumulated
    path: output file, one kept entry per line

    Prints the kept/original size ratio; nothing is returned.
    """
    # Accumulate in int64: an int16 accumulator can wrap around for frequent
    # words, which would silently drop them from the reduced dictionary
    matrixBoW = np.zeros((1, len(wordDict)), dtype=np.int64)

    for tweet in tqdm(tweets):
        # Add this tweet's counts to the running total
        matrixBoW = np.add(matrixBoW, countBoW.computeLine(tweet))

    # Only keep words that occurred more than once; argwhere yields
    # (row, column) pairs and the column is the index into wordDict
    newWordDict = [wordDict[ind[1]] for ind in np.argwhere(matrixBoW > 1)]

    # An empty dictionary has no meaningful ratio; report 0.0 instead of dividing by zero
    total = len(wordDict)
    reduction = len(newWordDict) / total if total else 0.0
    print(reduction)

    # Write to file
    with open(path, 'w', newline='', encoding="utf-8") as output_file:
        output_file.writelines(str(word) + "\n" for word in newWordDict)

    print("New dictionary created!")

In [None]:
# reduceDict(wordDict,countBoW,train_X)

# Classifier

In [None]:
from joblib import dump, load

# Load Model
def loadModel():
    """
    Load the classifier stored in 'model/logistic.joblib'.

    :return: the loaded classifier, or None (after printing an error) when
             it could not be loaded.
    """
    try:
        # NOTE: joblib's load unpickles the file, which can execute arbitrary
        # code; only load model files from a trusted source
        classifier = load('model/logistic.joblib')

    # Exception rather than a bare except, so that KeyboardInterrupt and
    # SystemExit are not swallowed
    except Exception:
        print("ERROR: Model not saved")
        return None

    print("Model Loaded!")
    return classifier
        
        
def saveModel(clf):
    """
    clf: the classifier to persist

    Dumps clf to 'model/logistic.joblib'.
    """
    target = 'model/logistic.joblib'
    dump(clf, target)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

def train_evaluate(training_X, training_Y, validation_X, validation_Y, bowObj):
    """
    Fit a logistic regression on the training tweets and score it on both sets.

    training_X: tweets from the training dataset
    training_Y: tweet labels from the training dataset
    validation_X: tweets from the validation dataset
    validation_Y: tweet labels from the validation dataset
    bowObj: Bag-of-word object, used through its computeTFID method

    :return: (classifier, training accuracy, validation accuracy)
    """
    model = LogisticRegression(solver='lbfgs', multi_class='auto', n_jobs=-1)

    # Training set: vectorize once, fit, then score on the same representation
    train_features = bowObj.computeTFID(training_X)
    model.fit(train_features, training_Y)
    train_predictions = model.predict(train_features)
    train_accuracy = accuracy_score(training_Y, train_predictions)

    # Validation set: vectorize with the same object and score
    valid_features = bowObj.computeTFID(validation_X)
    valid_predictions = model.predict(valid_features)
    valid_accuracy = accuracy_score(validation_Y, valid_predictions)

    return model, train_accuracy, valid_accuracy


In [None]:
# Init Bag-of-Worder using the dictionary
countBoW = bag_of_worder.BagOfWorder(wordDict)

# Train on the training split and report the accuracy on both splits
classifier, trainAcc, validationAcc = train_evaluate(train_X,train_Y,valid_X,valid_Y,countBoW)
print("Training Accuracy: " + str(trainAcc))
print("Validation Accuracy: " + str(validationAcc))

In [None]:
# Save Model: persist the trained classifier to disk (see saveModel)
saveModel(classifier)

In [None]:
def checkPerf(classifier, countBoW):
    """
    classifier: a fitted model exposing predict
    countBoW: Bag-of-word object, used through its computeTFID method

    Prints the fraction of predictions on the module-level valid_X that
    equal the module-level valid_Y.
    """
    predictions = classifier.predict(countBoW.computeTFID(valid_X))

    # Number of predictions that match the expected label
    hits = np.count_nonzero(np.equal(predictions, valid_Y))
    print(hits / len(predictions))

In [None]:
# Print the accuracy of the trained classifier on the validation split
checkPerf(classifier,countBoW)