# Dataset

In [None]:
import csv
import string
import numpy as np
import time

from tqdm import tqdm_notebook as tqdm

import sys
import nltk
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

def load_dataset(path):
    """Read tweets and their party labels from a CSV file.

    The first row of the file is a header that must contain a ``label``
    and a ``text`` column. A ``label`` of ``"democrat"`` is encoded as 0
    and ``"republican"`` as 1.

    Rows that cannot be used are skipped instead of aborting the load:
    blank lines are ignored silently, while rows that are too short or
    carry an unknown label are printed with an ``ERROR : `` prefix.

    path: location of the CSV file (read as UTF-8)

    :return: ``(x, y)`` where ``x`` is the list of tweet texts and ``y``
             the list of matching integer labels (same length).
    :raises ValueError: if the header lacks a ``label`` or ``text`` column.
    :raises StopIteration: if the file has no header row at all.
    """
    label_ids = {"democrat": 0, "republican": 1}

    x = []
    y = []

    with open(path, 'r', newline='', encoding="utf-8") as csvfile:

        reader = csv.reader(csvfile, quotechar='"', delimiter=',')

        # Taking the header of the file + the index of useful columns:
        header = next(reader)
        ind_label = header.index('label')
        ind_text = header.index('text')

        # A usable row must be wide enough to hold both columns.
        min_width = max(ind_label, ind_text) + 1

        for row in reader:

            # csv.reader yields an empty list for a blank line.
            if not row:
                continue

            if len(row) < min_width or row[ind_label] not in label_ids:
                print("ERROR : " + str(row))
                continue

            # Text and label are appended together, so x and y stay aligned.
            x.append(row[ind_text])
            y.append(label_ids[row[ind_label]])

    return x, y


# Path of the dataset (CSV with a header row containing 'label' and 'text')
path = "databases/labeler/stemmed.csv"

# X holds the tweet texts, y the 0/1 labels produced by load_dataset
X, y = load_dataset(path)

# Hold out 15% of the data as the test set (fixed seed for a repeatable split)
train_valid_X, test_X, train_valid_Y, test_Y = train_test_split(X, y, test_size=0.15, random_state=12)

# Take 18% of the remaining 85% (about 15% of the whole) as the validation set
train_X, valid_X, train_Y, valid_Y = train_test_split(train_valid_X, train_valid_Y, test_size=0.18, random_state=12)

# Report the size of each split
print("Length of training set : ", len(train_X))
print("Length of validation set : ", len(valid_X))
print("Length of test set : ", len(test_X))


# Dictionary

In [None]:
def bigram(tokens):
    """
    Build the bigrams of a token sequence.

    tokens: a list of strings

    :return: a list with one "left right" string per pair of adjacent
             tokens, in order; empty when there are fewer than two tokens.
    """
    # Pair every token with its successor and glue each pair with a space
    return [" ".join(pair) for pair in zip(tokens, tokens[1:])]


# Returns unique words
def buildDict(tweets, addBigram=False):
    """
    Collect the distinct tokens found in a collection of tweets.

    tweets: iterable of tweet strings, each tokenized with word_tokenize
    addBigram: when true, the bigrams of every tweet are collected as well

    :return: a list of the unique tokens (unordered, since it comes from a set).
             The size of the vocabulary is printed before returning.
    """
    vocabulary = set()

    for text in tweets:

        tokens = word_tokenize(text)

        # Optionally extend the unigrams with the tweet's bigrams
        if addBigram:
            tokens = tokens + bigram(tokens)

        # A set ignores entries it already holds
        vocabulary.update(tokens)

    # Get the stats
    print("Dict Dimension: " + str(len(vocabulary)))

    return list(vocabulary)


def loadDict(path="databases/dictionary.txt"):
    """
    Load a vocabulary from a text file holding one entry per line.

    path: location of the dictionary file (read as UTF-8); defaults to the
          file written by the dictionary-reduction step.

    :return: a list with one entry per line of the file, in file order, with
             surrounding whitespace (including line endings) stripped.
             The number of entries is printed before returning.
    """
    with open(path, 'r', newline='', encoding="utf-8") as input_file:
        wordDict = [row.strip() for row in input_file]

    # Get the stats
    print("Dict Dimension: " + str(len(wordDict)))

    return wordDict
    

# Load the vocabulary (one entry per line) from the dictionary file on disk
wordDict = loadDict()

# Bag-of-Words

In [None]:
from scipy.sparse import csr_matrix

class CountBoW(object):
    """
    Bag-of-words featurizer that counts vocabulary tokens in tweets.

    Column j of every produced row holds the number of times words[j]
    occurs among the tokens returned by word_tokenize for that tweet.
    Tokens that are not in the vocabulary are ignored.
    """

    def __init__(self, words):
        """
        words: list of words in the vocabulary; its order defines the
               column order of the produced matrices
        """
        self.words = words

        # (words object, its length, token -> column) filled in lazily by
        # _tokenIndex, so a token lookup is a dict hit instead of a linear
        # list.index() scan over the whole vocabulary.
        self._indexCache = None


    def _tokenIndex(self):
        """
        Return the token -> column mapping for the current self.words.

        The mapping is rebuilt when self.words has been rebound or resized
        since the last call. Replacing an entry in place without changing
        the length is not detected.
        """
        words = self.words
        cache = getattr(self, "_indexCache", None)

        if cache is None or cache[0] is not words or cache[1] != len(words):
            lookup = {}
            for column, word in enumerate(words):
                # Keep the first occurrence of a duplicate, like list.index()
                lookup.setdefault(word, column)
            cache = (words, len(words), lookup)
            self._indexCache = cache

        return cache[2]


    def _tokenColumns(self, tweet):
        """
        Return the column of every in-vocabulary token of the tweet,
        in token order and with repeats kept.
        """
        lookup = self._tokenIndex()
        return [lookup[token] for token in word_tokenize(tweet) if token in lookup]


    def computeLine(self, tweet):
        """
        Compute the bag-of-words row of a single tweet.

        tweet: the tweet text (a string)

        :return: a dense numpy array of shape (1, len(words)) and dtype int16
        """
        # Init the BoW row
        matrixBoW = np.zeros((1, len(self.words)),dtype=np.int16)

        # Count each occurrence of a known token
        for column in self._tokenColumns(tweet):
            matrixBoW[0, column] += 1

        # Return the BoW row
        return matrixBoW


    def computeMatrix(self, tweets):
        """
        Compute the bag-of-words matrix of a list of tweets using the
        vocabulary given at construction time.

        tweets: an indexable sequence of tweet texts (strings)

        :return: a csr_matrix of shape (len(tweets), len(words)) and dtype int16
        :raises Exception: if no vocabulary was provided (words is None)
        """

        if self.words is None:
            raise Exception(
                "ERROR: You have not provided the dictionary"
            )

        # Coordinates of every counted token occurrence. Building the sparse
        # matrix from these avoids allocating a dense
        # len(tweets) x len(words) array first.
        rows = []
        columns = []

        for i in tqdm(range(0,len(tweets))):

            tweetColumns = self._tokenColumns(tweets[i])

            rows.extend([i] * len(tweetColumns))
            columns.extend(tweetColumns)

        # One unit per occurrence; duplicate (row, column) pairs are summed
        # when the CSR matrix is assembled, which yields the counts.
        occurrences = np.ones(len(columns), dtype=np.int16)
        coordinates = (
            np.asarray(rows, dtype=np.int64),
            np.asarray(columns, dtype=np.int64),
        )

        # Return the BoW Matrix
        return csr_matrix(
            (occurrences, coordinates),
            shape=(len(tweets), len(self.words)),
            dtype=np.int16,
        )

In [None]:
# Bag-of-words featurizer over the loaded vocabulary
countBoW = CountBoW(wordDict)

## Dictionary Reduction

In [None]:
def reduceDict(wordDict,countBoW,tweets,path="databases/dictionary.txt"):
    """
    Keep only the vocabulary entries that occur more than once across the
    given tweets and write them to a dictionary file, one entry per line.

    wordDict: list of words; entry j must correspond to column j of the
              rows returned by countBoW.computeLine
    countBoW: object whose computeLine(tweet) returns the count row of a tweet
    tweets: iterable of tweet texts
    path: output file (written as UTF-8, overwritten if it exists)

    The fraction of entries that were kept is printed (0.0 for an empty
    wordDict). Nothing is returned.
    """

    # Corpus-wide totals are accumulated in int64: the per-tweet rows are
    # int16 and summing them in that dtype would wrap around for any word
    # seen more than 32767 times, silently dropping the most frequent words.
    totals = np.zeros(len(wordDict), dtype=np.int64)

    for tweet in tqdm(tweets):

        # Compute the BoW row of this tweet and add it to the totals
        totals += np.asarray(countBoW.computeLine(tweet)).reshape(-1)


    # Only keep words that occurred more than once
    newWordDict = [wordDict[int(ind)] for ind in np.flatnonzero(totals > 1)]

    # Guard the ratio against an empty input dictionary
    reduction = len(newWordDict)/(1.0*len(wordDict)) if len(wordDict) else 0.0
    print(reduction)


    # Write to file
    with open(path, 'w+', newline='', encoding="utf-8") as output_file:
        output_file.writelines(str(word) + "\n" for word in newWordDict)

# Classifier

In [None]:
from joblib import dump, load

# Load Model
def loadModel(path='logistic.joblib'):
    """
    Load a previously saved classifier.

    path: file to read the model from; defaults to the file written by
          saveModel

    :return: the loaded classifier, or None when it could not be loaded
             (a message explaining why is printed in that case).
    """
    try:
        return load(path)

    except FileNotFoundError:
        # Nothing has been saved at this location yet
        print("Model not saved")

    except Exception as err:
        # Stay best-effort for any other failure, but say what went wrong
        # instead of claiming the file is missing. Unlike a bare except,
        # this lets KeyboardInterrupt and SystemExit propagate.
        print("Model could not be loaded: " + str(err))

    return None
        
        
def saveModel(clf, path='logistic.joblib'):
    """
    Persist a classifier to disk with joblib.

    clf: the classifier to save
    path: destination file; defaults to the file read by loadModel
    """
    dump(clf, path)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

def train_evaluate(training_X, training_Y, validation_X, validation_Y, bowObj):
    """
    Fit a logistic-regression classifier on bag-of-words features and score it.

    training_X: tweets from the training dataset
    training_Y: tweet labels from the training dataset
    validation_X: tweets from the validation dataset
    validation_Y: tweet labels from the validation dataset
    bowObj: Bag-of-word object providing computeMatrix(tweets)

    :return: the fitted classifier, its accuracy on the training dataset
             and its accuracy on the validation dataset.
    """
    model = LogisticRegression(n_jobs=-1,solver='lbfgs', multi_class='auto')

    # Featurize the training tweets once; reused for fitting and scoring
    train_features = bowObj.computeMatrix(training_X)
    model.fit(train_features, training_Y)

    train_predictions = model.predict(train_features)
    train_accuracy = accuracy_score(training_Y, train_predictions)

    valid_features = bowObj.computeMatrix(validation_X)
    valid_predictions = model.predict(valid_features)
    valid_accuracy = accuracy_score(validation_Y, valid_predictions)

    return model, train_accuracy, valid_accuracy


In [None]:
# Build the bag-of-words featurizer over the loaded vocabulary, then fit the
# classifier and measure its accuracy on the training and validation splits.
countBoW = CountBoW(wordDict)
classifier, trainAcc, validationAcc = train_evaluate(train_X,train_Y,valid_X,valid_Y,countBoW)

In [None]:
# Report the accuracies returned by train_evaluate
print("Training Accuracy: " + str(trainAcc))
print("Validation Accuracy: " + str(validationAcc))

In [None]:
def predict(tweet,label):
    """
    Classify one tweet with the module-level classifier and print the outcome.

    tweet: the tweet text
    label: the actual label of the tweet (0 for Democrat, anything else is
           reported as Republican)

    Prints the tweet, the predicted party and the actual party.

    :return: whether the predicted label equals the actual label.
    """
    # Featurize the tweet and take the single prediction
    features = countBoW.computeLine(tweet)
    predicted = classifier.predict(features)[0]

    print(tweet)
    print("Pred: Democrat" if predicted == 0 else "Pred: Republican")
    print("Actual: Democrat" if label == 0 else "Actual: Republican")

    return predicted == label

In [None]:
# Check performance on a small sample of the training set.
# The same slice is applied to the tweets and to their labels, so the
# predictions are compared against the labels of the very same tweets
# (comparing against the whole of train_Y is a shape mismatch).
sample = slice(1, 4)
preds = classifier.predict(countBoW.computeMatrix(train_X[sample]))
np.count_nonzero(np.equal(preds,train_Y[sample]))/len(preds)