# Dataset

In [None]:
import csv
import string
import numpy as np

from tqdm import tqdm_notebook as tqdm

import sys
import nltk
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

def load_dataset(path):
    """Load tweet texts and their party labels from a CSV file.

    The file must start with a header row containing at least the columns
    'label' and 'text'. The label "democrat" is encoded as 0 and
    "republican" as 1.

    Blank lines are skipped silently. Rows that are too short to contain
    both columns, or whose label is neither of the two known values, are
    reported on stdout (prefixed with "ERROR : ") and skipped.

    path: path of the UTF-8 encoded, comma-separated CSV file

    Return: a tuple (x, y) of two lists of equal length, where x holds the
            tweet texts and y the matching integer labels

    Raises: StopIteration if the file has no header row, ValueError if the
            header lacks the 'label' or 'text' column
    """
    label_ids = {"democrat": 0, "republican": 1}

    x = []
    y = []

    with open(path, 'r', newline='', encoding="utf-8") as csvfile:

        reader = csv.reader(csvfile, quotechar='"', delimiter=',')

        # Taking the header of the file + the index of useful columns:
        header = next(reader)
        ind_label = header.index('label')
        ind_text = header.index('text')

        # A usable row must reach the right-most of the two columns
        min_columns = max(ind_label, ind_text) + 1

        for row in reader:

            # csv.reader yields an empty list for a blank line
            if not row:
                continue

            if len(row) < min_columns or row[ind_label] not in label_ids:
                print("ERROR : " + str(row))
                continue

            # Text and label are appended together, so both lists stay aligned
            x.append(row[ind_text])
            y.append(label_ids[row[ind_label]])

    return x, y


# Path of the dataset
path = "databases/stemmed.csv"

# Load every tweet text (X) and its integer party label (y) from the CSV
X, y = load_dataset(path)

# First split: hold out 15% of the samples as the test set.
# random_state is fixed so the split is the same on every run.
train_valid_X, test_X, train_valid_Y, test_Y = train_test_split(X, y, test_size=0.15, random_state=12)

# Second split: 18% of the remaining 85% (about 15.3% of the whole dataset)
# becomes the validation set; the rest is the training set.
train_X, valid_X, train_Y, valid_Y = train_test_split(train_valid_X, train_valid_Y, test_size=0.18, random_state=12)

# Report the size of each split
print("Length of training set : ", len(train_X))
print("Length of validation set : ", len(valid_X))
print("Length of test set : ", len(test_X))


# Dictionary

In [None]:
def bigram(tokens):
    """
    Build the bigrams of a token sequence.

    tokens: a list of strings

    Return: a list with one entry per pair of adjacent tokens, each entry
            being the two tokens joined by a single space. The list is
            empty when there are fewer than two tokens.
    """
    # Pair every token with its successor; zip stops at the shorter
    # sequence, so the last token never starts a pair
    return [" ".join(pair) for pair in zip(tokens, tokens[1:])]


# Returns unique words
def buildDict(tweets, addBigram=False):
    """
    Collect the distinct tokens found in a collection of tweets.

    tweets: an iterable of strings, each tokenized with word_tokenize
    addBigram: when true, the bigrams of each tweet are collected as well

    Return: a list holding every distinct token (and bigram, if enabled)
            exactly once. The list comes from a set, so its order is not
            sorted.

    The size of the resulting vocabulary is printed to stdout.
    """
    vocabulary = set()

    for text in tweets:

        tokens = word_tokenize(text)

        # Extend the unigrams with the bigrams of the same tweet
        if addBigram:
            tokens = tokens + bigram(tokens)

        # A set ignores entries it already holds
        vocabulary.update(tokens)

    # Get the stats
    print("Dict Dimension: " + str(len(vocabulary)))

    return list(vocabulary)


# Build the vocabulary from the training tweets only (single tokens;
# bigrams are disabled here)
wordDict = buildDict(train_X,addBigram=False)

# Bag-of-Words

In [None]:
class CountBoW(object):
    """
    Count-based bag-of-words encoder over a fixed vocabulary.

    Column j of every matrix produced by this class counts the occurrences
    of words[j]. Tokens that are not in the vocabulary are ignored.

    A word -> column lookup table is cached so that each token is resolved
    with one dictionary lookup instead of a linear scan of the vocabulary.
    The table is rebuilt whenever the `words` attribute is assigned; it is
    NOT refreshed if the list is mutated in place, so assign a new list
    to `words` (or create a new CountBoW) after changing the vocabulary.
    """

    def __init__(self, words):
        """
        words: list of words in the vocabulary (hashable, typically str)
        """
        self.words = words

    @property
    def words(self):
        """The vocabulary list given at construction or assigned later."""
        return self._words

    @words.setter
    def words(self, words):
        self._words = words

        # setdefault keeps the FIRST position of a duplicated word, which
        # is the position list.index() would return
        wordIndex = {}
        if words is not None:
            for position, word in enumerate(words):
                wordIndex.setdefault(word, position)
        self._wordIndex = wordIndex

    def computeLineBoW(self, tokens):
        """
        Compute the bag-of-words of a single tokenized tweet.

        tokens: a list of tokens (one tweet)

        Return: a dense numpy array of shape (1, len(words)), dtype int16

        Raises: Exception if no vocabulary was provided
        """
        return self.computeBoW([tokens])

    def computeBoW(self, tokens):
        """
        Compute the bag-of-words of several tokenized tweets, using the
        vocabulary held by this object (assumed to have been collected on
        the training set beforehand).

        tokens: a list of tokenized tweets (a list of lists of tokens)

        Return: a dense numpy array of shape (len(tokens), len(words)),
                dtype int16, where row i holds the counts for tokens[i]

        Raises: Exception if no vocabulary was provided
        """

        if self.words is None:
            raise Exception(
                "ERROR: You have not provided the dictionary"
            )

        # Init the BoW Matrix
        matrixBoW = np.zeros((len(tokens), len(self.words)), dtype=np.int16)

        lookup = self._wordIndex.get

        # Go through each tokenized tweet
        for row, tokenizedTweet in enumerate(tokens):

            for token in tokenizedTweet:

                # None means the token is not in the vocabulary
                column = lookup(token)
                if column is not None:
                    matrixBoW[row, column] += 1

        # Return the BoW Matrix
        return matrixBoW

In [None]:
# Module-level encoder built on the training vocabulary; the dictionary
# reduction loop and Dataset.__getitem__ both read this global
countBoW = CountBoW(wordDict)

## Dictionary Reduction

In [None]:
# Accumulate, over the whole training set, how many times each vocabulary
# word occurs. The result keeps the shape (1, len(wordDict)).
#
# The accumulator is int64 on purpose: computeLineBoW returns int16 rows,
# and summing them in int16 would wrap around for any word seen more than
# 32767 times, making a very frequent word look rare (or negative) when
# the counts are compared against a threshold afterwards.
matrixBoW = np.zeros((1, len(wordDict)), dtype=np.int64)

for tweet in tqdm(train_X):

    # Tokenize
    tokenized_tweet = word_tokenize(tweet)

    # Compute the BoW of this single tweet
    bowObject = countBoW.computeLineBoW(tokenized_tweet)

    # Add to the running total (int16 values are safely widened to int64)
    matrixBoW += bowObject

Only keep words that occurred more than once

In [None]:
# Keep the vocabulary entries whose accumulated count is strictly greater
# than 1. matrixBoW has a single row, so the second coordinate returned by
# argwhere is the position of the word in wordDict.
newWordDict = [wordDict[position[1]] for position in np.argwhere(matrixBoW > 1)]

# Fraction of the original vocabulary that survives the filter
reduction = len(newWordDict) / float(len(wordDict))
print(reduction)

In [None]:
# Persist the reduced vocabulary, one word per line.
# NOTE: this rebinds the module-level name `path`, which held the dataset
# location until now.
path = "databases/dictionary.txt"
with open(path, 'w+', newline='', encoding="utf-8") as output_file:
    output_file.writelines(str(word) + "\n" for word in newWordDict)

# LSTM

In [None]:
# Pytorch Dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F


In [None]:
import torch.utils.data

class Dataset(torch.utils.data.dataset.Dataset):
    """
    Dataset that serves one tweet at a time as a bag-of-words vector
    together with its label.

    Tokenization and encoding happen lazily in __getitem__, using
    word_tokenize and the module-level countBoW encoder.
    """

    def __init__(self, tweets, labels):
        """
        tweets: a sequence of raw tweet strings
        labels: a sequence of labels, aligned with tweets by position
        """

        # Raw tweet texts, encoded on access
        self.tweets = tweets

        # Labels, indexed the same way as the tweets
        self.labels = labels

        # The length is simply the number of tweets
        self.length = len(self.tweets)


    def __getitem__(self, index):
        """
        Return the sample at position index as a dict with two keys:
        "text", the bag-of-words array of shape (1, vocabulary size)
        produced by countBoW.computeLineBoW, and "label", the label
        converted to a string.
        """

        # Get Tweet
        tweet = self.tweets[index]

        # Tokenize
        tokenized_tweet = word_tokenize(tweet)

        # Compute the BoW of this single tweet. computeLineBoW takes one
        # flat list of tokens; computeBoW expects a list of token lists and
        # would iterate over the characters of each token instead.
        bowObject = countBoW.computeLineBoW(tokenized_tweet)

        # Get the label for this tweet (kept as a string)
        label = str(self.labels[index])

        return {"text":bowObject, "label":label}


    def __len__(self):
        """Return the number of tweets in the dataset."""
        return self.length