In [1]:
# libs
import sys
import nltk
import csv
import time
import sys

import re
import string
import numpy as np

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords 

## Load Model

In [2]:
from joblib import dump, load

# Load the pre-trained classifier that was persisted with joblib.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code -- only load model files that come from a trusted source.
try:
    classifier = load('logistic.joblib')

except FileNotFoundError:
    # Only a missing model file is tolerated. Any other failure (corrupt or
    # incompatible file, interrupt, ...) now propagates instead of being
    # reported as "Model not saved".
    # Bind the name explicitly so a stale `classifier` left over from an
    # earlier run of this cell cannot be used by mistake.
    classifier = None
    print("Model not saved")

## Load Dictionary

In [3]:
# Vocabulary container, filled from the dictionary file below
wordDict = []

# The dictionary file holds one entry per line
path = "databases/dictionary.txt"
with open(path, 'r', newline='', encoding="utf-8") as input_file:
    # Drop surrounding whitespace (including the line terminator) of each line
    wordDict.extend(map(str.strip, input_file))

# Report how many entries were read
print("Dict Dimension: ", len(wordDict), sep="")

Dict Dimension: 25890


## Tweet Preprocessing Pipeline

In [4]:
def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit."""
    for char in inputString:
        if char.isdigit():
            return True
    return False
    
# Emoji pattern: one character class, repeated one or more times, built from
# the code-point ranges listed below.
# Note: the last range (U+24C2..U+1F251) is very wide and therefore also
# matches many non-emoji characters that fall inside it (e.g. CJK ideographs).
emoji_pattern = re.compile(
    "[" + "".join([
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    ]) + "]+",
    re.UNICODE,
)

# Text emoticons treated as "sad"
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Text emoticons treated as "happy"
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Every known emoticon, regardless of sentiment
emoticons_all = emoticons_happy | emoticons_sad

class TwitterPreprocessor():
    """
    Text normalisation pipeline for tweets.

    ``clean`` strips URLs, colons, non-ASCII characters, stop words,
    emoticons, punctuation and digit-bearing tokens; ``stem`` reduces the
    remaining words to their Snowball stems; ``preprocess`` chains both.

    Attributes set by ``__init__``:
        stemmer: English ``SnowballStemmer`` built with ``ignore_stopwords=True``
        stopwords: set of the NLTK English stop words
        emoticons: the module-level ``emoticons_all`` set
    """

    def __init__(self):
        self.stemmer = SnowballStemmer("english", ignore_stopwords=True)
        self.stopwords = set(stopwords.words('english'))
        self.emoticons = emoticons_all


    def clean(self, tweet):
        """
        Remove noise from a raw tweet.

        tweet: the raw tweet text (str)

        Return: the kept tokens joined by single spaces, with every remaining
        punctuation character removed. Tokens are compared as-is (no case
        folding is applied before the stop-word / emoticon checks).
        """

        # remove URLs (raw string: '\.' and '\s' are regex escapes, not
        # Python string escapes -- the compiled pattern is unchanged)
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)

        # remove every colon
        tweet = re.sub(r':', '', tweet)

        # NOTE(review): this literal looks like a mis-decoded ellipsis
        # character -- confirm against the data before changing it
        tweet = re.sub(r'‚Ä¶', '', tweet)

        #replace consecutive non-ASCII characters with a space
        tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)

        #remove emojis from tweet (only ASCII characters remain at this
        # point, so this is kept as a safety net)
        tweet = emoji_pattern.sub(r'', tweet)

        # Tokenize
        tokens = word_tokenize(tweet)

        # Keep a token only if it is not a stop word, not an emoticon, not
        # found inside the punctuation string and contains no digit.
        # `punctuation` is a str, so `w in punctuation` is a substring test.
        filtered_tweet = [
            w for w in tokens
            if w not in self.stopwords
            and w not in self.emoticons
            and w not in punctuation
            and not hasNumbers(w)
        ]

        # Join filtered tokens and remove leading/trailing whitespaces
        tweet = ' '.join(filtered_tweet).strip()

        # Make sure there is no punctuation in the characters
        tweet = ''.join([c for c in tweet if c not in punctuation])

        return tweet


    def stem(self, tweet):
        """
        Stem every token of ``tweet``.

        tweet: text to stem (str), tokenized with ``word_tokenize``

        Return: the stemmed tokens joined by single spaces
        """

        # Tokenize
        tokens = word_tokenize(tweet)

        # Replace each token by its stem
        tokens = [self.stemmer.stem(token) for token in tokens]

        # Join
        return ' '.join(tokens)


    def preprocess(self, tweet):
        """Run ``clean`` and then ``stem`` on ``tweet`` and return the result."""

        tweet = self.clean(tweet)
        tweet = self.stem(tweet)

        return tweet
           
    

In [5]:
from scipy.sparse import csr_matrix

class CountBoW(object):
    """
    Bag-of-words count vectorizer over a fixed vocabulary.

    Column ``j`` of every produced row counts the occurrences of the
    vocabulary word whose (first) position in ``words`` is ``j``. Tokens that
    are not in the vocabulary are ignored.
    """

    def __init__(self, words):
        """
        words: list of words in the vocabulary (its order defines the columns)

        The word -> column lookup is built once here, so the vocabulary is
        treated as fixed after construction.
        """
        self.words = words

        # Map each word to the index of its FIRST occurrence, which is what
        # list.index() returns, without a linear scan on every token.
        self._wordIndex = {}
        if words is not None:
            for position, word in enumerate(words):
                self._wordIndex.setdefault(word, position)


    def _countTokens(self, tweet):
        """Return a dict {column index: count} for the vocabulary tokens of ``tweet``."""
        counts = {}
        for token in word_tokenize(tweet):
            dictIndex = self._wordIndex.get(token)
            if dictIndex is not None:
                counts[dictIndex] = counts.get(dictIndex, 0) + 1
        return counts


    def computeLine(self, tweet):
        """
        Compute the bag-of-words row of a single tweet.

        tweet: the (already preprocessed) tweet text, tokenized with
               ``word_tokenize``

        Return: a dense numpy array of shape (1, len(words)), dtype int16
        """

        # Init the BoW Matrix
        matrixBoW = np.zeros((1, len(self.words)),dtype=np.int16)

        # Write the count of every vocabulary token found in the tweet
        for dictIndex, count in self._countTokens(tweet).items():
            matrixBoW[0, dictIndex] = count

        # Return the BoW Matrix
        return matrixBoW


    def computeMatrix(self, tweets):
        """
        Compute the bag-of-words matrix of a list of tweets, using the
        vocabulary given at construction (assumed to have been collected on
        the training set beforehand).

        tweets: list of tweet texts (one string per tweet)

        Return: a csr_matrix of shape (len(tweets), len(words)), dtype int16

        Raises: ValueError if no vocabulary was provided
        """

        if self.words is None:
            raise ValueError(
                "ERROR: You have not provided the dictionary"
            )

        # Collect only the non-zero cells so that no dense
        # len(tweets) x len(words) array has to be allocated.
        rows, cols, values = [], [], []

        for i, tweet in enumerate(tweets):
            for dictIndex, count in self._countTokens(tweet).items():
                rows.append(i)
                cols.append(dictIndex)
                values.append(count)

        # Build the CSR matrix directly from the (value, (row, col)) triplets
        matrixBoW = csr_matrix(
            (
                np.asarray(values, dtype=np.int16),
                (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64)),
            ),
            shape=(len(tweets), len(self.words)),
            dtype=np.int16,
        )

        # Return the BoW Matrix
        return matrixBoW
    

In [6]:
# Tweet cleaner / stemmer shared by the prediction helpers below
twitterPreprocessor = TwitterPreprocessor()

# Bag-of-words vectorizer backed by the vocabulary loaded earlier
countBoW = CountBoW(words=wordDict)

def predictTweets(tweets, min_confidence=0.5):
    """
    Classify several tweets.

    tweets: iterable of raw tweet texts
    min_confidence: probability threshold forwarded to ``predictTweet``
                    (defaults to 0.5, the default of ``predictTweet``)

    Return: list with one ``predictTweet`` result per tweet, in input order
    """
    return [predictTweet(tweet, min_confidence=min_confidence) for tweet in tweets]


def predictTweet(tweet,min_confidence=0.5):
    """
    Classify a single raw tweet with the module-level ``classifier``.

    tweet: raw tweet text
    min_confidence: a class is only reported when its predicted probability
                    is strictly greater than this threshold

    Return: 0 when the first class probability exceeds the threshold, 1 when
    the second one does, -1 when neither does.
    """

    # Clean + stem, then turn the text into a (1, vocabulary) count row
    cleaned = twitterPreprocessor.preprocess(tweet)
    features = countBoW.computeLine(cleaned)

    # Probabilities of the single input row; exactly two classes are expected.
    # NOTE(review): the names suggest class 0 = "dem" and class 1 = "rep" --
    # confirm against the label encoding used at training time.
    probabilities = classifier.predict_proba(features)[0]
    prob_dem, prob_rep = probabilities

    # The first class is checked first, so it wins if both exceed the threshold
    if prob_dem > min_confidence:
        return 0
    if prob_rep > min_confidence:
        return 1
    return -1
    

## Load Tweets and Predict

In [None]:
path = "databases/elections.csv"

with open(path, 'r', newline='', encoding="utf-8") as csvfile:

    reader = csv.reader(csvfile, quotechar='"', delimiter=',')

    # First row is the header: locate the columns used below
    header = next(reader)
    ind_createdAt, ind_text, ind_description, ind_location = (
        header.index(column)
        for column in ('created_at', 'text', 'description', 'location')
    )

    # Number of data rows processed so far
    tweet_counter = 0

    for row in reader:

        # Pull the fields of interest out of the current row
        created_at, text, description, location = (
            row[ind_createdAt],
            row[ind_text],
            row[ind_description],
            row[ind_location],
        )

        # Predictions for the user description, then for the tweet text
        print(predictTweet(description, min_confidence=0.9))
        print(predictTweet(text, min_confidence=0.9))

        # Echo the raw inputs, framed by blank lines
        print("\n")
        print(description)
        print(text)
        print("\n")

        tweet_counter += 1
