In [None]:
# libs
import csv
import os
import sys
import time
from glob import glob

from tqdm import tqdm_notebook as tqdm

import libs.bag_of_worder as bag_of_worder
import libs.preprocessor as tweet_preproc

## Load Model

In [None]:
from joblib import dump, load

# Load the trained classifier from disk.
# NOTE(review): joblib files are pickle-based, so only load model files
# that come from a trusted source.
try:
    classifier = load('model/logistic.joblib')
    print("Model loaded!")

except Exception as err:
    # Catch Exception rather than using a bare except so that kernel
    # interrupts (KeyboardInterrupt / SystemExit) still propagate, and show
    # the cause so a missing or incompatible model file can be diagnosed.
    # `classifier` stays undefined in this case, so the predict cells will fail.
    print("ERROR: Model not loaded")
    print("Reason: " + repr(err))

## Load Dictionary

In [None]:
# Vocabulary file: one dictionary entry per line
path = "model/dictionary.txt"

# Collect every line of the file, with surrounding whitespace removed
wordDict = []
with open(path, 'r', newline='', encoding="utf-8") as input_file:
    wordDict.extend(entry.strip() for entry in input_file)

# Report how many entries were read
print("Dict Dimension: " + str(len(wordDict)))

## Load Objects

In [None]:
# Init Preprocessor
# Its preprocess / preprocessAll methods are called by the predict functions
# on the raw tweet text before it is vectorized.
twitterPreprocessor = tweet_preproc.TwitterPreprocessor()

# Init Bag-of-Worder using the dictionary
# Built over wordDict; its computeLine / computeMatrix methods turn the
# preprocessed tweets into the input passed to classifier.predict_proba.
countBoW = bag_of_worder.BagOfWorder(wordDict)

## Predict Functions

In [None]:
# Human-readable name for each prediction code returned by the predict
# functions (-1, 0, 1), keyed by the code written as a string.
labels = dict([
    ("-1", "neutral"),
    ("0", "democrat"),
    ("1", "republican"),
])

In [None]:
def _labelFromProbabilities(prob_dem, prob_rep, min_confidence):
    """Turn the two class probabilities into a prediction code.

    The more probable class is selected first and is only kept when its
    probability is strictly greater than ``min_confidence``. Choosing the
    class before thresholding matters when ``min_confidence`` is below 0.5:
    both probabilities can then exceed the threshold, and the label must go
    to the larger one instead of always to the first branch.

    Returns:
        0 for democrat, 1 for republican, -1 when not confident enough.
    """
    if prob_dem >= prob_rep:
        best_label, best_prob = 0, prob_dem
    else:
        best_label, best_prob = 1, prob_rep

    return best_label if best_prob > min_confidence else -1


def predictTweets(tweets,min_confidence=0.5):
    """Predict a label for every tweet of a list.

    Args:
        tweets: raw tweet texts; they are passed to
            twitterPreprocessor.preprocessAll and then to
            countBoW.computeMatrix.
        min_confidence: probability that the winning class must exceed.

    Returns:
        A list with one code per tweet: 0 (democrat), 1 (republican) or
        -1 (below the confidence threshold).

    Raises:
        AssertionError: if the number of predictions differs from the number
            of preprocessed tweets.
    """
    # Preprocess
    tweets = twitterPreprocessor.preprocessAll(tweets)

    # Create a one hot matrix of the words in the tweets
    oneHotTweets = countBoW.computeMatrix(tweets)

    preds = []
    for oneHotTweet in oneHotTweets:

        # NOTE(review): assumes predict_proba returns the columns in the
        # order (democrat, republican), i.e. classifier.classes_ == [0, 1]
        # - confirm against the training notebook.
        prob_dem, prob_rep = classifier.predict_proba(oneHotTweet)[0]

        # Compare to min confidence level
        preds.append(_labelFromProbabilities(prob_dem, prob_rep, min_confidence))

    # Make sure they are the same size
    assert len(preds) == len(tweets)

    return preds


def predictTweet(tweet,min_confidence=0.5):
    """Predict the label of a single tweet.

    Args:
        tweet: raw tweet text; it is passed to twitterPreprocessor.preprocess
            and then to countBoW.computeLine.
        min_confidence: probability that the winning class must exceed.

    Returns:
        0 (democrat), 1 (republican) or -1 (below the confidence threshold).
    """
    # Preprocess
    tweet = twitterPreprocessor.preprocess(tweet)

    # Create a one hot vector of the words in the tweet
    oneHotTweet = countBoW.computeLine(tweet)

    # NOTE(review): same column-order assumption as in predictTweets
    # (democrat first, republican second) - confirm.
    prob_dem, prob_rep = classifier.predict_proba(oneHotTweet)[0]

    # Compare to min confidence level
    return _labelFromProbabilities(prob_dem, prob_rep, min_confidence)
    

## Load Tweets and Predict Files

In [None]:
def file_len(fname):
    """Count the lines of a text file, print the count and return it.

    Every physical line is counted (a header line included). An empty file
    gives 0 instead of failing on an unbound loop variable.

    Args:
        fname: path of the file to read.

    Returns:
        The number of lines in the file.
    """
    # Read as UTF-8 like the other cells that open the tweet files;
    # errors="replace" keeps a stray undecodable byte from aborting a count.
    with open(fname, encoding="utf-8", errors="replace") as f:
        nbrOfLines = sum(1 for _ in f)

    print("Nbr of lines : " + str(nbrOfLines))

    return nbrOfLines

In [None]:
def predictFile(src_path,out_path,MIN_CONFIDENCE = 0.8):
    """Label every tweet of a CSV file and write the confident ones to a new CSV.

    The source file must have a header row containing the columns
    'created_at', 'text' and 'location'. The 'text' column of every row is
    passed to predictTweets; rows whose prediction is negative (not confident
    enough) are left out of the output.

    The output has the columns label, created_at, text, location. It is
    written with csv.writer and every field quoted, so quotes, commas and
    line breaks inside a tweet are escaped and the file can be read back
    with csv.reader.

    Args:
        src_path: path of the input tweets CSV (UTF-8).
        out_path: path of the predictions CSV to create or overwrite.
        MIN_CONFIDENCE: threshold forwarded to predictTweets.

    Raises:
        StopIteration: if the source file has no header row.
        ValueError: if one of the required columns is missing from the header.
    """
    # Prints the number of lines of the input file
    file_len(src_path)

    # Read the whole input before touching the output, so that a bad input
    # file does not leave behind a truncated predictions file.
    with open(src_path, 'r', newline='', encoding="utf-8") as csvfile:

        # init reader
        reader = csv.reader(csvfile, quotechar='"', delimiter=',')

        # Taking the header of the file + the index of useful columns:
        header = next(reader)
        ind_createdAt = header.index('created_at')
        ind_text = header.index('text')
        ind_location = header.index('location')

        # convert tweets file to list (csv.reader yields [] for blank lines)
        all_tweets = [
            [row[ind_createdAt], row[ind_text], row[ind_location]]
            for row in reader
            if row
        ]

    tweets = [tweet[1] for tweet in all_tweets]

    # Predict all tweets
    preds = predictTweets(tweets,min_confidence=MIN_CONFIDENCE)

    # Init counter
    tweet_counter = 0

    with open(out_path, 'w', newline='', encoding="utf-8") as outfile:

        # QUOTE_ALL + '\n' keeps the layout of the previous hand-built rows
        # while escaping embedded quotes correctly
        writer = csv.writer(outfile, quoting=csv.QUOTE_ALL, lineterminator='\n')

        # Write headers for first row
        writer.writerow(["label", "created_at", "text", "location"])

        # Write to file
        for pred, tweet in tqdm(zip(preds, all_tweets), total=len(preds)):

            # If failed skip
            if pred < 0:
                continue

            created_at, text, location = tweet
            writer.writerow([str(pred), created_at, text, location])

            # increment counter
            tweet_counter += 1

    print("Nbr of tweets labeled: " + str(tweet_counter))

In [None]:
# Glob all the tweets csv (sorted so the files are processed in a stable order)
filenames = sorted(glob("data/general/*/tweets.csv"))
for fname in filenames:

    print(fname)
    # Write predictions.csv next to its tweets.csv. os.path is used instead of
    # splitting on "/" because glob returns the platform separator, so on
    # Windows the split would drop the dated sub-folder from the path.
    outpath = os.path.join(os.path.dirname(fname), "predictions.csv")
    predictFile(fname, outpath,MIN_CONFIDENCE=0.85)

In [None]:
# Re-run a single day with a stricter confidence threshold (0.95)
fname = 'data/general/2016-11-04/tweets.csv'

print("\n" + fname)
# predictions.csv goes in the same folder as the input file
outpath = fname.rsplit("/", 1)[0] + "/predictions.csv"
predictFile(fname, outpath,MIN_CONFIDENCE=0.95)