## Library Imports

In [1]:
from whoosh import index, writing, scoring
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
from nltk.tokenize import *
import os, os.path
import shutil
import pandas as pd
import re
import tqdm

## File Imports

In [2]:
# Locations of the input files used by this notebook.
# All four are relative paths, so they resolve against the kernel's working directory.
classified_tweets_path = "classified_tweets.txt"      # parsed as CSV by pd.read_csv in a later cell
unclassified_tweets_path = "unclassified_tweets.txt"  # read line by line in a later cell
corpus_path = "corpus.txt"                            # NOTE(review): not read in the cells shown here - confirm usage
stop_words_path = "stop_words.txt"                    # NOTE(review): not read in the cells shown here - confirm usage

In [76]:
# Load the labelled tweets from the CSV file (decoded as UTF-8)
# and preview the first rows of the resulting dataframe.
classified_tweets = pd.read_csv(
    classified_tweets_path,
    encoding="utf-8",
)
classified_tweets.head()

Unnamed: 0,class,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [77]:
# Read the raw unclassified tweets, one tweet per line.
# The encoding is stated explicitly (as it is for the classified tweets) so the
# result does not depend on the platform's default text encoding.
with open(unclassified_tweets_path, encoding="utf-8") as f:
    # Drop blank / whitespace-only lines and strip the newline characters
    # from the lines that are kept.
    unclassified_tweets_temp = [line.strip('\n') for line in f if line.strip()]

# Write to dataframe (single 'text' column, one row per tweet)
unclassified_tweets = pd.DataFrame(unclassified_tweets_temp, columns=['text'])

# Spot-check one row; raises KeyError if the file holds fewer than 256 tweets
unclassified_tweets.loc[255,:]

text    take back canada! #votetogether for strongest ...
Name: 255, dtype: object

## Clean Tweets

In [78]:
def tweet_cleaner(tweet_df):
    '''
    Clean and tokenize the 'text' column of a dataframe.

    Input: dataframe with a 'text' column populated with strings to be cleaned.
    Output: the same dataframe object (modified in place and also returned) whose
        'text' entries have been processed as follows:
        a) URLs are removed
        b) The remaining text is split into tokens
        c) Tokens are lowercased
        d) Stop words are removed (Whoosh's default StopFilter settings)
        e) The string is replaced by the resulting list of tokens

    Entries that raise a TypeError while being cleaned (e.g. values that are not
    strings, such as NaN or an already-tokenized list) are left unchanged, and a
    single warning line is printed for the whole dataframe.

    NOTE: removing HTML tags/attributes and decoding HTML character codes are
    NOT performed by this function.
    '''
    # Whoosh analysis pipeline: tokenize, switch to lowercase and remove stop words
    tokenizer = RegexTokenizer() | LowercaseFilter() | StopFilter()
    # Regex used to remove URLs
    # Source: https://gist.github.com/gruber/249502
    url_regex = r'\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/?)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\)){0,}(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s\!()\[\]{};:\'\"\.\,<>?«»“”‘’‘’]){0,})'

    warned = False
    cleaned = []
    # Build the replacement values row by row, then assign the whole column at
    # once. This avoids DataFrame.set_value (removed in pandas 1.0) and does not
    # assume that the index labels are 0..n-1.
    for tweet in tweet_df['text']:
        try:
            without_urls = re.sub(url_regex, "", tweet)
            cleaned.append([token.text for token in tokenizer(without_urls)])
        except TypeError:
            # Entry could not be processed as a string: keep it as it was and
            # report the problem only once.
            cleaned.append(tweet)
            if not warned:
                print("wrong data type detected in 'text' column")
                warned = True

    # dtype=object keeps every token list as a single cell value
    tweet_df['text'] = pd.Series(cleaned, index=tweet_df.index, dtype=object)
    return tweet_df

In [79]:
# tweet_cleaner rewrites the 'text' column of the frame it receives and returns
# that same frame; rebinding the name makes the data flow explicit.
unclassified_tweets = tweet_cleaner(unclassified_tweets)
unclassified_tweets.head()

Unnamed: 0,text
0,"[living, dream, cameraman, camera, cameracepti..."
1,"[justin, trudeau, reasons, thanksgiving, today..."
2,"[themadape, butt, butt, re, allergic, latex, m..."
3,"[massive, explosions, peace, march, turkey, 30..."
4,"[mulcair, suggests, there, bad, blood, between..."


In [80]:
# tweet_cleaner rewrites the 'text' column of the frame it receives and returns
# that same frame; rebinding the name makes the data flow explicit.
classified_tweets = tweet_cleaner(classified_tweets)
classified_tweets.head()

Unnamed: 0,class,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"[switchfoot, awww, bummer, shoulda, got, david..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,"[upset, he, update, his, facebook, texting, mi..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,"[kenichan, dived, many, times, ball, managed, ..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,"[my, whole, body, feels, itchy, like, its, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"[nationwideclass, no, behaving, all, mad, why,..."


## Exploratory Analysis

In [93]:
def assign_party(tweet_df):
    '''
    Assign a political party to each tweet in column 'text'.

    Each tweet is scored against three keyword lists (Liberal, Conservative, NDP);
    the score for a party is the number of distinct keywords of that party that
    occur in the tweet. The tweet is labelled with the party whose score is
    strictly greater than both other scores. Ties, including the case where no
    keyword matches at all, are labelled 'Other'.

    Input: dataframe with 'text' column populated with lists of words (tokens)
        to be categorized. Entries that cannot be turned into a set (e.g. NaN)
        are treated as having no tokens and are labelled 'Other'.
    Output: None. The dataframe is modified in place: a 'party' column holding
        'Liberal', 'Conservative', 'NDP' or 'Other' is added (or overwritten),
        and the number of tweets per label is printed.
    '''
    # Words associated with each political party (sets are built once, not per tweet)
    lib_words = frozenset([
        'justin','trudeau','justintrudeau', 'libéral', 'liberal', 'liberals','libs','lib', 'liberal_party',
        'lpc','ptlib','realchange','kathleen_wynne','mcguinty', 'yvonnejjones'])
    con_words = frozenset([
        'stephenharper','steven', 'harper','stevenharper','conservative', 'cpc', 'pmharper','pm','pierrepoilievre',
        'poilievre','conservatives','pcers','pc','pttory','justnotready','duffy',
        'primeminister','prime','minister', 'tory','canadianvalues','minjoeoliver',
        'premierministre','premierministreharper'])
    ndp_words = frozenset([
        'tom','mulcair','thomasmulcair', 'tommulcair', 'ndp', 'ndp_hq','ptndp','ndp2015','whatwouldjackdo',
        'leap','rablaney'])

    # Collect one label per row, in row order, and assign the column in one go.
    # This avoids DataFrame.set_value (removed in pandas 1.0) and does not assume
    # that the index labels are 0..n-1.
    parties = []
    for tweet in tweet_df['text']:
        try:
            tokens = set(tweet)
        except TypeError:
            # Not iterable / not hashable (e.g. NaN): nothing to match against
            tokens = set()

        # Score = number of distinct party keywords present in the tweet
        lib_score = len(tokens & lib_words)
        con_score = len(tokens & con_words)
        ndp_score = len(tokens & ndp_words)

        if lib_score > con_score and lib_score > ndp_score:
            parties.append('Liberal')
        elif con_score > lib_score and con_score > ndp_score:
            parties.append('Conservative')
        elif ndp_score > con_score and ndp_score > lib_score:
            parties.append('NDP')
        else:
            # No strict winner (tie or no keyword found)
            parties.append('Other')

    tweet_df['party'] = parties

    # Report how many tweets received each label (labels kept as originally printed)
    print('lib_count: ',parties.count('Liberal'))
    print('con_count: ',parties.count('Conservative'))
    print('npd_count: ',parties.count('NDP'))
    print('oth_count: ',parties.count('Other'))

In [94]:
# Label every unclassified tweet: adds a 'party' column to unclassified_tweets
# in place and prints the number of tweets assigned to each label.
assign_party(unclassified_tweets)

lib_count:  577
con_count:  627
npd_count:  303
oth_count:  1571


In [95]:
# Preview the first rows only instead of dumping the whole dataframe
unclassified_tweets.head(10)

Unnamed: 0,text,party
0,"[living, dream, cameraman, camera, cameracepti...",NDP
1,"[justin, trudeau, reasons, thanksgiving, today...",Liberal
2,"[themadape, butt, butt, re, allergic, latex, m...",Other
3,"[massive, explosions, peace, march, turkey, 30...",Other
4,"[mulcair, suggests, there, bad, blood, between...",NDP
5,"[polqc, se, sort, de, la, marde, avec, harper,...",Other
6,"[harper, gave, 8m, help, other, countries, get...",Conservative
7,"[tommy, taylor, added]",Other
8,"[justintrudeau, thomasmulcair]",Other
9,"[tracy, retweeted, tsec]",Other


## Model Preparation