# Tweet Preprocessing

In [None]:
# libs
import sys
import nltk
import csv
import time
import sys

import re
import string

from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords 


from tqdm import tqdm_notebook as tqdm

# Pytorch Dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# nltk.download("punkt")
# nltk.download("stopwords")
# !pip install --user tweet-preprocessor

## Helper Functions

In [None]:
def file_len(fname):
    """Count the lines of a text file, print the count and return it.

    Args:
        fname: Path of the file to inspect (opened in text mode with the
            platform's default encoding).

    Returns:
        The number of lines in the file; 0 when the file is empty.
    """
    # Stream the file so that it is never loaded into memory as a whole.
    # Summing over the iterator (rather than reading a loop index after the
    # loop) keeps the result well-defined for an empty file.
    with open(fname) as f:
        nbrOfLines = sum(1 for _ in f)

    print("Nbr of lines : " + str(nbrOfLines))

    return nbrOfLines

## Tweet Preprocessor

In [None]:
def hasNumbers(inputString):
    """Tell whether ``inputString`` contains at least one digit character.

    A character counts as a digit when ``str.isdigit`` says so. An empty
    string yields ``False``.
    """
    for character in inputString:
        if character.isdigit():
            return True
    return False

# Regular expression matching one or more consecutive characters taken from
# the emoji-related code-point ranges listed below.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"  # very wide range: U+24C2 up to U+1F251
    "]+",
    flags=re.UNICODE,
)

# Text emoticons expressing a negative mood.
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Text emoticons expressing a positive mood.
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Every known emoticon, whatever its mood.
emoticons = emoticons_happy | emoticons_sad

# Lazily-built cache of the NLTK English stop words (see _english_stop_words).
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_tweets(tweet):
    """Strip a tweet down to its meaningful word tokens.

    The text goes through the following steps, in order:

    1. URLs (``www.…`` or ``http(s)://…``) are removed.
    2. Every ``:`` is removed, as is the literal sequence ``‚Ä¶``.
    3. Each run of non-ASCII characters is replaced by a single space.
    4. Anything matched by ``emoji_pattern`` is removed.
    5. The text is tokenized with NLTK's ``word_tokenize`` and tokens that are
       stop words, known emoticons, contained in ``string.punctuation`` or
       that hold a digit are dropped.

    Args:
        tweet: Raw text of the tweet (``str``).

    Returns:
        The surviving tokens joined by single spaces, with surrounding
        whitespace stripped. May be an empty string.
    """
    # The stop-word list never changes between calls; fetch the cached set
    # instead of re-reading it from the NLTK corpus for every tweet.
    stop_words = _english_stop_words()

    # Remove URLs. Raw string: "\." and "\s" are regex escapes, not string
    # escapes, and are rejected with a warning in non-raw literals.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
    # NOTE(review): colons are stripped before tokenizing, so colon-based
    # emoticons can no longer match the `emoticons` lookup further down.
    tweet = re.sub(r':', '', tweet)
    # Presumably a mis-decoded ellipsis; the literal is kept unchanged.
    tweet = re.sub(r'‚Ä¶', '', tweet)

    # Replace consecutive non-ASCII characters with a space.
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)
    # Remove emojis from the tweet (only ASCII is left at this point).
    tweet = emoji_pattern.sub(r'', tweet)

    # Tokenize with NLTK, then keep only the tokens that pass every filter.
    # `w not in string.punctuation` is a substring test against the
    # punctuation string, so it mainly drops single punctuation characters.
    filtered_tweet = [
        w for w in word_tokenize(tweet)
        if w not in stop_words
        and w not in emoticons
        and w not in string.punctuation
        and not hasNumbers(w)
    ]

    # Join tokens back into one cleaned-up string.
    return ' '.join(filtered_tweet).strip()

## Preprocessed

In [None]:
labels = ['democrat','republican','neutral']
features = ['text','description']

# We have the pro-democrat tweets and pro-republican tweets.
# Each of them is composed of tweets from the candidates and party twitter accounts.
democratic_path = "databases/democratic/democratic.csv"
republican_path = "databases/republican/republican.csv"

# We want to merge the two files, keeping only the needed features & labels.
output_path = "databases/sources.csv"

# (label, input file) pairs, merged into the output in this order.
labelled_sources = [
    ("democrat", democratic_path),
    ("republican", republican_path),
]

# Creation of the file that will contain the cleaned, labelled tweets.
with open(output_path, 'w', newline='', encoding="utf-8") as output_file:

    # csv.writer escapes embedded quotes; QUOTE_ALL and "\n" keep the same
    # `"a","b","c"` one-row-per-line layout as before.
    writer = csv.writer(output_file, quoting=csv.QUOTE_ALL, lineterminator='\n')

    # The header is written exactly once, at the top of the file. Writing one
    # per source would leave a header line in the middle of the data.
    writer.writerow(["label", "text", "description"])

    for label, source_path in labelled_sources:

        # NOTE(review): inputs are decoded as latin-1 — confirm that this
        # matches the actual encoding of the source CSV files.
        with open(source_path, 'r', newline='', encoding="latin-1") as input_file:

            # init reader
            reader = csv.reader(input_file, quotechar='"', delimiter=',')

            # Take the header of the file + the index of the useful columns
            header = next(reader)
            ind_text = header.index('text')
            ind_description = header.index('description')

            # Go through the input file
            for row in reader:

                # Preprocess the content
                text = clean_tweets(row[ind_text])
                description = clean_tweets(row[ind_description])

                # Write row
                writer.writerow([label, text, description])
            

## Stemmer

In [None]:
from nltk.stem.snowball import SnowballStemmer


class Stemmer:
    """Thin wrapper around NLTK's English ``SnowballStemmer``."""

    def __init__(self):
        # English Snowball stemmer, created with ignore_stopwords=True.
        self.stemmer = SnowballStemmer("english", ignore_stopwords=True)

    def stem(self, tokens):
        """Stem every token of an iterable.

        tokens: an iterable of token strings
        returns: a list with the stemmed form of each token, in input order
        """
        stem_one = self.stemmer.stem
        return list(map(stem_one, tokens))

## Stemmed

In [None]:
input_path = "databases/sources.csv"
output_path = "databases/stemmed.csv"

# Translation table deleting every character of string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)

# Creation of the file that will contain the stemmed tweets:
with open(output_path, 'w', newline='', encoding="utf-8") as output_file:
    with open(input_path, 'r', newline='', encoding="utf-8") as input_file:

        # init reader
        reader = csv.reader(input_file, quotechar='"', delimiter=',')

        # csv.writer takes care of quoting/escaping; QUOTE_ALL and "\n" keep
        # the `"label","text"` one-row-per-line layout.
        writer = csv.writer(output_file, quoting=csv.QUOTE_ALL, lineterminator='\n')

        # Take the header of the file + the index of the useful columns
        header = next(reader)
        ind_label = header.index('label')
        ind_text = header.index('text')

        # Write headers
        writer.writerow(["label", "text"])

        # Init stemmer
        stemmer = Stemmer()

        # Go through input file
        for row in reader:

            # The input may hold a repeated header line in the middle of the
            # file (one per merged source); it is not a tweet, so skip it.
            if row == header:
                continue

            # Get the content
            label = row[ind_label]
            text = row[ind_text]

            # Make sure there is no punctuation
            text = text.translate(punctuation_table)

            # Tokenize
            text_tokens = word_tokenize(text)

            # Stem, then join the stems back into a single string
            stems = " ".join(stemmer.stem(text_tokens))

            # Write row
            writer.writerow([label, stems])
            