# Lab-05: Visualizing tweets from the 2020 US presidential election

In [None]:
import os
import pandas as pd

# Dataset is a randomly sampled subset of: https://www.kaggle.com/manchunhui/us-election-2020-tweets
# Each CSV is loaded into its own DataFrame; later cells read the `tweet` column.
# lineterminator='\n' tells the parser to break the file into rows on "\n".
trump = pd.read_csv("2020_tweets_trump.csv", lineterminator='\n')
biden = pd.read_csv("2020_tweets_biden.csv", lineterminator='\n')

In [None]:
# Row counts of the two loaded DataFrames, shown as a (biden, trump) tuple.
len(biden), len(trump)

In [None]:
# M is the total number of tweets to keep; M//2 rows are drawn from each DataFrame.
# NOTE(review): no random_state is passed to sample(), so a different subset is
# presumably drawn on every run -- confirm whether reproducibility matters here.
M = 10000
trump = trump.sample(n=M//2)
biden = biden.sample(n=M//2)

In [None]:
# Pull the `tweet` column out of each sampled DataFrame as a plain Python list.
biden_tweets = biden['tweet'].tolist()
trump_tweets = trump['tweet'].tolist()

In [None]:
# Show one tweet (index 3023) before the preprocessing cells run; the same
# index is displayed again after the pipeline has been applied.
biden_tweets[3023]

# Preprocessing pipeline

In [None]:
import re
from tqdm import tqdm
from typing import List

import spacy
from spacy.language import Language

# Name under which the custom `preprocess` component is registered with spaCy
# (via @Language.component) and later added to the loaded pipeline.
pipeline_name = '2020ElectionTweets'


def camel_case_split(str):
    """Insert spaces at camel-case and letter/digit boundaries.

    Turns ``#Biden2020`` into ``Biden 2020`` and ``VoteBlue`` into
    ``Vote Blue``. Text that contains no such boundary (for example ordinary
    lowercase words) is returned unchanged, so the function is safe to apply
    to a whole sentence and not only to a single hashtag.

    Args:
        str: Text to split. The parameter name shadows the builtin; it is
            kept as-is so existing keyword callers continue to work.

    Returns:
        The text with a single space inserted at every detected boundary and
        with the leading ``#`` of hashtags removed.
    """
    # Drop the hashtag marker itself but keep the word that follows it.
    text = re.sub(r"#(?=\w)", "", str)
    # Zero-width boundaries: lower->Upper ("voteBlue"), the end of an acronym
    # followed by a capitalised word ("USAToday"), and letter<->digit
    # ("Biden2020"). Only spaces are inserted; no characters are discarded.
    return re.sub(
        r"(?<=[a-z])(?=[A-Z])"
        r"|(?<=[A-Z])(?=[A-Z][a-z])"
        r"|(?<=[A-Za-z])(?=[0-9])"
        r"|(?<=[0-9])(?=[A-Za-z])",
        " ",
        text,
    )


@Language.component(pipeline_name)
def preprocess(doc):
    """Reduce a tokenised document to a lowercase, space-joined string.

    Punctuation tokens are skipped, the remaining token texts are lowercased
    and stripped, and only texts of 1 to 12 characters are kept.

    NOTE(review): this returns a ``str`` rather than the incoming document
    object; spaCy components are normally expected to return the document,
    so confirm this works with the installed spaCy version.
    """
    kept = []
    for token in doc:
        if token.is_punct:
            continue
        # Stop-word filtering is intentionally disabled:
        # if token.is_stop: continue
        word = token.text.lower().strip()
        if 0 < len(word) <= 12:
            kept.append(word)
    return " ".join(kept)


class Pipeline:
    """Regex clean-up followed by a spaCy pipeline for raw tweet text.

    ``transform`` (also reachable by calling the instance) applies every
    ``(pattern, replacement)`` pair in ``replace`` in order, splits camel-case
    words, and finally runs the text through the spaCy pipeline loaded in
    ``__init__``.
    """

    # Email pattern from http://emailregex.com/.
    # It is assembled from adjacent string literals so that no newline or
    # indentation ends up inside the pattern: it is used without re.VERBOSE,
    # where such whitespace would have to match literally.
    email_re = (
        r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*"
        r'|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]'
        r'|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")'
        r"@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9]"
        r"(?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
        r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:"
        r"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"
    )

    # replace = [ (pattern-to-replace, replacement),  ...]
    # Applied top to bottom; the order matters (e.g. emails are removed before
    # the punctuation rule strips "@" and ".").
    replace = [
        ("<[^>]*>", " "),                          # Anything between < and > (markup tags)
        (email_re, " "),                           # Email addresses
        (r"(?<=\d),(?=\d)", ""),                   # Commas between digits (1,000 -> 1000)
        (r"\d+", " "),                             # Runs of digits become a space
        (r"[*\^\.$&@<>,\-/+{|}=?#:;'\"\[\]]", ""), # Punctuation and other junk
        (r"[\n\t\r]", " "),                        # Newlines, tabs, carriage returns
        (r"[^\x00-\x7F]+", ""),                    # Non-ASCII characters
        (r"\\+", " "),                             # Runs of one or more backslashes
        (r"\s+n\s+", " "),                         # Lone 'n' left over from an escaped \\n
        (r"\s+", " "),                             # Collapse whitespace runs
    ]

    def __init__(self):
        """Load the ``en_core_web_sm`` spaCy model and append the custom component."""
        self.pipeline = spacy.load('en_core_web_sm')
        self.pipeline.add_pipe(pipeline_name)

    def __call__(self, *args, **kwargs):
        """Forward to :meth:`transform` so the instance can be called directly."""
        return self.transform(*args, **kwargs)

    def transform(self, doc: str):
        """Clean one raw tweet and run it through the spaCy pipeline.

        Args:
            doc: Raw tweet text.

        Returns:
            Whatever the spaCy pipeline returns for the cleaned text, i.e. the
            output of its last component.
        """
        for pattern, replacement in self.replace:
            doc = re.sub(pattern, replacement, doc)
        # Camel-case splitting runs after the regex clean-up, so hashtag
        # markers and digits have already been removed at this point.
        doc = camel_case_split(doc)
        return self.pipeline(doc)
    
# Shared pipeline instance used by the preprocessing cell below.
pipeline = Pipeline()

In [None]:
from tqdm import tqdm

# Clean every tweet with the shared pipeline. Each list is rebuilt in one step
# rather than overwritten element by element, so an interrupted run never
# leaves a list half raw / half cleaned. The two lists are processed
# independently, so they need not have the same length, and each progress bar
# takes its total from the list it wraps.
biden_tweets = [pipeline(tweet) for tweet in tqdm(biden_tweets, desc="biden")]
trump_tweets = [pipeline(tweet) for tweet in tqdm(trump_tweets, desc="trump")]

In [None]:
# Same index as inspected before preprocessing, now showing the pipeline output.
biden_tweets[3023]

### Concatenate documents for vocab generation

In [None]:
# One combined list: all Biden tweets first, followed by all Trump tweets.
all_tweets = biden_tweets + trump_tweets

## (20 pts) Task I: Train a Doc2Vec model (using the Gensim package) on tweets from the 2020 US presidential election

*Docs*: 

* https://radimrehurek.com/gensim/models/doc2vec.html

*Useful tutorials*: 

* https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial 
* https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

### Hyperparameters

In [None]:
# Change as needed
K = 20                        # passed to Doc2Vec as vector_size
word_frequency_threshold = 2  # passed to Doc2Vec as min_count
epochs = 10                   # passed to Doc2Vec as epochs
lr = 0.01                     # NOTE(review): not used by any code visible in this notebook

In [None]:
from gensim.models.doc2vec import Doc2Vec

# Untrained Doc2Vec model configured from the hyperparameter cell.
# NOTE(review): `lr` from the hyperparameter cell is not passed here
# (presumably it would map to gensim's `alpha`) -- confirm whether that is intended.
model = Doc2Vec(vector_size=K, min_count=word_frequency_threshold, epochs=epochs)

In [None]:
# Your code goes here

## (10 pts) Task II: Evaluate your model by finding the documents (tweets) most similar to new (perhaps made-up) tweets

In [None]:
# Template function
def find_similar_tweets(tweet, top_n=10):
    """Return the training documents most similar to a new tweet.

    Args:
        tweet: Either raw tweet text (``str``) or an already tokenised tweet
            (an iterable of word strings).
        top_n: Number of most similar documents to return.

    Returns:
        The result of ``model.dv.most_similar`` for the inferred vector.
    """
    if isinstance(tweet, str):
        # infer_vector expects a list of tokens, not a raw string. Clean raw
        # text with the same pipeline used on the corpus, then split on
        # whitespace (the pipeline output is space-joined words).
        tokens = str(pipeline(tweet)).split()
    else:
        tokens = list(tweet)
    doc_vector = model.infer_vector(tokens)
    return model.dv.most_similar([doc_vector], topn=top_n)

In [None]:
# Your code goes here

## (10 pts extra credit) Task III: Produce a scatter plot of the compressed document embeddings (2D or 3D)

*Useful resources*:

* http://projector.tensorflow.org/

In [None]:
# Your code goes here