In [1]:
# Imports
from collections import Counter
import re
import string
import pandas as pd
import requests
import json
import squarify
# import matplotlib.pyplot as plt
import seaborn as sns
import emoji
import spacy
from spacy.tokenizer import Tokenizer

# Gensim stopwords
from gensim.parsing.preprocessing import STOPWORDS as SW

# Wordclouds stopwords
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)

# Establish the English Core Web
nlp = spacy.load('en_core_web_lg')

### Load the data and go from the original tweet to a tweet with no Emoji's and no url's

In [2]:
# Bring in the JSON
url = 'https://raw.githubusercontent.com/jacobpad/Labs-Stuff/master/dutchbros_followers.json'
r = requests.get(url)
df = r.json()

# Simple formating on the JSON
df = pd.DataFrame(df.values())
df = df.rename(columns={0:'original_tweet'})

# Make emoji free text
# Source: https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    return clean_text

# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)

# Apply `call_emoji_free` which calls the function to remove all emoji's
df['emoji_free_tweet'] = df['original_tweet'].apply(call_emoji_free)


# Removing url's
def remove_url(text):
    """
    Remove URL's
    Accepts:
        emoji_free_tweet
    Returns:
        emoji_free_tweet & url_free_tweet
    Makes a new column
    """
    # https://www.youtube.com/watch?v=O2onA4r5UaY
    pattern = r"http\S+"
    tokens = re.sub(pattern, "", text)
    return tokens

# Make new url_free_tweet column by applying the function on emoji_free_tweet
df['url_free_tweet'] = df['emoji_free_tweet'].apply(remove_url)

# View the dataframe with 3 columns - original_tweet, emoji_free_tweet, url_free_tweet. 
# url_free_tweet is also emoji free
df

Unnamed: 0,original_tweet,emoji_free_tweet,url_free_tweet
0,Flynn was fucking railroaded. https://t.co/4aZ...,Flynn was fucking railroaded. https://t.co/4aZ...,Flynn was fucking railroaded.
1,President @realDonaldTrump talks about Opening...,President @realDonaldTrump talks about Opening...,President @realDonaldTrump talks about Opening...
2,"#Obamagate Comey magic: Flynn calls ""turned up...","#Obamagate Comey magic: Flynn calls ""turned up...","#Obamagate Comey magic: Flynn calls ""turned up..."
3,Matt Gaetz predicts President Trump will pardo...,Matt Gaetz predicts President Trump will pardo...,Matt Gaetz predicts President Trump will pardo...
4,We're in the 21st century Joe... https://t.co/...,We're in the 21st century Joe... https://t.co/...,We're in the 21st century Joe...
...,...,...,...
18518,After a great phone call with the @ASUFootball...,After a great phone call with the @ASUFootball...,After a great phone call with the @ASUFootball...
18519,After a great zoom meeting with the @ASUFootba...,After a great zoom meeting with the @ASUFootba...,After a great zoom meeting with the @ASUFootba...
18520,Morning 💗 https://t.co/uu7MhenMOf,Morning https://t.co/uu7MhenMOf,Morning
18521,Always be careful who you open up to. Only a f...,Always be careful who you open up to. Only a f...,Always be careful who you open up to. Only a f...


### Tokenize `url_free_tweet`

In [3]:
"""
Import Gensim and Wordcloud to use their stopwords as well and use the combined stopwords of ALL as the variable:
ALL_STOP_WORDS
"""

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)


# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)



tokens = []

for doc in tokenizer.pipe(df['url_free_tweet'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

# View df
df

Unnamed: 0,original_tweet,emoji_free_tweet,url_free_tweet,tokens
0,Flynn was fucking railroaded. https://t.co/4aZ...,Flynn was fucking railroaded. https://t.co/4aZ...,Flynn was fucking railroaded.,"[flynn, fucking, railroaded.]"
1,President @realDonaldTrump talks about Opening...,President @realDonaldTrump talks about Opening...,President @realDonaldTrump talks about Opening...,"[president, @realdonaldtrump, talks, opening, ..."
2,"#Obamagate Comey magic: Flynn calls ""turned up...","#Obamagate Comey magic: Flynn calls ""turned up...","#Obamagate Comey magic: Flynn calls ""turned up...","[#obamagate, comey, magic:, flynn, calls, ""tur..."
3,Matt Gaetz predicts President Trump will pardo...,Matt Gaetz predicts President Trump will pardo...,Matt Gaetz predicts President Trump will pardo...,"[matt, gaetz, predicts, president, trump, pard..."
4,We're in the 21st century Joe... https://t.co/...,We're in the 21st century Joe... https://t.co/...,We're in the 21st century Joe...,"[we're, 21st, century, joe...]"
...,...,...,...,...
18518,After a great phone call with the @ASUFootball...,After a great phone call with the @ASUFootball...,After a great phone call with the @ASUFootball...,"[great, phone, @asufootball, coach’s, humbled,..."
18519,After a great zoom meeting with the @ASUFootba...,After a great zoom meeting with the @ASUFootba...,After a great zoom meeting with the @ASUFootba...,"[great, zoom, meeting, @asufootball, staff,, e..."
18520,Morning 💗 https://t.co/uu7MhenMOf,Morning https://t.co/uu7MhenMOf,Morning,[morning]
18521,Always be careful who you open up to. Only a f...,Always be careful who you open up to. Only a f...,Always be careful who you open up to. Only a f...,"[careful, open, to., actually, care,, curious.]"


### Lemmatization

In [4]:
# Make tokens a string again
# credit : https://stackoverflow.com/questions/45306988/column-of-lists-convert-list-to-string-as-a-new-column
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
# df[['original_tweet', 'lemmas_back_to_text']]

### Tokenize the lemmetized text to see what changes

In [5]:
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    pattern = r"http\S+"
    
    tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
    tokens = re.sub('[^a-zA-Z 0-9]', '', text)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
    tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
    tokens = tokens.strip(',') # TESTING THIS LINE
    tokens = tokens.strip('?') # TESTING THIS LINE
    tokens = tokens.strip('!') # TESTING THIS LINE
    tokens = tokens.strip("'") # TESTING THIS LINE
    tokens = tokens.strip(".") # TESTING THIS LINE

    tokens = tokens.lower().split() # Make text lowercase and split it
    
    return tokens

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

# View those tokens (the 4th column)
df

Unnamed: 0,original_tweet,emoji_free_tweet,url_free_tweet,tokens,tokens_back_to_text,lemmas,lemmas_back_to_text,lemma_tokens
0,Flynn was fucking railroaded. https://t.co/4aZ...,Flynn was fucking railroaded. https://t.co/4aZ...,Flynn was fucking railroaded.,"[flynn, fucking, railroaded.]",flynn fucking railroaded.,"[flynn, fucking, railroaded]",flynn fucking railroaded,"[flynn, fucking, railroaded]"
1,President @realDonaldTrump talks about Opening...,President @realDonaldTrump talks about Opening...,President @realDonaldTrump talks about Opening...,"[president, @realdonaldtrump, talks, opening, ...",president @realdonaldtrump talks opening ameri...,"[president, @realdonaldtrump, talks, opening, ...",president @realdonaldtrump talks opening america,"[president, @realdonaldtrump, talks, opening, ..."
2,"#Obamagate Comey magic: Flynn calls ""turned up...","#Obamagate Comey magic: Flynn calls ""turned up...","#Obamagate Comey magic: Flynn calls ""turned up...","[#obamagate, comey, magic:, flynn, calls, ""tur...","#obamagate comey magic: flynn calls ""turned up...","[obamagate, comey, magic, flynn, call, turn, f...",obamagate comey magic flynn call turn find ova...,"[obamagate, comey, magic, flynn, call, turn, f..."
3,Matt Gaetz predicts President Trump will pardo...,Matt Gaetz predicts President Trump will pardo...,Matt Gaetz predicts President Trump will pardo...,"[matt, gaetz, predicts, president, trump, pard...",matt gaetz predicts president trump pardon rog...,"[matt, gaetz, predict, president, trump, pardo...",matt gaetz predict president trump pardon roge...,"[matt, gaetz, predict, president, trump, pardo..."
4,We're in the 21st century Joe... https://t.co/...,We're in the 21st century Joe... https://t.co/...,We're in the 21st century Joe...,"[we're, 21st, century, joe...]",we're 21st century joe...,"[21st, century, joe]",21st century joe,"[century, joe]"
...,...,...,...,...,...,...,...,...
18518,After a great phone call with the @ASUFootball...,After a great phone call with the @ASUFootball...,After a great phone call with the @ASUFootball...,"[great, phone, @asufootball, coach’s, humbled,...",great phone @asufootball coach’s humbled bless...,"[great, phone, @asufootball, coach, humble, bl...",great phone @asufootball coach humble blessed ...,"[great, phone, @asufootball, coach, humble, bl..."
18519,After a great zoom meeting with the @ASUFootba...,After a great zoom meeting with the @ASUFootba...,After a great zoom meeting with the @ASUFootba...,"[great, zoom, meeting, @asufootball, staff,, e...","great zoom meeting @asufootball staff, extreme...","[great, zoom, meeting, @asufootball, staff, ex...",great zoom meeting @asufootball staff extremel...,"[great, zoom, meeting, @asufootball, staff, ex..."
18520,Morning 💗 https://t.co/uu7MhenMOf,Morning https://t.co/uu7MhenMOf,Morning,[morning],morning,[morning],morning,[morning]
18521,Always be careful who you open up to. Only a f...,Always be careful who you open up to. Only a f...,Always be careful who you open up to. Only a f...,"[careful, open, to., actually, care,, curious.]","careful open to. actually care, curious.","[careful, open, actually, care, curious]",careful open actually care curious,"[careful, open, actually, care, curious]"


### Common words

In [6]:
# Count Function to count tokens
def count(docs):
    word_counts = Counter()
    appears_in = Counter()
        
    total_docs = len(docs)

    for doc in docs:
        word_counts.update(doc)
        appears_in.update(set(doc))

    temp = zip(word_counts.keys(), word_counts.values())
        
    wc = pd.DataFrame(temp, columns = ['word', 'count'])

    wc['rank'] = wc['count'].rank(method='first', ascending=False)
    total = wc['count'].sum()

    wc['pct_total'] = wc['count'].apply(lambda x: x / total)
        
    wc = wc.sort_values(by='rank')
    wc['cul_pct_total'] = wc['pct_total'].cumsum()

    t2 = zip(appears_in.keys(), appears_in.values())
    ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
    wc = ac.merge(wc, on='word')

    wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)
        
    return wc.sort_values(by='rank')

# The object `Counter` takes an iterable, but you can instaniate an empty one and update it. 
word_counts = Counter()


wc = count(df['lemmas'])
wc.head(20)
tokens = []

for doc in tokenizer.pipe(df['lemmas_back_to_text'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)
    
df['lemmas'] = tokens

wc = count(df['lemmas'])
wc.head()

Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
188,people,910,1033,1.0,0.006728,0.006728,0.049128
283,good,857,928,2.0,0.006044,0.012772,0.046267
47,time,780,844,3.0,0.005497,0.018269,0.04211
267,love,751,830,4.0,0.005406,0.023674,0.040544
276,know,729,765,5.0,0.004982,0.028657,0.039356
