In [1]:
# !python3.6 -m pip install pysqlite3
# !python3.6 -m pip install textblob
# !python3.6 -m pip install GetOldTweets3
# !python3.6 -m pip install pandas
# !python3.6 -m pip install pyarrow

#!sudo apt-get install libsqlite3-dev
#!sudo apt-get install sqlite3

In [2]:
from textblob import TextBlob
from _sqlite3 import *
import re

def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing @mentions, links and
    special characters using a single regex substitution.

    Every match is replaced by a space; the text is then split on whitespace
    and re-joined, so the result is single-space separated with no leading or
    trailing blanks.
    '''
    # Raw string literal: the same pattern written as a plain string relies on
    # invalid escape sequences (\w, \/, \S), which emit a warning on modern
    # Python. Inside the character class, \t is still interpreted by the regex
    # engine as a tab, so the matching behaviour is unchanged.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return ' '.join(re.sub(pattern, " ", tweet).split())
    
def get_tweet_sentiment(tweet):
    '''
    Label a tweet as 'positive', 'neutral' or 'negative' according to the
    sign of the polarity score that TextBlob reports for its text.
    '''
    polarity = TextBlob(tweet).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    # Anything that is neither above nor equal to zero is treated as negative.
    return 'negative'

def get_tweet_tokens(tweet):
    '''
    Split a tweet on whitespace and return the resulting words as a list.
    '''
    return tweet.split()

In [3]:
# !python3.6 -m pip install nltk
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
import datetime
from datetime import timezone

In [4]:
import GetOldTweets3 as got
import pandas as pd
import os

In [5]:
def save_raw_analysis(df = None, year = 2016, month = 1 , day = 1, person = "Trump", isTop = "Top"):
    '''
    Save one day's per-tweet analysis frame as a gzip-compressed parquet file.

    The output path is ./data/raw_analysis/<person>-<year>-<month>-<day>-<isTop>.gzip.
    The saved frame carries four extra columns: year, month and day (as
    strings) and timestamp (midnight UTC of that date, in milliseconds since
    the epoch).

    Parameters:
        df:     pandas DataFrame to save; None or an empty frame is a no-op.
        year, month, day: calendar date the frame belongs to.
        person: first component of the output file name.
        isTop:  last component of the output file name.

    Returns None. Nothing is written when the target file already exists.
    The frame passed in is not modified.
    '''
    # None replaces a DataFrame default that was built once at definition time.
    if df is None or len(df) == 0:
        return

    token_folder = "./data/raw_analysis/"
    # makedirs also creates ./data when it is missing; a plain mkdir would fail.
    os.makedirs(token_folder, exist_ok=True)

    prefix = person + "-" + str(year) + "-" + str(month) + "-" + str(day) + "-" + isTop
    fn = token_folder + prefix + ".gzip"
    if os.path.exists(fn):
        print(str(datetime.datetime.now()) + "[System]: Existed " + prefix + ".(Anaysis)")
        return

    midnight_utc = datetime.datetime(year, month, day, tzinfo=timezone.utc)
    # assign() returns a new frame, so the caller's frame keeps its columns.
    annotated = df.assign(
        year=str(year),
        month=str(month),
        day=str(day),
        timestamp=midnight_utc.timestamp() * 1000,
    )

    annotated.to_parquet(fn, compression='gzip')
    print(str(datetime.datetime.now()) + "[System]: " + fn + " saved.(Anaysis)")

In [6]:
def save_tweet_tokens(tokens = None, year = 2016, month = 1 , day = 1, person = "Trump", isTop = "Top"):
    '''
    Count the tokens of one day's tweets and save the counts as a
    gzip-compressed parquet file.

    The output path is ./data/tokens/<person>-<year>-<month>-<day>-<isTop>.gzip.
    The saved frame has one row per distinct token with columns tokens,
    counts, year, month, day (strings) and timestamp (midnight UTC of the
    date, in milliseconds since the epoch), sorted by counts descending.

    Parameters:
        tokens: iterable of per-tweet token lists. When None or empty, the
                token lists are read from the 'tokens' column of the
                module-level frame `df`, which keeps existing callers that
                pass a placeholder working.
        year, month, day: calendar date the tokens belong to.
        person: first component of the output file name.
        isTop:  last component of the output file name.

    Returns None. Nothing is written when the target file already exists.
    '''
    token_folder = "./data/tokens/"
    # makedirs also creates ./data when it is missing; a plain mkdir would fail.
    os.makedirs(token_folder, exist_ok=True)

    prefix = person + "-" + str(year) + "-" + str(month) + "-" + str(day) + "-" + isTop
    fn = token_folder + prefix + ".gzip"
    if os.path.exists(fn):
        print(str(datetime.datetime.now()) + "[System]: Existed " + prefix + ".(Tokens)")
        return

    print(str(datetime.datetime.now()) + "[System]: Start processing " + prefix + ".(Tokens)")

    # Legacy fallback: with no tokens supplied, use the global frame.
    if tokens is None or len(tokens) == 0:
        tokens = df['tokens']

    # A set gives constant-time membership tests for every token.
    excluded_words = set(stopwords.words('english'))
    excluded_words.update(['donald', 'trump', 'hillary', 'clinton', 'i', 'the', 'is', 'he', 'a', 'via', 'amp', '0','1','2','3','4','5','6','7','8','9'])

    # Lower-case BEFORE filtering: the stop-word list is lower-case, so
    # capitalised stop words ("The", "And", ...) would otherwise slip through.
    lowered = (token.lower() for per_tweet in tokens for token in per_tweet)
    kept = [token for token in lowered if token not in excluded_words]

    df_tokens = pd.DataFrame(kept, columns=['tokens'])
    df_tokens = df_tokens.groupby(['tokens']).size().reset_index(name='counts')
    df_tokens = df_tokens.sort_values('counts', ascending=False)

    df_tokens["year"] = str(year)
    df_tokens["month"] = str(month)
    df_tokens["day"] = str(day)

    dt = datetime.datetime(year, month, day)
    timestamp = dt.replace(tzinfo=timezone.utc).timestamp()
    df_tokens["timestamp"] = timestamp * 1000

    # Write only after the date columns exist, so they are part of the file.
    df_tokens.to_parquet(fn, compression='gzip')

    print(str(datetime.datetime.now()) + "[System]: " + fn + " saved.(Tokens)")

In [7]:
def save_sentiment(df = None, year = 2016, month = 1 , day = 1, person = "Trump", isTop = "Top"):
    '''
    Summarise one day's sentiment labels and save the one-row summary as a
    gzip-compressed parquet file.

    The output path is ./data/sentiment/<person>-<year>-<month>-<day>-<isTop>.gzip.
    The saved row has the columns negative, neutral, positive (label counts
    taken from df['positive']), heats (sum of the three counts), p_ratio
    (positive / heats), year, month, day (strings) and timestamp (midnight
    UTC of the date, in milliseconds since the epoch).

    Parameters:
        df:     pandas DataFrame with a 'positive' column holding the labels
                'negative', 'neutral' or 'positive'; None or an empty frame
                is a no-op.
        year, month, day: calendar date the frame belongs to.
        person: first component of the output file name.
        isTop:  last component of the output file name.

    Returns None. Nothing is written when the target file already exists.
    '''
    # None replaces a DataFrame default that was built once at definition time.
    if df is None or len(df) == 0:
        return

    token_folder = "./data/sentiment/"
    # makedirs also creates ./data when it is missing; a plain mkdir would fail.
    os.makedirs(token_folder, exist_ok=True)

    # Check for an existing file first so no work is done for a skipped day.
    prefix = person + "-" + str(year) + "-" + str(month) + "-" + str(day) + "-" + isTop
    fn = token_folder + prefix + ".gzip"
    if os.path.exists(fn):
        print(str(datetime.datetime.now()) + "[System]: Existed " + prefix + ".(Sentiment)")
        return

    labels = ['negative', 'neutral', 'positive']
    # reindex guarantees one count per label, with 0 for a label that does not
    # occur that day; without it a missing label shifts or breaks the columns.
    counts = df.groupby(["positive"]).size().reindex(labels, fill_value=0)
    summary = pd.DataFrame([counts.tolist()], columns=labels)

    # Total tweet volume: every label counted exactly once.
    summary["heats"] = summary['negative'] + summary['neutral'] + summary['positive']
    summary["p_ratio"] = summary['positive'] / summary["heats"]

    summary["year"] = str(year)
    summary["month"] = str(month)
    summary["day"] = str(day)

    dt = datetime.datetime(year, month, day)
    timestamp = dt.replace(tzinfo=timezone.utc).timestamp()
    summary["timestamp"] = timestamp * 1000

    summary.to_parquet(fn, compression='gzip')
    print(str(datetime.datetime.now()) + "[System]: " + fn + " saved.(Sentiment)")



In [8]:
def read_parquet(month, day, year = 2016):
    '''
    Load the two parquet files for one date and return them concatenated.

    The files read are
        ./data/Donald-Trump/Donald-Trump-<year>-<month>-<day>.parquet.gzip
        ./data/Trump/Trump-<year>-<month>-<day>.parquet.gzip

    Parameters:
        month, day: date components used in the file names.
        year:       year component of the file names (defaults to 2016).

    Returns a DataFrame with the rows of the first file followed by the rows
    of the second, each keeping its own index. Returns an empty DataFrame
    unless BOTH files exist.
    '''
    date_suffix = str(year) + "-" + str(month) + "-" + str(day) + ".parquet.gzip"
    fn1 = "./data/Donald-Trump/Donald-Trump-" + date_suffix
    fn2 = "./data/Trump/Trump-" + date_suffix

    if not (os.path.exists(fn1) and os.path.exists(fn2)):
        return pd.DataFrame()

    return pd.concat([pd.read_parquet(fn1), pd.read_parquet(fn2)])

In [9]:
# Process every (month, day) slot from August through November. Slots with no
# input files, including impossible dates such as 9-31, load as an empty frame
# and are skipped.
for month in range(8, 12):
    for day in range(1, 32):
        # Keep this frame bound to the module-level name `df`:
        # save_tweet_tokens may read it from the global namespace.
        df = read_parquet(month, day)
        if len(df) == 0:
            continue

        df['text']     = df['text'].apply(clean_tweet)
        df['positive'] = df['text'].apply(get_tweet_sentiment)
        df['tokens']   = df['text'].apply(get_tweet_tokens)

        save_raw_analysis(df = df, month = month, day = day)
        # Hand over the real per-tweet token lists instead of an empty placeholder.
        save_tweet_tokens(tokens = df['tokens'].tolist(), month = month, day = day)
        save_sentiment(df = df, month = month, day = day)

2020-02-26 02:14:11.927759[System]: ./data/raw_analysis/Trump-2016-8-1-Top.gzip saved.(Anaysis)
2020-02-26 02:14:11.928092[System]: Start processing Trump-2016-8-1-Top.(Tokens)
2020-02-26 02:14:12.241020[System]: ./data/tokens/Trump-2016-8-1-Top.gzip saved.(Tokens)
2020-02-26 02:14:12.253008[System]: ./data/sentiment/Trump-2016-8-1-Top.gzip saved.(Sentiment)
2020-02-26 02:14:17.396375[System]: ./data/raw_analysis/Trump-2016-8-2-Top.gzip saved.(Anaysis)
2020-02-26 02:14:17.396743[System]: Start processing Trump-2016-8-2-Top.(Tokens)
2020-02-26 02:14:17.744233[System]: ./data/tokens/Trump-2016-8-2-Top.gzip saved.(Tokens)
2020-02-26 02:14:17.756706[System]: ./data/sentiment/Trump-2016-8-2-Top.gzip saved.(Sentiment)
2020-02-26 02:14:22.441817[System]: ./data/raw_analysis/Trump-2016-8-3-Top.gzip saved.(Anaysis)
2020-02-26 02:14:22.442067[System]: Start processing Trump-2016-8-3-Top.(Tokens)
2020-02-26 02:14:22.763994[System]: ./data/tokens/Trump-2016-8-3-Top.gzip saved.(Tokens)
2020-02-26 0

2020-02-26 02:15:28.875228[System]: ./data/raw_analysis/Trump-2016-8-24-Top.gzip saved.(Anaysis)
2020-02-26 02:15:28.875569[System]: Start processing Trump-2016-8-24-Top.(Tokens)
2020-02-26 02:15:29.045083[System]: ./data/tokens/Trump-2016-8-24-Top.gzip saved.(Tokens)
2020-02-26 02:15:29.053623[System]: ./data/sentiment/Trump-2016-8-24-Top.gzip saved.(Sentiment)
2020-02-26 02:15:32.154729[System]: ./data/raw_analysis/Trump-2016-8-25-Top.gzip saved.(Anaysis)
2020-02-26 02:15:32.155043[System]: Start processing Trump-2016-8-25-Top.(Tokens)
2020-02-26 02:15:32.374035[System]: ./data/tokens/Trump-2016-8-25-Top.gzip saved.(Tokens)
2020-02-26 02:15:32.383996[System]: ./data/sentiment/Trump-2016-8-25-Top.gzip saved.(Sentiment)
2020-02-26 02:15:35.062229[System]: ./data/raw_analysis/Trump-2016-8-26-Top.gzip saved.(Anaysis)
2020-02-26 02:15:35.062598[System]: Start processing Trump-2016-8-26-Top.(Tokens)
2020-02-26 02:15:35.251237[System]: ./data/tokens/Trump-2016-8-26-Top.gzip saved.(Tokens)
2

2020-02-26 02:16:32.708440[System]: ./data/raw_analysis/Trump-2016-9-17-Top.gzip saved.(Anaysis)
2020-02-26 02:16:32.708758[System]: Start processing Trump-2016-9-17-Top.(Tokens)
2020-02-26 02:16:32.908012[System]: ./data/tokens/Trump-2016-9-17-Top.gzip saved.(Tokens)
2020-02-26 02:16:32.918575[System]: ./data/sentiment/Trump-2016-9-17-Top.gzip saved.(Sentiment)
2020-02-26 02:16:35.009940[System]: ./data/raw_analysis/Trump-2016-9-18-Top.gzip saved.(Anaysis)
2020-02-26 02:16:35.010323[System]: Start processing Trump-2016-9-18-Top.(Tokens)
2020-02-26 02:16:35.174339[System]: ./data/tokens/Trump-2016-9-18-Top.gzip saved.(Tokens)
2020-02-26 02:16:35.182624[System]: ./data/sentiment/Trump-2016-9-18-Top.gzip saved.(Sentiment)
2020-02-26 02:16:37.378111[System]: ./data/raw_analysis/Trump-2016-9-19-Top.gzip saved.(Anaysis)
2020-02-26 02:16:37.378433[System]: Start processing Trump-2016-9-19-Top.(Tokens)
2020-02-26 02:16:37.548186[System]: ./data/tokens/Trump-2016-9-19-Top.gzip saved.(Tokens)
2

2020-02-26 02:18:30.324699[System]: Existed Trump-2016-10-16-Top.(Anaysis)
2020-02-26 02:18:30.324802[System]: Start processing Trump-2016-10-16-Top.(Tokens)
2020-02-26 02:18:30.673536[System]: ./data/tokens/Trump-2016-10-16-Top.gzip saved.(Tokens)
2020-02-26 02:18:30.685210[System]: ./data/sentiment/Trump-2016-10-16-Top.gzip saved.(Sentiment)
2020-02-26 02:18:35.731959[System]: ./data/raw_analysis/Trump-2016-10-17-Top.gzip saved.(Anaysis)
2020-02-26 02:18:35.732278[System]: Start processing Trump-2016-10-17-Top.(Tokens)
2020-02-26 02:18:36.082117[System]: ./data/tokens/Trump-2016-10-17-Top.gzip saved.(Tokens)
2020-02-26 02:18:36.094319[System]: ./data/sentiment/Trump-2016-10-17-Top.gzip saved.(Sentiment)
2020-02-26 02:18:41.086735[System]: ./data/raw_analysis/Trump-2016-10-18-Top.gzip saved.(Anaysis)
2020-02-26 02:18:41.087026[System]: Start processing Trump-2016-10-18-Top.(Tokens)
2020-02-26 02:18:41.433427[System]: ./data/tokens/Trump-2016-10-18-Top.gzip saved.(Tokens)
2020-02-26 02

# Appendix
[1] **Twitter Sentiment Analysis using Python**, *https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/*