In [1]:
# !python3.6 -m pip install pysqlite3
# !python3.6 -m pip install textblob
# !python3.6 -m pip install GetOldTweets3
# !python3.6 -m pip install pandas
# !python3.6 -m pip install pyarrow

#!sudo apt-get install libsqlite3-dev
#!sudo apt-get install sqlite3

In [2]:
from textblob import TextBlob
from _sqlite3 import *
import re

def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing @mentions, links and
    special characters using a single regex substitution.

    Parameters:
        tweet (str): raw tweet text.

    Returns:
        str: the text with every match replaced by a space and all runs of
        whitespace collapsed to single spaces (no leading/trailing space).
    '''
    # Raw string: the previous non-raw literal relied on "\w", "\/" and "\S",
    # which are invalid *string* escapes and trigger Deprecation/SyntaxWarning.
    # The regex itself is unchanged ("\t" is still a tab inside the class).
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return ' '.join(re.sub(pattern, " ", tweet).split())
    
def get_tweet_sentiment(tweet):
    '''
    Classify the sentiment of the passed tweet text from the sign of the
    polarity reported by TextBlob.

    Returns 'positive' when the polarity is greater than zero, 'neutral'
    when it is exactly zero, and 'negative' otherwise.
    '''
    # Build the TextBlob once and read its polarity score a single time.
    polarity = TextBlob(tweet).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    return 'negative'

def get_tweet_tokens(tweet):
    '''
    Split tweet text on runs of whitespace and return the pieces as a list.
    '''
    return tweet.split()

In [3]:
# !python3.6 -m pip install nltk
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
import datetime

In [4]:
import GetOldTweets3 as got
import pandas as pd
import os

In [5]:
def save_raw_analysis(df = pd.DataFrame(), year = 2016, month = 1 , day = 1, person = "Trump", isTop = "Top"):
    '''
    Persist the per-tweet analysis frame for one day as a gzip-compressed
    parquet file under ./data/raw_analysis/.

    Parameters:
        df: frame to store; nothing is done when it has no rows.
        year, month, day: date components used in the file name.
        person, isTop: further file-name components.

    The file is named "<person>-<year>-<month>-<day>-<isTop>.gzip". An
    already existing file is left untouched and only reported.
    Returns None.
    '''
    if len(df) == 0:
        return

    analysis_folder = "./data/raw_analysis/"
    # makedirs also creates a missing "./data" parent and tolerates an
    # already existing folder, which a bare os.mkdir() does not.
    os.makedirs(analysis_folder, exist_ok=True)

    prefix = "-".join([person, str(year), str(month), str(day), isTop])
    fn = analysis_folder + prefix + ".gzip"
    if os.path.exists(fn):
        print(str(datetime.datetime.now()) + "[System]: Existed " + prefix + ".(Anaysis)")
        return

    df.to_parquet(fn, compression='gzip')
    print(str(datetime.datetime.now()) + "[System]: " + fn + " saved.(Anaysis)")

In [6]:
def save_tweet_tokens(tokens = None, year = 2016, month = 1 , day = 1, person = "Trump", isTop = "Top"):
    '''
    Count token frequencies for one day and store them as a gzip-compressed
    parquet file under ./data/tokens/.

    Parameters:
        tokens: iterable of per-tweet token lists (for example the 'tokens'
            column of the analysis frame); plain strings are accepted as
            single tokens. When None or empty, the function falls back to the
            notebook-level `df['tokens']` so that legacy calls passing
            `tokens=[]` keep working.
        year, month, day: date components used in the file name.
        person, isTop: further file-name components.

    The stored frame has the columns 'tokens' and 'counts', sorted by
    descending count. An already existing file is left untouched and only
    reported. Returns None.
    '''
    token_folder = "./data/tokens/"
    # makedirs also creates a missing "./data" parent and tolerates an
    # already existing folder, which a bare os.mkdir() does not.
    os.makedirs(token_folder, exist_ok=True)

    prefix = person + "-" + str(year) + "-" + str(month) + "-" + str(day) + "-" + isTop
    fn = token_folder + prefix + ".gzip"
    if os.path.exists(fn):
        print(str(datetime.datetime.now()) + "[System]: Existed " + prefix + ".(Tokens)")
        return

    print(str(datetime.datetime.now()) + "[System]: Start processing " + prefix + ".(Tokens)")

    if tokens is None or len(tokens) == 0:
        # Deprecated fallback: earlier versions ignored `tokens` and always
        # read the notebook-global `df`; prefer passing the tokens explicitly.
        tokens = list(df['tokens'])

    # One set for O(1) membership tests: NLTK's English stop words plus the
    # notebook-specific extras (candidate names, twitter noise, digits).
    excluded = set(stopwords.words('english'))
    excluded.update(['donald', 'trump', 'hillary', 'clinton', 'i', 'the', 'is', 'he', 'a', 'via', 'amp',
                     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])

    # Lower-case BEFORE filtering so capitalised forms ("The", "And", ...) are
    # compared against the lowercase exclusion entries as well.
    lowered = (
        token.lower()
        for entry in tokens
        for token in ([entry] if isinstance(entry, str) else entry)
    )
    kept = [token for token in lowered if token not in excluded]

    df_tokens = pd.DataFrame(kept, columns=['tokens'])
    df_tokens = df_tokens.groupby(['tokens']).size().reset_index(name='counts')
    df_tokens = df_tokens.sort_values('counts', ascending=False)
    df_tokens.to_parquet(fn, compression='gzip')
    print(str(datetime.datetime.now()) + "[System]: " + fn + " saved.(Tokens)")

In [7]:
def save_sentiment(df = pd.DataFrame(), year = 2016, month = 1 , day = 1, person = "Trump", isTop = "Top"):
    '''
    Summarise the sentiment labels of one day into a single-row frame and
    store it as a gzip-compressed parquet file under ./data/sentiment/.

    Parameters:
        df: frame whose 'positive' column holds the labels 'negative',
            'neutral' or 'positive'; nothing is done when it has no rows.
        year, month, day: date components, stored as strings in the row and
            used in the file name.
        person, isTop: further file-name components.

    The stored row has the columns 'negative', 'neutral', 'positive' (label
    counts) followed by 'year', 'month', 'day'. An already existing file is
    left untouched and only reported. Returns None.
    '''
    if len(df) == 0:
        return

    sentiment_folder = "./data/sentiment/"
    # makedirs also creates a missing "./data" parent and tolerates an
    # already existing folder, which a bare os.mkdir() does not.
    os.makedirs(sentiment_folder, exist_ok=True)

    prefix = person + "-" + str(year) + "-" + str(month) + "-" + str(day) + "-" + isTop
    fn = sentiment_folder + prefix + ".gzip"
    # Check for the file first so the summary is not computed needlessly.
    if os.path.exists(fn):
        print(str(datetime.datetime.now()) + "[System]: Existed " + prefix + ".(Sentiment)")
        return

    labels = ['negative', 'neutral', 'positive']
    # Align counts to the labels by name; a label that does not occur on this
    # day becomes 0 instead of breaking the three-column layout.
    counts = df['positive'].value_counts().reindex(labels, fill_value=0)
    summary = pd.DataFrame([counts.tolist()], columns=labels)
    summary["year"] = str(year)
    summary["month"] = str(month)
    summary["day"] = str(day)

    summary.to_parquet(fn, compression='gzip')
    print(str(datetime.datetime.now()) + "[System]: " + fn + " saved.(Sentiment)")



In [8]:
def read_parquet(month, day, year = 2016):
    '''
    Load the stored tweets of one day for both search terms and return them
    as one concatenated frame.

    Parameters:
        month, day: date components used in the file names.
        year: year component of the file names (defaults to 2016).

    Returns:
        The concatenation of ./data/Donald-Trump/Donald-Trump-<date>.parquet.gzip
        and ./data/Trump/Trump-<date>.parquet.gzip, or an empty DataFrame
        when either file is missing.
    '''
    date_part = str(year) + "-" + str(month) + "-" + str(day)
    fn1 = "./data/Donald-Trump/Donald-Trump-" + date_part + ".parquet.gzip"
    fn2 = "./data/Trump/Trump-" + date_part + ".parquet.gzip"

    # NOTE(review): a day is skipped unless BOTH files exist; confirm that
    # dropping days with only one of the two files is intended.
    if not (os.path.exists(fn1) and os.path.exists(fn2)):
        return pd.DataFrame()

    return pd.concat([pd.read_parquet(fn1), pd.read_parquet(fn2)])

In [9]:
# Process every candidate date from August through November; day numbers run
# up to 31 for every month, and dates without stored data are skipped because
# read_parquet returns an empty frame for them.
for month in range(8, 12):
    for day in range(1, 32):
        # Keep the name `df`: save_tweet_tokens may read the notebook-level
        # `df` when it is not handed any tokens.
        df = read_parquet(month, day)
        if len(df) == 0:
            continue

        df['text']     = df['text'].apply(clean_tweet)
        # The 'positive' column holds the sentiment label, not a boolean.
        df['positive'] = df['text'].apply(get_tweet_sentiment)
        df['tokens']   = df['text'].apply(get_tweet_tokens)

        save_raw_analysis(df = df, month = month, day = day)
        # Hand over this day's token lists explicitly instead of an empty
        # list, so the token counts do not depend on hidden global state.
        save_tweet_tokens(tokens = df['tokens'].tolist(), month = month, day = day)
        save_sentiment(df = df, month = month, day = day)

2020-02-25 23:38:51.874523[System]: ./data/raw_analysis/Trump-2016-8-1-Top.gzip saved.(Anaysis)
2020-02-25 23:38:51.874857[System]: Start processing Trump-2016-8-1-Top.(Tokens)
2020-02-25 23:38:52.187458[System]: ./data/tokens/Trump-2016-8-1-Top.gzip saved.(Tokens)
positive
negative    2476
neutral     5139
positive    3544
dtype: int64
2020-02-25 23:38:52.201904[System]: ./data/sentiment/Trump-2016-8-1-Top.gzip saved.(Sentiment)
2020-02-25 23:38:57.325851[System]: ./data/raw_analysis/Trump-2016-8-2-Top.gzip saved.(Anaysis)
2020-02-25 23:38:57.326172[System]: Start processing Trump-2016-8-2-Top.(Tokens)
2020-02-25 23:38:57.671868[System]: ./data/tokens/Trump-2016-8-2-Top.gzip saved.(Tokens)
positive
negative    2959
neutral     5312
positive    4204
dtype: int64
2020-02-25 23:38:57.682177[System]: ./data/sentiment/Trump-2016-8-2-Top.gzip saved.(Sentiment)
2020-02-25 23:39:02.391684[System]: ./data/raw_analysis/Trump-2016-8-3-Top.gzip saved.(Anaysis)
2020-02-25 23:39:02.392048[System]: 

2020-02-25 23:39:58.838343[System]: ./data/raw_analysis/Trump-2016-8-20-Top.gzip saved.(Anaysis)
2020-02-25 23:39:58.838656[System]: Start processing Trump-2016-8-20-Top.(Tokens)
2020-02-25 23:39:59.016187[System]: ./data/tokens/Trump-2016-8-20-Top.gzip saved.(Tokens)
positive
negative    1301
neutral     2629
positive    2260
dtype: int64
2020-02-25 23:39:59.024362[System]: ./data/sentiment/Trump-2016-8-20-Top.gzip saved.(Sentiment)
2020-02-25 23:40:01.048536[System]: ./data/raw_analysis/Trump-2016-8-21-Top.gzip saved.(Anaysis)
2020-02-25 23:40:01.048852[System]: Start processing Trump-2016-8-21-Top.(Tokens)
2020-02-25 23:40:01.195014[System]: ./data/tokens/Trump-2016-8-21-Top.gzip saved.(Tokens)
positive
negative    1015
neutral     2077
positive    1834
dtype: int64
2020-02-25 23:40:01.202325[System]: ./data/sentiment/Trump-2016-8-21-Top.gzip saved.(Sentiment)
2020-02-25 23:40:03.656657[System]: ./data/raw_analysis/Trump-2016-8-22-Top.gzip saved.(Anaysis)
2020-02-25 23:40:03.657000[

2020-02-25 23:40:49.760412[System]: ./data/raw_analysis/Trump-2016-9-9-Top.gzip saved.(Anaysis)
2020-02-25 23:40:49.760558[System]: Start processing Trump-2016-9-9-Top.(Tokens)
2020-02-25 23:40:49.932875[System]: ./data/tokens/Trump-2016-9-9-Top.gzip saved.(Tokens)
positive
negative    1220
neutral     2412
positive    1753
dtype: int64
2020-02-25 23:40:49.940306[System]: ./data/sentiment/Trump-2016-9-9-Top.gzip saved.(Sentiment)
2020-02-25 23:40:52.470823[System]: ./data/raw_analysis/Trump-2016-9-10-Top.gzip saved.(Anaysis)
2020-02-25 23:40:52.471184[System]: Start processing Trump-2016-9-10-Top.(Tokens)
2020-02-25 23:40:52.657127[System]: ./data/tokens/Trump-2016-9-10-Top.gzip saved.(Tokens)
positive
negative    1666
neutral     2498
positive    1930
dtype: int64
2020-02-25 23:40:52.665282[System]: ./data/sentiment/Trump-2016-9-10-Top.gzip saved.(Sentiment)
2020-02-25 23:40:54.568603[System]: ./data/raw_analysis/Trump-2016-9-11-Top.gzip saved.(Anaysis)
2020-02-25 23:40:54.568940[Syst

2020-02-25 23:41:51.544447[System]: ./data/raw_analysis/Trump-2016-9-28-Top.gzip saved.(Anaysis)
2020-02-25 23:41:51.544874[System]: Start processing Trump-2016-9-28-Top.(Tokens)
2020-02-25 23:41:51.821505[System]: ./data/tokens/Trump-2016-9-28-Top.gzip saved.(Tokens)
positive
negative    1896
neutral     3930
positive    3611
dtype: int64
2020-02-25 23:41:51.831559[System]: ./data/sentiment/Trump-2016-9-28-Top.gzip saved.(Sentiment)
2020-02-25 23:41:55.084962[System]: ./data/raw_analysis/Trump-2016-9-29-Top.gzip saved.(Anaysis)
2020-02-25 23:41:55.085271[System]: Start processing Trump-2016-9-29-Top.(Tokens)
2020-02-25 23:41:55.318312[System]: ./data/tokens/Trump-2016-9-29-Top.gzip saved.(Tokens)
positive
negative    1653
neutral     3447
positive    2785
dtype: int64
2020-02-25 23:41:55.326676[System]: ./data/sentiment/Trump-2016-9-29-Top.gzip saved.(Sentiment)
2020-02-25 23:41:58.692505[System]: ./data/raw_analysis/Trump-2016-10-1-Top.gzip saved.(Anaysis)
2020-02-25 23:41:58.692907[

2020-02-25 23:43:32.862471[System]: ./data/raw_analysis/Trump-2016-10-18-Top.gzip saved.(Anaysis)
2020-02-25 23:43:32.862863[System]: Start processing Trump-2016-10-18-Top.(Tokens)
2020-02-25 23:43:33.212475[System]: ./data/tokens/Trump-2016-10-18-Top.gzip saved.(Tokens)
positive
negative    2559
neutral     5184
positive    4356
dtype: int64
2020-02-25 23:43:33.223480[System]: ./data/sentiment/Trump-2016-10-18-Top.gzip saved.(Sentiment)
2020-02-25 23:43:38.221731[System]: ./data/raw_analysis/Trump-2016-10-19-Top.gzip saved.(Anaysis)
2020-02-25 23:43:38.222000[System]: Start processing Trump-2016-10-19-Top.(Tokens)
2020-02-25 23:43:38.569829[System]: ./data/tokens/Trump-2016-10-19-Top.gzip saved.(Tokens)
positive
negative    2532
neutral     5298
positive    4224
dtype: int64
2020-02-25 23:43:38.580964[System]: ./data/sentiment/Trump-2016-10-19-Top.gzip saved.(Sentiment)
2020-02-25 23:43:46.736667[System]: ./data/raw_analysis/Trump-2016-10-20-Top.gzip saved.(Anaysis)
2020-02-25 23:43:4

In [10]:
# from prettytable import PrettyTable
# from collections import Counter


# # for label, data in (('Token', tokens)):
# pt = PrettyTable(field_names=['Token', 'Count'])
# c = Counter(tokens)
# [ pt.add_row(kv) for kv in c.most_common()]
# pt.align['Token'], pt.align['Count'] = 'l', 'r' # Set column alignment
# # print(pt) 

# print(type(pt))
# #https://stackoverflow.com/questions/46527086/how-to-remove-stopwords-from-common-words-list-in-python

In [None]:
# !python3.6 -m pip install pillow
# !python3.6 -m pip install wordcloud

In [None]:
# from PIL import Image
# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# import matplotlib.pyplot as plt

# comment_words = ' '
# for words in tokens: 
#     comment_words = comment_words + words + ' '

# wordcloud = WordCloud(width = 800, height = 800, 
#                 background_color ='white').generate(comment_words) 
  
    
# # plot the WordCloud image                        
# plt.figure(figsize = (8, 8), facecolor = None) 
# plt.imshow(wordcloud) 
# plt.axis("off") 
# plt.tight_layout(pad = 0) 
  
# plt.show() 
# # # https://www.geeksforgeeks.org/generating-word-cloud-python/

# Appendix
[1] **Twitter Sentiment Analysis using Python**, *https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/*