In [2]:
import pandas as pd 
import numpy as np

In [3]:
tweet_df = pd.read_csv('../data/trump_tweets_jay.csv')
hedo_df = pd.read_csv('../data/word_sentiment_rating.csv')
tweet_df_scores = pd.read_csv('../data/tweet_df_cleaned.csv')

## Tweet dataframe 

In [3]:
# check for content and shape
tweet_df

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59
...,...,...,...,...,...,...,...,...
54437,1319484210101379072,RT @EliseStefanik: President @realDonaldTrump ...,t,f,Twitter for iPhone,0,9912,2020-10-23 03:42:05
54438,1319444420861829121,RT @TeamTrump: LIVE: Presidential Debate #Deba...,t,f,Twitter for iPhone,0,8249,2020-10-23 01:03:58
54439,1319384118849949702,Just signed an order to support the workers of...,f,f,Twitter for iPhone,175950,36098,2020-10-22 21:04:21
54440,1319345719829008387,Suburban women want Safety &amp; Security. Joe...,f,f,Twitter for iPhone,95325,19639,2020-10-22 18:31:46


In [4]:
# compare tweet list with no re-tweets
no_retweets = tweet_df['isRetweet'] == 'f'
tweet_df[no_retweets].shape

(45270, 8)

## The hedonometer word sentiment scores dataframe

In [6]:
print(hedo_df.head())
print(hedo_df.shape)

   Unnamed: 0  happs  rank  stdDev            word
0           0    5.1  6648    0.99       according
1           1    5.1  6649    1.58  administrative
2           2    5.1  6650    1.25          albert
3           3    5.1  6651    1.31          alleen
4           4    5.1  6652    1.30           allen
(10187, 5)


## Tokenize tweets and remove the stopwords to take away "filler" type words

In [7]:
# Use nltk for tokenizer and stopwords removal
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaysu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaysu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
text_tokens=tweet_df['text'].apply(lambda x: word_tokenize(x))

In [9]:
# Notice the structure of the first tweet with "stopwords"
text_tokens[0]

['Republicans',
 'and',
 'Democrats',
 'have',
 'both',
 'created',
 'our',
 'economic',
 'problems',
 '.']

In [10]:
# Tokenize each tweet, remove stopwords, and append into a list to be added to dataframe later
tokens = []
# Cache the stopword function to speed up runtime. Otherwise the loop will access the method each iteration and slow down for loop. 
cachedStopWords = stopwords.words('english')
for tweet in tweet_df['text']:
    text_tokens = word_tokenize(tweet)
    token = [word for word in text_tokens if not word in cachedStopWords]
    tokens.append(token)
#     print(token)

In [11]:
# Inpsect list object for removal of stopwords
tokens[0]

['Republicans', 'Democrats', 'created', 'economic', 'problems', '.']

## Pull the score of each word in each Tweet. Make sure that the list structure is intact.

In [15]:
# Look through the tokens list and access the value of the word with .loc()
# Append words in a tweet list, then append that list to the scores list
# Make sure to turn all words into lowercase first to match the hedo_df words
word_and_scores = []
for token in tokens:
    tweet = []
    for word in token:
        score = hedo_df.loc[hedo_df['word'] == word.lower(), 'happs']
        if score.values.size:
            tweet.append((word, score.values[0]))
#             print(word,score.values[0])
    word_and_scores.append(tweet)

In [16]:
word_and_scores[0]

[('Republicans', 4.22),
 ('Democrats', 5.5),
 ('created', 6.06),
 ('economic', 5.36),
 ('problems', 2.92)]

In [19]:
# Populate only the scores of the individual words of each tweet in list of lists
scores_only = []
for token in tokens:
    tweet = []
    for word in token:
        score = hedo_df.loc[hedo_df['word'] == word.lower(), 'happs']
        if score.values.size:
            tweet.append(score.values[0])
#             print(score.values[0])
    scores_only.append(tweet)

In [20]:
scores_only[0]

[4.22, 5.5, 6.06, 5.36, 2.92]

## Aggregation functions of the score list - sum and mean

In [22]:
# how to .agg() a new colum with mean and sum? 
import statistics as st
sum_scores = [sum(x) for x in scores_only]

In [23]:
sum_scores[:5]

[24.059999999999995,
 142.1,
 49.019999999999996,
 147.05999999999997,
 63.41999999999999]

In [25]:
# To calculate mean you must create a function with a try/except logic for 'nan' values
def check(data):
    try:
        return st.mean(data)
    except:
        return np.nan
avg_scores = [check(x) for x in scores_only]

In [26]:
avg_scores[:5]

[4.812, 6.459090909090909, 5.446666666666666, 4.902, 5.765454545454546]

## Append the tweet dataframe with the new lists

In [28]:
# Inspect the DF with the newly added columns
tweet_df['word_and_scores'] = word_and_scores
tweet_df['scores_only'] = scores_only
tweet_df['avg_scores'] = avg_scores
tweet_df['sum_scores'] = sum_scores
tweet_df.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,word_and_scores,scores_only,avg_scores,sum_scores
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,"[(Republicans, 4.22), (Democrats, 5.5), (creat...","[4.22, 5.5, 6.06, 5.36, 2.92]",4.812,24.06
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,"[(I, 5.92), (back, 5.18), (Great, 7.88), (city...","[5.92, 5.18, 7.88, 5.76, 5.5, 5.52, 5.44, 6.14...",6.459091,142.1
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,"[(RT, 4.88), (READ, 6.52), (Letter, 5.94), (co...","[4.88, 6.52, 5.94, 3.78, 6.36, 4.96, 5.6, 5.0,...",5.446667,49.02
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,"[(The, 4.98), (Mail, 6.36), (In, 5.5), (Ballot...","[4.98, 6.36, 5.5, 5.16, 5.76, 2.36, 6.36, 5.44...",4.902,147.06
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,"[(RT, 4.88), (Very, 6.12), (friendly, 7.66), (...","[4.88, 6.12, 7.66, 5.3, 6.26, 4.94, 5.6, 5.62,...",5.765455,63.42


In [4]:
# Save the cleaned tweet_df as a .csv for rest of team to use
tweet_df.to_csv('tweet_df_cleaned.csv')

In [5]:
# create new column
tweet_df_scores['sentiment_category'] = ''
tweet_df_scores['word_count']= ''
tweet_df_scores['word_count'] = tweet_df['scores_only'].str.len()

# assign sentiment category
tweet_df_scores.loc[tweet_df.avg_scores >= 5.5, 'sentiment_category'] = 'positive'
tweet_df_scores.loc[(tweet_df.avg_scores > 5) & (tweet_df.avg_scores < 5.5), 'sentiment_category'] = 'neutral'
tweet_df_scores.loc[tweet_df.avg_scores <= 5 , 'sentiment_category'] = 'negative'


KeyError: 'scores_only'

In [None]:

tweet_df_scores.head()