In [1]:
# get sentiments for all tweets and save to file

# for this notebook to work, you first need to set up a Google Cloud Natural 
# Language API project by following the steps under Prerequisites in 
# https://cloud.google.com/natural-language/docs/sentiment-tutorial

In [None]:
from gcp_sentiment import GCPSentiment
from dill import dump as dilldump

# Build the function that returns sentiments via the project's GCPSentiment
# wrapper.
#
# The service-account key location is read from the standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable when it is set (and
# non-empty), so the notebook is not tied to one machine's directory layout.
# If the variable is absent, the original hard-coded path is used unchanged.
import os

_default_credentialspath = (
    "/home/vagrant/google_service_account_credentials/"
    "project_our-rock-218016.credentials.json"
)
credentialspath = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") or _default_credentialspath
)

# Bound method: takes a list of strings (see the sample call below).
get_sentence_sentiments = GCPSentiment(credentialspath).get_sentence_sentiments

# sample use -- as the last expression of the cell, its result is displayed
# NOTE(review): presumably this issues a live API request; confirm before
# re-running if requests are billed.
get_sentence_sentiments([" ", "y'all gonna lie?. i'm tellin you go to smh"])

In [10]:
## read in tweets df

import pandas as pd

# nrows=None reads the whole file; set nrows to a small integer for a quick
# test run on a subset of the tweets.
tweets_df = pd.read_csv("data/twcs_clean.csv", nrows=None)

# DataFrame.info() prints its summary itself and returns None, so it is called
# bare -- wrapping it in print() only adds a stray "None" line to the output.
tweets_df.info()

# last expression of the cell: rendered as a table of the first rows
tweets_df.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2811774 entries, 0 to 2811773
Data columns (total 7 columns):
tweet_id                   int64
time                       object
author_id                  object
text                       object
response_tweet_id          object
in_response_to_tweet_id    float64
inbound                    bool
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 131.4+ MB
None


Unnamed: 0,tweet_id,time,author_id,text,response_tweet_id,in_response_to_tweet_id,inbound
0,790341,2008-05-08 20:13:59,SouthwestAir,@34622 Have FUN at the lecture tonight! Tell P...,790326,790342.0,False
1,790326,2008-05-08 21:04:16,308466,@SouthwestAir Can you pls enter the HI market ...,"790327,790328,790325,790329,790330,790331,7903...",790341.0,True
2,1757947,2010-02-16 15:49:47,529256,KTAR.com - Foreclosures still big problem in V...,1757946,,True
3,2291020,2010-03-31 15:24:29,665443,@665445 Do you know if Carl's Jr serves lunch ...,2291018,,True
4,2291018,2010-03-31 16:53:27,CarlsJr,@665443 We serve lunch all day!,2291019,2291020.0,False


In [None]:
### saving sentiments as dill files, one file per batch; each file contains
# a list of up to 1000 sentiments (the final batch may be shorter)

import re, os
from time import sleep

# Raw tweet texts from the "text" column as a NumPy array; handles such as
# @delta are still present at this point and are stripped further down.
tweets = tweets_df["text"].values

## remove @-handles (e.g. @delta) from a tweet
handle_regobj = re.compile(r"@\w+")
def strip_handles(tweet,
    handle_regobj = handle_regobj):
    """Return `tweet` with every match of `handle_regobj` replaced by one space.

    The tweet is split at each match and the remaining pieces are re-joined
    with a single space, so whitespace that surrounded a handle is kept
    as-is (e.g. "@delta hello" becomes "  hello", with two leading spaces).

    Parameters
    ----------
    tweet : str
        Text to clean.
    handle_regobj : compiled regular expression, optional
        Pattern identifying a handle; defaults to the module-level
        `handle_regobj` ("@" followed by one or more word characters).

    Returns
    -------
    str
        The tweet with handles removed.
    """
    pieces = handle_regobj.split(tweet)
    return " ".join(pieces)

print("   Stripping handles from tweets ...")
tweets = [strip_handles(tweet) for tweet in tweets]

# create save directory (exist_ok avoids the separate existence check)
os.makedirs("data/sentis/", exist_ok=True)

batchsize = 1000

# When True, a batch whose output file is already on disk is not requested
# again, so an interrupted run can be resumed simply by re-running this cell.
# Set to False (or delete data/sentis/) to recompute and overwrite everything.
skip_existing = True

n_tweets = len(tweets)
ix_end_batch = -1  # stays -1 when there are no tweets at all
for ix_start_batch in range(0, n_tweets, batchsize):

    # creating batch -- both indices are inclusive; the last batch may be
    # shorter than batchsize
    ix_end_batch = min(n_tweets, ix_start_batch + batchsize) - 1
    batch_tweets = tweets[ix_start_batch : ix_end_batch + 1]

    # output file for this batch, named after its zero-padded index range
    path = "data/sentis/batch_sentis_" + str(ix_start_batch).zfill(9) + "_to_" + \
        str(ix_end_batch).zfill(9) + ".list.dill"

    if skip_existing and os.path.exists(path):
        print("   Skipping tweets from " + str(ix_start_batch) + " to " +
              str(ix_end_batch) + " (already saved) ...")
        continue

    # getting sentiments
    print("   Getting sentiments for " + str(ix_end_batch - ix_start_batch + 1) +
          " tweets from " + str(ix_start_batch) + " to " + str(ix_end_batch) +
          " ...")
    batch_sentis = get_sentence_sentiments(batch_tweets)
    print("   n_sentis =", len(batch_sentis))

    # saving to file -- done immediately after the request so an interrupt
    # during the pause below cannot lose the batch. The data is written to a
    # temporary name and then renamed, so a file at `path` is always complete
    # (which is what makes the skip_existing check above safe).
    tmp_path = path + ".tmp"
    with open(tmp_path, "wb") as f:
        dilldump(batch_sentis, f)
    os.replace(tmp_path, path)

    # pause between consecutive requests; pointless after the final batch
    if ix_end_batch != n_tweets - 1:
        sleep(10)

   Stripping handles from tweets ...
   Getting sentiments for 1000 tweets from 1000000 to 1000999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1001000 to 1001999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1002000 to 1002999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1003000 to 1003999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1004000 to 1004999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1005000 to 1005999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1006000 to 1006999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1007000 to 1007999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1008000 to 1008999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1009000 to 1009999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1010000 to 1010999 ...
   n_sentis = 1000
   Getting sentiments for 10

   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1100000 to 1100999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1101000 to 1101999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1102000 to 1102999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1103000 to 1103999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1104000 to 1104999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1105000 to 1105999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1106000 to 1106999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1107000 to 1107999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1108000 to 1108999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1109000 to 1109999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1110000 to 1110999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 111

   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1197000 to 1197999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1198000 to 1198999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1199000 to 1199999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1200000 to 1200999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1201000 to 1201999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1202000 to 1202999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1203000 to 1203999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1204000 to 1204999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1205000 to 1205999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1206000 to 1206999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1207000 to 1207999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 120

   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1294000 to 1294999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1295000 to 1295999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1296000 to 1296999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1297000 to 1297999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1298000 to 1298999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1299000 to 1299999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1300000 to 1300999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1301000 to 1301999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1302000 to 1302999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1303000 to 1303999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1304000 to 1304999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 130

   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1391000 to 1391999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1392000 to 1392999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1393000 to 1393999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1394000 to 1394999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1395000 to 1395999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1396000 to 1396999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1397000 to 1397999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1398000 to 1398999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1399000 to 1399999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1400000 to 1400999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1401000 to 1401999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 140

   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1491000 to 1491999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1492000 to 1492999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1493000 to 1493999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1494000 to 1494999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1495000 to 1495999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1496000 to 1496999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1497000 to 1497999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1498000 to 1498999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1499000 to 1499999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1500000 to 1500999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1501000 to 1501999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 150

   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1591000 to 1591999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1592000 to 1592999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1593000 to 1593999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1594000 to 1594999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1595000 to 1595999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1596000 to 1596999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1597000 to 1597999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1598000 to 1598999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1599000 to 1599999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1600000 to 1600999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 1601000 to 1601999 ...
   n_sentis = 1000
   Getting sentiments for 1000 tweets from 160