In [1]:
import os
import subprocess
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Turn on eager evaluation so that a cell ending in a DataFrame expression
# renders its rows directly (as in the tabular outputs of later cells).
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [3]:
# Summarised (-s), human-readable (-h) disk usage of the source Tweets directory.
!hadoop fs -du -s -h 'gs://msca-bdp-tweets/Tweets/'

2.3 T  2.3 T  gs://msca-bdp-tweets/Tweets


## 1. Load data and filter relevant features

In [4]:
# Glob pattern selecting every *.json object under the Tweets directory.
directory = "gs://msca-bdp-tweets/Tweets/"
file = '*.json'
path = f"{directory}{file}"

In [None]:
# Load every file matched by `path` as JSON. No schema is passed, so the
# column structure is inferred by Spark from the data.
# NOTE(review): inference needs a pass over the input (~2.3 T per the du
# output above) — consider supplying an explicit schema; confirm cost.
tweets_df = spark.read.json(path)

In [None]:
# List the top-level column names of the raw tweets DataFrame.
tweets_df.columns

['contributors',
 'coordinates',
 'created_at',
 'display_text_range',
 'entities',
 'extended_entities',
 'extended_tweet',
 'favorite_count',
 'favorited',
 'filter_level',
 'geo',
 'id',
 'id_str',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'is_quote_status',
 'lang',
 'limit',
 'place',
 'possibly_sensitive',
 'quote_count',
 'quoted_status',
 'quoted_status_id',
 'quoted_status_id_str',
 'quoted_status_permalink',
 'reply_count',
 'retweet_count',
 'retweeted',
 'retweeted_status',
 'scopes',
 'source',
 'text',
 'timestamp_ms',
 'truncated',
 'user',
 'withheld_copyright',
 'withheld_in_countries']

In [None]:
# Project the raw tweets down to the tweet-level fields of interest plus a
# flattened set of author attributes (each nested `user.<field>` is exposed
# as a top-level `user_<field>` column).
tweets_df1 = tweets_df.select(
    col('created_at'),
    col('favorite_count'),
    col('id'),
    col('lang'),
    # Retweet count of the original tweet when this row is a retweet.
    col('retweeted_status.retweet_count').alias('rs_retweet_count'),
    col('reply_count'),
    col('retweet_count'),
    col('text'),
    col('timestamp_ms'),
    *[
        col(f'user.{field}').alias(f'user_{field}')
        for field in (
            'id',
            'name',
            'screen_name',
            'location',
            'verified',
            'followers_count',
            'friends_count',
            'favourites_count',
            'created_at',
        )
    ],
)

In [8]:
# Preview the first five rows of the projected columns.
tweets_df1.limit(5)

created_at,favorite_count,id,lang,rs_retweet_count,reply_count,retweet_count,text,timestamp_ms,user_id,user_name,user_screen_name,user_location,user_verified,user_followers_count,user_friends_count,user_favourites_count,user_created_at
Tue Aug 22 01:15:...,0,899802055317082112,en,17.0,,0,RT @jprince713: T...,1503364502158,1869685285,Stephen Erickson,StephenErickso,Erbenova CZ,False,1440,1310,4655,Mon Sep 16 00:16:...
Tue Aug 22 01:15:...,0,899802054645882881,en,,,0,You not from AMER...,1503364501998,286886991,Dwayne mckell,officialmckell,"Philadelphia, PA",False,119618,9892,3421,Sat Apr 23 22:18:...
Tue Aug 22 01:15:...,0,899802053492432896,en,30860.0,,0,RT @_liyahhhhhhh_...,1503364501723,1016646704,John Garabedian,john_garabedian,,False,569,277,32024,Mon Dec 17 04:05:...
Tue Aug 22 01:15:...,0,899802055824551936,en,,,0,For anyone wonder...,1503364502279,3342588748,LaurieLou,LauraTrent11,,False,149,21,14061,Tue Jun 23 10:32:...
Tue Aug 22 01:15:...,0,899802056017367041,en,326.0,,0,RT @FoxNews: High...,1503364502325,4149769393,kye stephens,kyestephens4,,False,76,179,2707,Fri Nov 06 20:34:...


## 2. Filter universities

In [None]:
# Keep tweets whose lower-cased text contains any of the four university
# names. The SQL predicate is assembled as one `like` clause per phrase,
# joined with `or`.
tweets_df2 = tweets_df1.where(
    " or ".join(
        f"lower(text) like '%{phrase}%'"
        for phrase in (
            'university of chicago',
            'northwestern university',
            'boston university',
            'northeastern university',
        )
    )
)
# Mark the filtered set for caching; the per-university counts and the
# writes below all read from it.
tweets_df2.cache()

created_at,favorite_count,id,lang,rs_retweet_count,reply_count,retweet_count,text,timestamp_ms,user_id,user_name,user_screen_name,user_location,user_verified,user_followers_count,user_friends_count,user_favourites_count,user_created_at
Thu Jun 22 23:17:...,0,878029090665541633,en,2.0,,0,RT @SGVNSports: S...,1498173422863,2858376295,audrey sellers,_audreysellers_,probably at the f...,False,558,403,7390,Thu Oct 16 19:05:...
Thu Jun 22 23:19:...,0,878029655277617152,en,3.0,,0,RT @SGVNSports: S...,1498173557477,3002070924,Marcellus Pearson,cellyp32,"Upland, CA",False,783,677,3725,Thu Jan 29 19:30:...
Thu Jun 22 23:32:...,0,878032945352716288,en,,,0,Good evening Nort...,1498174341892,795645416473853954,Northeastern WxSTEM,NUWxSTEM,"Boston, MA",False,55,10,0,Mon Nov 07 15:13:...
Thu Jun 22 23:35:...,0,878033732753432576,en,,,0,Professor Bruce C...,1498174529623,103719930,tbs eFM This Morning,efmthismorning,"Seoul, South Korea",False,1283,690,196,Mon Jan 11 01:33:...
Thu Jun 22 23:36:...,0,878034060903186432,en,,,0,Debunking Fake Ne...,1498174607860,22257074,Eric Newton,EricNewton1,"Phoenix, Miami, S...",False,3478,2530,25,Sat Feb 28 14:01:...
Thu Jun 22 23:53:...,0,878038274115239941,en,,,0,Isaac Asimov was ...,1498175612368,808611253275029504,Thomas Bergere,drivinsouth429,,False,1,6,2,Tue Dec 13 09:55:...
Thu Jun 22 23:53:...,0,878038287461392384,en,80.0,,0,RT @_audreyseller...,1498175615550,1340688360,deanna,deanna_aa,"Riverside, CA",False,287,227,2155,Wed Apr 10 01:25:...
Thu Jun 22 23:55:...,0,878038679796670464,en,1.0,,0,RT @efmthismornin...,1498175709090,287999434,tbs eFM 101.3MHz,1013eFM,"Seoul, Korea",True,2304,2269,36,Tue Apr 26 02:11:...
Thu Jun 22 23:55:...,0,878038831911510016,en,4.0,,0,RT @SGVNSports: S...,1498175745357,2329383511,Dakota Casper,Dakotacameron_,,False,230,206,1162,Wed Feb 05 21:26:...
Thu Jun 22 23:56:...,0,878038928179298304,en,,,0,Congratulations t...,1498175768309,220073795,Presunto Culpable,PresuntoC,"Berkeley, CA",False,33120,8917,2310,Fri Nov 26 18:11:...


In [9]:
# Number of filtered tweets mentioning "university of chicago".
tweets_df2.filter("lower(text) like '%university of chicago%'").count()

299651

In [10]:
# Number of filtered tweets mentioning "northwestern university".
tweets_df2.filter("lower(text) like '%northwestern university%'").count()

129090

In [11]:
# Number of filtered tweets mentioning "boston university".
tweets_df2.filter("lower(text) like '%boston university%'").count()

352727

In [12]:
# Number of filtered tweets mentioning "northeastern university".
tweets_df2.filter("lower(text) like '%northeastern university%'").count()

96366

In [None]:
# For Hadoop
# Write the university-filtered tweets as JSON to the relative path 'temp2'.
# The default save mode raises if the target already exists, which made this
# cell fail on every re-run; 'overwrite' replaces the previous output instead.
tweets_df2.write.mode('overwrite').json('temp2')

In [None]:
# For cloud
# Destination for the university-filtered tweets inside the shared bucket.
dir_out = 'temp4'
bucket_write = 'msca-bdp-students-bucket/shared_data/jejiang'

# The default save mode raises if the target already exists, which made this
# cell fail on every re-run; 'overwrite' replaces the previous output instead.
tweets_df2.write.mode('overwrite').json('gs://' + bucket_write + '/' + dir_out)

In [11]:
# Summarised (-s), human-readable (-h) size of the temp4 output directory.
!hadoop fs -du -s -h 'gs://msca-bdp-students-bucket/shared_data/jejiang/temp4/'

493.3 M  493.3 M  gs://msca-bdp-students-bucket/shared_data/jejiang/temp4


## 3. Filter influential users

In [None]:
# Lower-cased screen names of the accounts to keep.
influential_screen_names = [
    'chiftf_uchicago',
    'money_personal',
    'brooklinepads',
    'nuwxstem',
    'goal58014091',
    'davidgura',
    'markosilberhand',
    'dreeprovince',
]

# Match screen names exactly (case-insensitively) rather than with SQL
# LIKE '%...%': in LIKE, '_' is a single-character wildcard and the
# surrounding '%' allow substring hits, so handles such as 'chiftf_uchicago'
# or 'money_personal' could also select unrelated accounts.
# `lower` and `col` come from the notebook's pyspark.sql.functions import.
tweets_df3 = tweets_df1.where(
    lower(col('user_screen_name')).isin(influential_screen_names)
)
# Mark the result for caching; it is written out in a later cell.
tweets_df3.cache()

created_at,favorite_count,id,lang,rs_retweet_count,reply_count,retweet_count,text,timestamp_ms,user_id,user_name,user_screen_name,user_location,user_verified,user_followers_count,user_friends_count,user_favourites_count,user_created_at
Thu Jun 22 23:32:...,0,878032945352716288,en,,,0,Good evening Nort...,1498174341892,795645416473853954,Northeastern WxSTEM,NUWxSTEM,"Boston, MA",False,55,10,0,Mon Nov 07 15:13:...
Sat Jul 29 20:48:...,0,891400093756252160,en,,,0,It's windy at Nor...,1501361318421,795645416473853954,Northeastern WxSTEM,NUWxSTEM,"Boston, MA",False,55,10,0,Mon Nov 07 15:13:...
Sat Aug 19 15:19:...,0,898927291409084416,en,,,0,Updated eclipse f...,1503155942190,795645416473853954,Northeastern WxSTEM,NUWxSTEM,"Boston, MA",False,58,10,0,Mon Nov 07 15:13:...
Mon Aug 21 22:15:...,0,899756974069436416,en,,,0,#brooklineapartme...,1503353753951,48053733,Brookline Apartments,BrooklinePads,"Brookline, MA",False,129,217,1,Wed Jun 17 18:10:...
Mon Aug 21 23:24:...,0,899774305554321409,en,,,0,Northeastern Univ...,1503357886099,795645416473853954,Northeastern WxSTEM,NUWxSTEM,"Boston, MA",False,59,10,0,Mon Nov 07 15:13:...
Mon Aug 21 23:32:...,0,899776347479748608,en,,,0,Good evening Nort...,1503358372932,795645416473853954,Northeastern WxSTEM,NUWxSTEM,"Boston, MA",False,59,10,0,Mon Nov 07 15:13:...
Mon Sep 04 20:26:...,0,904802832523681792,en,,0.0,0,It's windy at Nor...,1504556780354,795645416473853954,Northeastern WxSTEM,NUWxSTEM,"Boston, MA",False,61,10,0,Mon Nov 07 15:13:...
Mon Sep 04 23:01:...,0,904841786950127617,en,,0.0,0,Northeastern Univ...,1504566067813,795645416473853954,Northeastern WxSTEM,NUWxSTEM,"Boston, MA",False,61,10,0,Mon Nov 07 15:13:...
Mon Sep 04 23:33:...,0,904849807658950656,en,,0.0,0,Good evening Nort...,1504567980099,795645416473853954,Northeastern WxSTEM,NUWxSTEM,"Boston, MA",False,61,10,0,Mon Nov 07 15:13:...
Tue Sep 05 00:04:...,0,904857733484171268,en,,0.0,0,It's windy at Nor...,1504569869763,795645416473853954,Northeastern WxSTEM,NUWxSTEM,"Boston, MA",False,61,10,0,Mon Nov 07 15:13:...


In [None]:
# For cloud
# Destination for the influential-user tweets inside the shared bucket.
dir_out = 'temp5'
bucket_write = 'msca-bdp-students-bucket/shared_data/jejiang'

# The default save mode raises if the target already exists, which made this
# cell fail on every re-run; 'overwrite' replaces the previous output instead.
tweets_df3.write.mode('overwrite').json('gs://' + bucket_write + '/' + dir_out)

In [None]:
# Summarised (-s), human-readable (-h) size of the temp5 output directory.
!hadoop fs -du -s -h 'gs://msca-bdp-students-bucket/shared_data/jejiang/temp5/'

28.4 M  28.4 M  gs://msca-bdp-students-bucket/shared_data/jejiang/temp5
