# Big Data Platforms - Final Project
### Data Filtering (Spark Script)
### By Kyla Ronellenfitsch
### March 19, 2021

## EDA and Variable Selection

In [96]:
# Import relevant packages 
import os
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt
from itertools import islice
import re

import shutil
# import sh
from pyspark.sql.functions import *
import pyspark.sql.functions as psf
from pyspark.sql.types import *

In [2]:
# Instatiate storage 
from google.cloud import storage

In [3]:
# Preview all files 
!hadoop fs -ls "gs://msca-bdp-tweets/Tweets/" | head -10

Found 30241 items
-rwx------   3 root root   55220293 2020-10-21 00:01 gs://msca-bdp-tweets/Tweets/tweets201706221015.json
-rwx------   3 root root   52384528 2020-10-20 22:16 gs://msca-bdp-tweets/Tweets/tweets201706221115.json
-rwx------   3 root root   56190692 2020-10-20 22:45 gs://msca-bdp-tweets/Tweets/tweets201706221215.json
-rwx------   3 root root   56992695 2020-10-21 00:00 gs://msca-bdp-tweets/Tweets/tweets201706221315.json
-rwx------   3 root root   54702790 2020-10-21 00:16 gs://msca-bdp-tweets/Tweets/tweets201706221415.json
-rwx------   3 root root   66415029 2020-10-21 02:43 gs://msca-bdp-tweets/Tweets/tweets201706221515.json
-rwx------   3 root root   63298555 2020-10-21 02:13 gs://msca-bdp-tweets/Tweets/tweets201706221615.json
-rwx------   3 root root   55417269 2020-10-20 22:04 gs://msca-bdp-tweets/Tweets/tweets201706221715.json
-rwx------   3 root root   54057246 2020-10-21 03:06 gs://msca-bdp-tweets/Tweets/tweets201706221815.json


In [51]:
# Read in one file to understand structure and conduct EDA
%time twitter_ex = spark.read.json('gs://msca-bdp-tweets/Tweets/tweets201706221115.json')
twitter_ex.count()

CPU times: user 5.39 ms, sys: 130 µs, total: 5.52 ms
Wall time: 9.79 s


9458

In [95]:
# print schema. This was essential for variable selection, but not printed here due to length

#twitter_ex.printSchema()

In [53]:
# selection of words for universities 
words = ["uchicago", 
         "university of chicago", 
         "depaul", 
         "depaul university", 
         "depaulu",
         "northwestern",
         "northwesternu",
         "northwestern university",
         "university of illinois at chicago",
         "uic.edu", 
         " uic ",
         " uic's "
         ]

In [54]:
# time data filtering 
%time fltr = twitter_ex.filter(psf.lower(twitter_ex.text).rlike('|'.join(words)))

CPU times: user 5.7 ms, sys: 7.96 ms, total: 13.7 ms
Wall time: 217 ms


In [55]:
# signficant reduction in volume 
fltr.count()

101

In [56]:
# preview data
fltr.limit(5).toPandas()

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,extended_entities,extended_tweet,favorite_count,favorited,filter_level,...,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,timestamp_ms,truncated,user,withheld_in_countries
0,,,Thu Jun 22 15:16:19 +0000 2017,,"([], None, [], [(depaulnewsline.com/features/f...",,,0,False,low,...,,0,False,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Future now: @DePaulU's innovative strategies d...,1498144579490,False,"(False, Tue Aug 05 16:38:08 +0000 2014, True, ...",
1,,,Thu Jun 22 15:17:15 +0000 2017,,"([], None, [], [(northwesternmutual.com/news-r...",,,0,False,low,...,,0,False,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Northwestern Mutual Life Insurance and Financi...,1498144635844,False,"(False, Sun Mar 10 14:23:19 +0000 2013, True, ...",
2,,,Thu Jun 22 15:17:53 +0000 2017,,"([([66, 79], Northwestern), ([86, 94], B1GCats...",,,0,False,low,...,,0,False,,"<a href=""http://paper.li"" rel=""nofollow"">Paper...",Read Daily @NU_Alumni for summary of last 24 h...,1498144673637,False,"(False, Tue Feb 10 18:11:05 +0000 2009, False,...",
3,,,Thu Jun 22 15:18:23 +0000 2017,"[0, 91]","([], [(pic.twitter.com/xApl204yoj, https://twi...","([(pic.twitter.com/xApl204yoj, https://twitter...",,0,False,low,...,,0,False,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",When a visitor comes to the Medill campus and ...,1498144703822,False,"(False, Tue May 31 04:08:57 +0000 2011, False,...",
4,,,Thu Jun 22 15:18:42 +0000 2017,,"([], None, [], [], [])",,,0,False,low,...,,0,False,,"<a href=""http://twitter.com/download/iphone"" r...",I keep getting emails from DePaul and I'm so s...,1498144722730,False,"(False, Sun Dec 15 05:31:59 +0000 2013, False,...",


In [58]:
# filter relevant columns, time
%time fltr2 = twitter_ex.filter(psf.lower(twitter_ex.text).rlike('|'.join(words))).\
select([col("created_at").alias("tweet_created"),\
        "text",\
        "retweet_count",\
        "favorite_count",\
        "filter_level",\
        "quoted_status_id_str",\
        "user.created_at",\
        "user.default_profile_image",\
        "user.description",\
        "user.favourites_count",\
        "user.followers_count",\
        "user.following",\
        "user.friends_count",\
        "user.id_str",\
        "user.screen_name",\
        "user.statuses_count",\
        "user.location",\
        "user.time_zone",\
        "user.verified"])

CPU times: user 12.5 ms, sys: 10 µs, total: 12.5 ms
Wall time: 75.8 ms


In [59]:
# save as pandas df 
df = fltr2.toPandas()

In [60]:
# preview
df.head()

Unnamed: 0,tweet_created,text,retweet_count,favorite_count,filter_level,quoted_status_id_str,created_at,default_profile_image,description,favourites_count,followers_count,following,friends_count,id_str,screen_name,statuses_count,location,time_zone,verified
0,Thu Jun 22 15:16:19 +0000 2017,Future now: @DePaulU's innovative strategies d...,0,0,low,,Tue Aug 05 16:38:08 +0000 2014,False,Newsline is a publication for the university c...,490,611,,281,2709796375,DePaulNewsline,1365,Chicago,,False
1,Thu Jun 22 15:17:15 +0000 2017,Northwestern Mutual Life Insurance and Financi...,0,0,low,,Sun Mar 10 14:23:19 +0000 2013,False,"Compensation consultant, teacher and student o...",0,87,,153,1257129522,JimSillery,506,"Eden Prairie, MN",,False
2,Thu Jun 22 15:17:53 +0000 2017,Read Daily @NU_Alumni for summary of last 24 h...,0,0,low,,Tue Feb 10 18:11:05 +0000 2009,False,Promoting #Northwestern University alumni & #B...,17934,1373,,2439,20531237,NU_alumni,11382,"Chicago, IL",Central Time (US & Canada),False
3,Thu Jun 22 15:18:23 +0000 2017,When a visitor comes to the Medill campus and ...,0,0,low,,Tue May 31 04:08:57 +0000 2011,False,Dillard University Alum | Medill MSJ '17 | sen...,34189,2926,,1765,308281620,AstasiaWill,115663,out here,Central Time (US & Canada),False
4,Thu Jun 22 15:18:42 +0000 2017,I keep getting emails from DePaul and I'm so s...,0,0,low,,Sun Dec 15 05:31:59 +0000 2013,False,🇲🇽 ...,2446,155,,522,2246686820,cynxcv,2969,Chicago,Eastern Time (US & Canada),False


In [61]:
# describe data 
df.describe()

Unnamed: 0,retweet_count,favorite_count,favourites_count,followers_count,friends_count,statuses_count
count,101.0,101.0,101.0,101.0,101.0,101.0
mean,0.0,0.0,3844.841584,5292.079208,1079.910891,29709.049505
std,0.0,0.0,8264.886249,29349.339669,4242.378871,96250.963816
min,0.0,0.0,0.0,3.0,0.0,6.0
25%,0.0,0.0,54.0,200.0,185.0,722.0
50%,0.0,0.0,853.0,525.0,424.0,4390.0
75%,0.0,0.0,3017.0,1751.0,849.0,14371.0
max,0.0,0.0,46995.0,287626.0,42693.0,793808.0


In [62]:
# show missing values and data type 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   tweet_created          101 non-null    object
 1   text                   101 non-null    object
 2   retweet_count          101 non-null    int64 
 3   favorite_count         101 non-null    int64 
 4   filter_level           101 non-null    object
 5   quoted_status_id_str   6 non-null      object
 6   created_at             101 non-null    object
 7   default_profile_image  101 non-null    bool  
 8   description            90 non-null     object
 9   favourites_count       101 non-null    int64 
 10  followers_count        101 non-null    int64 
 11  following              0 non-null      object
 12  friends_count          101 non-null    int64 
 13  id_str                 101 non-null    object
 14  screen_name            101 non-null    object
 15  statuses_count         

In [64]:
# show location 
pd.DataFrame(np.unique(df[['location']].astype(str), return_counts=True)).T[:20]

Unnamed: 0,0,1
0,North America / Canada,1
1,436 oldham,1
2,"58th St & Ellis Ave, Chicago",1
3,814,1
4,"Ann Arbor, Michigan",1
5,Asleep\n,1
6,"Beckville, TX",1
7,"Beckville, Tx",1
8,"Beckville,TX",1
9,"Bergen County, New Jersey",1


In [65]:
# show timezone 
pd.DataFrame(np.unique(df[['time_zone']].astype(str), return_counts=True)).T

Unnamed: 0,0,1
0,Arizona,1
1,Asia/Calcutta,1
2,Athens,1
3,Atlantic Time (Canada),1
4,Central Time (US & Canada),26
5,Eastern Time (US & Canada),16
6,Mountain Time (US & Canada),2
7,Pacific Time (US & Canada),14
8,Quito,1
9,Rome,1


In [66]:
#aggregate number of tweets based on id
pd.DataFrame(np.unique(df[['id_str']].astype(str), return_counts=True)).T.sort_values(by=1, ascending=False)

Unnamed: 0,0,1
50,3282487556,3
76,68506542,3
4,1176776017,2
22,2430625008,2
25,249409119,2
...,...,...
31,261714115,1
30,261711658,1
29,258922521,1
28,2535245767,1


In [72]:
pd.set_option('display.max_colwidth', 200)

In [73]:
# preview northwestern data 
df[df['text'].str.contains("Northwestern", na=False)][:10]

Unnamed: 0,tweet_created,text,retweet_count,favorite_count,filter_level,quoted_status_id_str,created_at,default_profile_image,description,favourites_count,followers_count,following,friends_count,id_str,screen_name,statuses_count,location,time_zone,verified
1,Thu Jun 22 15:17:15 +0000 2017,Northwestern Mutual Life Insurance and Financial Services https://t.co/5BD852EmkV,0,0,low,,Sun Mar 10 14:23:19 +0000 2013,False,"Compensation consultant, teacher and student of human behavior: my philosophy is that convetional wisdon is always conventional but rarely wisdom",0,87,,153,1257129522,JimSillery,506,"Eden Prairie, MN",,False
2,Thu Jun 22 15:17:53 +0000 2017,Read Daily @NU_Alumni for summary of last 24 hours on Twitter for #Northwestern &amp; #B1GCats https://t.co/dikjyJ9WAs Stories via @SarahKuta,0,0,low,,Tue Feb 10 18:11:05 +0000 2009,False,Promoting #Northwestern University alumni & #B1GCats topics since 2009. Use my Lists to find other @NUAlumni,17934,1373,,2439,20531237,NU_alumni,11382,"Chicago, IL",Central Time (US & Canada),False
3,Thu Jun 22 15:18:23 +0000 2017,When a visitor comes to the Medill campus and asks us if we are all from Northwestern...... https://t.co/xApl204yoj,0,0,low,,Tue May 31 04:08:57 +0000 2011,False,Dillard University Alum | Medill MSJ '17 | send sneakers (10/8): astasiawilli@gmail,34189,2926,,1765,308281620,AstasiaWill,115663,out here,Central Time (US & Canada),False
8,Thu Jun 22 15:21:54 +0000 2017,Ha! No. I wouldn't be seen with one of those. \nNorthwestern https://t.co/FFFl09zpEW,0,0,low,8.778698170588282e+17,Tue Sep 01 18:08:12 +0000 2009,False,"Seahawks beat writer for The News Tribune. Former AP sports writer in Seattle, UW director of writing. West Point graduate. Steubenville, Ohio native.",2850,19029,,1061,70744212,gbellseattle,48534,Seattle,Pacific Time (US & Canada),True
9,Thu Jun 22 15:22:01 +0000 2017,"Northwestern fan or not, The Play was one for the ages. Cast your vote! https://t.co/syFcumi2Qq",0,0,low,8.778610894966456e+17,Sun Jun 03 15:05:39 +0000 2012,False,"What I Love: my family, my work, Northwestern Wildcats, clever wordplay",892,59,,260,598482468,FrancieTurk,42,,,False
10,Thu Jun 22 15:22:12 +0000 2017,"RT @drewwalters23: Orlando city awards Gala, excited to help our city's youth #Northwestern https://t.co/3SX8woPI4d",0,0,low,,Tue Dec 14 23:43:40 +0000 2010,False,Hey! Take me strong! I'm here- http://tinyurl.com/me6tpm9,348,5,,2,226744924,turecaboo12,464,,,False
11,Thu Jun 22 15:24:06 +0000 2017,@hboulware @corrcomm The author is director of Northwestern's legal studies program. Oy. https://t.co/eCNUtFR5zJ,0,0,low,,Mon Sep 28 06:27:55 +0000 2009,False,Long-ago journalist for the likes of NYT and LAT and current author/co-author of nearly 20 books who thinks that sarcasm is the highest form of patriotism.,3190,2678,,196,77943492,joelengel,47583,Southern Calif,Pacific Time (US & Canada),False
12,Thu Jun 22 15:24:35 +0000 2017,RT @DaGold3nChild: Extremely blessed to receive an offer from Northwestern State #Forkem #AGTG #RecruitLC https://t.co/nTJCP2FzmI,0,0,low,,Mon Aug 04 03:24:29 +0000 2014,False,LCHS 18' |WR3| Isaiah 44:24|#Ekenation Click that link 👇🏾👇🏾,2711,1128,,921,2705526270,DaGold3nChild,5896,"Houston, TX",,False
13,Thu Jun 22 15:25:05 +0000 2017,College Coaches; @NKBaseball 17u plays in the @PastimeBaseball Northwestern TOC this weekend.,0,0,low,,Sun Apr 06 15:32:53 +0000 2014,False,Grad Assistant/Recruiting Coordinator for Trinity International University Baseball and Head Coach for Northern Knights Baseball. Bemidji State Baseball alum,767,242,,625,2430625008,CoachKatz23,782,,,False
14,Thu Jun 22 15:27:10 +0000 2017,Schedule;\nThurs- 1:30 and Northwestern\nFri- 3:45 and 6:00 at Trinity International University \nSat- 1:30 at Trout Park,0,0,low,,Sun Apr 06 15:32:53 +0000 2014,False,Grad Assistant/Recruiting Coordinator for Trinity International University Baseball and Head Coach for Northern Knights Baseball. Bemidji State Baseball alum,767,242,,625,2430625008,CoachKatz23,783,,,False


In [77]:
# preview uic data 
df[df['text'].str.contains(" UIC ", na=False)][:10]

Unnamed: 0,tweet_created,text,retweet_count,favorite_count,filter_level,quoted_status_id_str,created_at,default_profile_image,description,favourites_count,followers_count,following,friends_count,id_str,screen_name,statuses_count,location,time_zone,verified
35,Thu Jun 22 15:42:14 +0000 2017,Presale Alert! Presale Update: A Perfect Circle Presale Tickets at UIC Pavilion in Chicago https://t.co/4Xud9q1T0P… https://t.co/4kIj37Di8P,0,0,low,,Wed Nov 25 20:01:57 +0000 2009,False,We provide Free Presale Passwords for Concerts and Sporting Events,0,135,,174,92599443,concertcodes,27932,USA,Eastern Time (US & Canada),False


In [68]:
# define bucket
my_bucket = 'msca-bdp-students-bucket/shared_data/kmr1'

In [69]:
# save sample
fltr2.write.format("com.databricks.spark.csv").\
option("header","true").\
mode('overwrite').\
save('gs://' + my_bucket + '/sample')

### Reading/Filtering Full Data

In [None]:
# read in data. Note ~350 million records, taking 25 minutes to read in 
%time data_read = spark.read.json('gs://msca-bdp-tweets/Tweets')
data_read.count()

CPU times: user 234 ms, sys: 69.6 ms, total: 304 ms
Wall time: 25min 27s


348412387

In [None]:
# words for filtering 
words = ["uchicago", 
         "university of chicago", 
         "depaul", 
         "depaul university", 
         "depaulu",
         "northwestern",
         "northwesternu",
         "northwestern university",
         "university of illinois at chicago",
         "uic.edu", 
         " uic ",
         " uic's "
         ]

In [None]:
# filter keywords and relevant columns 
%time data_filter = data_read.filter(psf.lower(data_read.text).rlike('|'.join(words))).\
select([col("created_at").alias("tweet_created"),\
        "text",\
        "lang",\
        "retweet_count",\
        "favorite_count",\
        "filter_level",\
        "user.statuses_count",\
        "quoted_status_id_str",\
        "user.created_at",\
        "user.default_profile_image",\
        "user.description",\
        "user.favourites_count",\
        "user.followers_count",\
        "user.following",\
        "user.friends_count",\
        "user.id_str",\
        "user.screen_name",\
        "user.location",\
        "user.time_zone",\
        "user.verified"])


CPU times: user 16.1 ms, sys: 5.18 ms, total: 21.2 ms
Wall time: 312 ms


In [None]:
# define bucket
my_bucket = 'msca-bdp-students-bucket/shared_data/kmr1'

In [None]:
# save data 
data_filter.write.format("com.databricks.spark.csv").\
option("header","true").\
mode('overwrite').\
save('gs://' + my_bucket + '/data_filter3')

In [35]:
# preview 
data_filter.limit(5).toPandas()

Unnamed: 0,tweet_created,text,lang,retweet_count,favorite_count,filter_level,quoted_status_id_str,created_at,default_profile_image,description,favourites_count,followers_count,following,friends_count,id_str,location,time_zone,verified
0,Thu Jun 22 23:17:13 +0000 2017,...A SIGNIFICANT WEATHER ADVISORY HAS BEEN ISS...,en,0,0,low,,Thu Oct 18 05:51:36 +0000 2012,False,"AmericaAlerts provides alerts for weather, ear...",0,602,,607,888181566,"Sebastopol, CA",,False
1,Thu Jun 22 23:17:56 +0000 2017,@SheWhoVotes He went to Northwestern Law Schoo...,en,0,0,low,,Sat Feb 20 23:45:00 +0000 2016,False,Trust me I'm verified! - Voted 3rd Least influ...,1189,85,,107,701190848751403008,Antarctica,,False
2,Thu Jun 22 23:19:12 +0000 2017,@DIRECTV will u like 2b mah girlfriend northwe...,en,0,0,low,,Wed Mar 22 01:46:38 +0000 2017,False,,3,8,,66,844364663672492032,,Eastern Time (US & Canada),False
3,Thu Jun 22 23:21:03 +0000 2017,'The Play' by Northwestern is nominated for Be...,en,0,0,low,,Wed Sep 23 21:00:56 +0000 2009,False,'I have a high tolerance and an expense accoun...,12014,210,,467,76755128,Chicago,America/Chicago,False
4,Thu Jun 22 23:21:10 +0000 2017,Tornado Warning for..Northwestern Prairie Coun...,en,0,0,low,,Wed Dec 11 17:46:29 +0000 2013,False,"White County 9-1-1 Dispatch Center. Searcy, A...",5,219,,77,2241132438,"Searcy, Arkansas",,False


### Create University-Specific Datasets 

In [3]:
# read in data
!gsutil ls 'gs://msca-bdp-students-bucket/shared_data/kmr1/data_filter3' | head -10

gs://msca-bdp-students-bucket/shared_data/kmr1/data_filter3/
gs://msca-bdp-students-bucket/shared_data/kmr1/data_filter3/_SUCCESS
gs://msca-bdp-students-bucket/shared_data/kmr1/data_filter3/part-00000-3477c4a9-8263-4df9-b59a-d273b1b980ea-c000.csv
gs://msca-bdp-students-bucket/shared_data/kmr1/data_filter3/part-00001-3477c4a9-8263-4df9-b59a-d273b1b980ea-c000.csv
gs://msca-bdp-students-bucket/shared_data/kmr1/data_filter3/part-00002-3477c4a9-8263-4df9-b59a-d273b1b980ea-c000.csv
gs://msca-bdp-students-bucket/shared_data/kmr1/data_filter3/part-00003-3477c4a9-8263-4df9-b59a-d273b1b980ea-c000.csv
gs://msca-bdp-students-bucket/shared_data/kmr1/data_filter3/part-00004-3477c4a9-8263-4df9-b59a-d273b1b980ea-c000.csv
gs://msca-bdp-students-bucket/shared_data/kmr1/data_filter3/part-00005-3477c4a9-8263-4df9-b59a-d273b1b980ea-c000.csv
gs://msca-bdp-students-bucket/shared_data/kmr1/data_filter3/part-00006-3477c4a9-8263-4df9-b59a-d273b1b980ea-c000.csv
gs://msca-bdp-students-bucket/shared_data/kmr1/data

In [4]:
# define bucket
my_bucket = 'msca-bdp-students-bucket/shared_data/kmr1/data_filter3'

In [6]:
# read in data
%time data_filter = spark.read.csv('gs://msca-bdp-students-bucket/shared_data/kmr1/data_filter3', header='true', inferSchema='true', sep=',', quote='"')
data_filter.cache()

In [7]:
# note ~6 million records across all 4 universities 
data_filter.count()

5988744

In [8]:
# preview data 
data_filter.limit(10).toPandas()

Unnamed: 0,tweet_created,text,lang,retweet_count,favorite_count,filter_level,statuses_count,quoted_status_id_str,created_at,default_profile_image,description,favourites_count,followers_count,following,friends_count,id_str,screen_name,location,time_zone,verified
0,Wed Mar 14 20:15:02 +0000 2018,RT @kimbrolyclaire: @NorthwesternU It looks to...,en,0.0,0,low,74555.0,,Fri Mar 26 12:27:01 +0000 2010,false,,115991.0,645.0,,1547.0,126610908,kitty2city,,,False
1,Wed Mar 14 20:15:02 +0000 2018,there’s reports of a gunman at northwestern rn...,en,0.0,0,low,3647.0,,Tue Jun 06 23:18:14 +0000 2017,false,i only yell in my tweets and rt guanlin pics,5097.0,71.0,,182.0,872231183702274049,guanlinvibes,chi,,False
2,Wed Mar 14 20:15:02 +0000 2018,RT @KatieLittle: View of snipers from Maple/Em...,en,0.0,0,low,494.0,,Wed Jul 20 01:08:14 +0000 2016,false,Student @northwesternu | Evans Scholar🏌🏻| Poli...,236.0,42.0,,128.0,755569975855833088,DylanGresik,Chicago,,False
3,Wed Mar 14 20:15:02 +0000 2018,Anxiously waiting to hear from my northwestern...,,,,,,,,,,,,,,,,,,
4,"This is insane. https://t.co/tSd2E7Z4Fo""",en,0,0.0,low,7041,9.740153780188568e+17,Tue Apr 09 00:53:33 +0000 2013,false,Getting my doctorals @NorthwesternU | black ac...,3005,273.0,,251.0,1337981354.0,_thaWRIGHTway,"Chicago, IL",,false,
5,Wed Mar 14 20:15:03 +0000 2018,RT @Evan_Rosenfeld: BREAKING: Northwestern Uni...,en,0.0,0,low,1357.0,974011208687869955,Sun Jul 20 09:09:43 +0000 2008,false,"dem, filthy liberal, glock lover, pro-choice, ...",622.0,73.0,,178.0,15499922,TheGameGuru,"Ohio, USA",Eastern Time (US & Canada),False
6,Wed Mar 14 20:15:03 +0000 2018,RT @dianexlston: Also I feel like? We probably...,en,0.0,0,low,28361.0,,Sat Mar 12 11:09:25 +0000 2016,false,,20958.0,3701.0,,3741.0,708610844993658880,muttslikeme,Eridu,,False
7,Wed Mar 14 20:15:03 +0000 2018,RT @dcopaken: This is my daughter. Right now. ...,en,0.0,0,low,42703.0,,Mon Sep 29 01:35:39 +0000 2008,false,Married to a former Roller Derby Rockstar. Als...,3758.0,321.0,,1440.0,16505796,SaysMyDerbyWife,,Eastern Time (US & Canada),False
8,Wed Mar 14 20:15:03 +0000 2018,RT @chicagotribune: Breaking: Police respond t...,en,0.0,0,low,121823.0,,Sun Jul 26 02:29:22 +0000 2009,false,✨ Love ✨ Human Rights ✨ Animal Rights ✨ Fan Gi...,12998.0,2401.0,,1423.0,60210356,Phoenixs_Flame_,Chicagoland,Central Time (US & Canada),False
9,Wed Mar 14 20:15:03 +0000 2018,RT @thehill: #BREAKING: Shots fired at Northwe...,en,0.0,0,low,27365.0,,Fri Nov 13 04:57:06 +0000 2015,false,"I believe in the truth, the whole truth, and n...",39672.0,129.0,,114.0,4219854912,TaniaMaglasu,"Whitby, Ontario",,False


In [9]:
# filter out english tweets only 
%time data_filter = data_filter.filter(col('lang')=='en')

CPU times: user 2.09 ms, sys: 571 µs, total: 2.66 ms
Wall time: 82.5 ms


In [10]:
# this creates significant reduction in volume 
data_filter.count()

3432368

In [11]:
# define column data structure 
data_filter = data_filter.withColumn("tweet_created",col("tweet_created").cast('string')).\
withColumn("text",col("text").cast('string')).\
withColumn("retweet_count",col("retweet_count").cast('integer')).\
withColumn("favorite_count",col("favorite_count").cast('integer')).\
withColumn("filter_level",col("filter_level").cast('string')).\
withColumn("quoted_status_id_str",col("quoted_status_id_str").cast('string')).\
withColumn("created_at",col("created_at").cast('string')).\
withColumn("default_profile_image",col("default_profile_image").cast('string')).\
withColumn("description",col("description").cast('string')).\
withColumn("favourites_count",col("favourites_count").cast('integer')).\
withColumn("followers_count",col("followers_count").cast('integer')).\
withColumn("following",col("following").cast('string')).\
withColumn("friends_count",col("friends_count").cast('integer')).\
withColumn("id_str",col("id_str").cast('string')).\
withColumn("location",col("location").cast('string')).\
withColumn("time_zone",col("time_zone").cast('string')).\
withColumn("verified",col("verified").cast('string'))

In [12]:
# define bucket
my_bucket = 'msca-bdp-students-bucket/shared_data/kmr1'

In [10]:
# filter out uchicago data 
uchi_words = ["uchicago", 
         "university of chicago"
         ]

%time uchicago = data_filter.filter(psf.lower(data_filter.text).rlike('|'.join(uchi_words)))

CPU times: user 4.11 ms, sys: 0 ns, total: 4.11 ms
Wall time: 22.5 ms


In [11]:
# number of records 
uchicago.count()

572868

In [12]:
# save as pandas df 
%time uchicago_df = uchicago.toPandas()

CPU times: user 5.91 s, sys: 744 ms, total: 6.65 s
Wall time: 13 s


In [21]:
# save as csv file
uchicago_df.to_csv('gs://' + my_bucket + '/uchicago_df')

In [14]:
# filter out uchicago words 
depaul_words = [
         "depaul", 
         "depaul university", 
         "depaulu"
         ]

%time depaul = data_filter.filter(psf.lower(data_filter.text).rlike('|'.join(depaul_words)))

# results in 504,813 records 
depaul.count()

CPU times: user 2.56 ms, sys: 176 µs, total: 2.73 ms
Wall time: 65 ms


504813

In [22]:
# save as csv 
%time depaul_df = depaul.toPandas()
depaul_df.to_csv('gs://' + my_bucket + '/depaul_df')

CPU times: user 6.92 s, sys: 235 ms, total: 7.15 s
Wall time: 11.7 s


In [13]:
# filter out northwestern data
northwestern_words = [
         "northwestern",
         "northwesternu",
         "northwestern university",
         ]

%time northwestern = data_filter.filter(psf.lower(data_filter.text).rlike('|'.join(northwestern_words)))

# results in 2.1 million tweets, which are significant reduced in later analysis 
northwestern.count()

CPU times: user 2.65 ms, sys: 0 ns, total: 2.65 ms
Wall time: 27 ms


2122479

In [14]:
# save as csv 
%time northwestern_df = northwestern.toPandas()
northwestern_df.to_csv('gs://' + my_bucket + '/northwestern_df')

CPU times: user 22.9 s, sys: 1.81 s, total: 24.7 s
Wall time: 39.3 s


In [18]:
# filter out uic data 
uic_words = ["university of illinois at chicago",
             "uic.edu", 
             " uic ",
             " uic's "
            ]

%time uic = data_filter.filter(psf.lower(data_filter.text).rlike('|'.join(uic_words)))

# results in 182,460 records 
uic.count()

CPU times: user 2.58 ms, sys: 0 ns, total: 2.58 ms
Wall time: 62.5 ms


182460

In [19]:
# save as pandas dataframe
%time uic_df = uic.toPandas()
uic_df.to_csv('gs://' + my_bucket + '/uic_df')

CPU times: user 1.9 s, sys: 56.5 ms, total: 1.96 s
Wall time: 6.9 s


In [15]:
# unpersist data file 
data_filter.unpersist()

DataFrame[tweet_created: string, text: string, lang: string, retweet_count: int, favorite_count: int, filter_level: string, statuses_count: string, quoted_status_id_str: string, created_at: string, default_profile_image: string, description: string, favourites_count: int, followers_count: int, following: string, friends_count: int, id_str: string, screen_name: string, location: string, time_zone: string, verified: string]