In [1]:
import os
import shutil
from itertools import islice
import requests

import pandas as pd
import matplotlib.pyplot as plt

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Create spark_session
spark = SparkSession.builder.getOrCreate()
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '32g'), ('spark.app.name', 'Spark Updated Conf'), 
                                        ('spark.executor.cores', '32'), ('spark.cores.max', '32'), ('spark.driver.memory','32g')])
spark.sparkContext.getConf().getAll()

from IPython.display import clear_output
clear_output(wait = False)

spark.version

import time
start_time = time.time()

In [2]:
from google.cloud import storage

## Reading files and filtering based on Twitter Doc

In [3]:
path = "gs://msca-bdp-tweets/"

In [4]:
def clean_up(df):
    return df\
            .filter('user.followers_count > 0')\
            .filter('possibly_sensitive == FALSE or possibly_sensitive is NULL')\
            .filter('withheld_in_countries is NULL')\
            .filter('truncated == "False"')\
            .filter('lang == "en"')\
            .withColumn("text", F.lower(F.col("text")))

In [5]:
%%time
df_tweets_master = spark.read.json(path + 'final_project/')

22/12/05 14:49:48 WARN org.apache.spark.sql.execution.datasources.SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
                                                                                

CPU times: user 1.72 s, sys: 386 ms, total: 2.11 s
Wall time: 7min 50s


22/12/05 14:56:32 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [6]:
df_tweets_master.printSchema()

#Useful columns = created_at, entities.hashtags.text, place.country_code, retweet_count, retweet_status, text, user.name, user.followers_count, user.verified, reply_count 

root
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: boolean (nullable = true)
 |    |    |    |   

In [7]:
%%time
df_tweets_master_filtered = clean_up(df_tweets_master)
print(df_tweets_master_filtered.count())

[Stage 4:>                                                          (0 + 1) / 1]

77410172
CPU times: user 807 ms, sys: 168 ms, total: 976 ms
Wall time: 3min 21s


                                                                                

In [8]:
df_tweets_master_filtered.limit(10).toPandas()

                                                                                

Unnamed: 0,coordinates,created_at,display_text_range,entities,extended_entities,extended_tweet,favorite_count,favorited,filter_level,geo,...,retweeted_from,retweeted_status,source,text,timestamp_ms,truncated,tweet_text,user,withheld_copyright,withheld_in_countries
0,,Tue May 24 22:09:56 +0000 2022,,"([], None, [], [], [(28785486, 28785486, [3, 7...",,,0,False,low,,...,ABC,"(None, Tue May 24 21:54:24 +0000 2022, [0, 140...","<a href=""http://twitter.com/download/iphone"" r...","rt @abc: “why are you here?!""\n\na furious sen...",1653430196731,False,"“Why are you here?!""\n\nA furious Sen. Chris M...","(False, Thu Mar 05 21:39:02 +0000 2009, False,...",,
1,,Tue May 24 22:09:56 +0000 2022,,"([], None, [], [], [(14529929, 14529929, [3, 1...",,,0,False,low,,...,jaketapper,"(None, Tue May 24 20:42:47 +0000 2022, None, (...","<a href=""https://mobile.twitter.com"" rel=""nofo...",rt @jaketapper: fifteen have been killed in a ...,1653430196767,False,Fifteen have been killed in a shooting at Robb...,"(False, Fri Jun 24 02:59:29 +0000 2011, False,...",,
2,,Tue May 24 22:09:56 +0000 2022,,"([], None, [], [], [(29585629, 29585629, [3, 1...",,,0,False,low,,...,Josh_Moon,"(None, Tue May 24 21:01:49 +0000 2022, None, (...","<a href=""http://twitter.com/download/iphone"" r...",rt @josh_moon: 14 dead elementary school kids....,1653430196816,False,14 dead elementary school kids. 14. Not from C...,"(False, Wed Jul 08 02:17:31 +0000 2009, False,...",,
3,,Tue May 24 22:09:56 +0000 2022,,"([], None, [], [], [(18166778, 18166778, [3, 1...",,,0,False,low,,...,Jim_Jordan,"(None, Tue May 24 18:52:33 +0000 2022, None, (...","<a href=""http://twitter.com/download/android"" ...",rt @jim_jordan: last week we learned:\n\n-hill...,1653430196814,False,Last week we learned:\n\n-Hillary Clinton orde...,"(False, Thu Feb 19 21:48:10 +0000 2009, True, ...",,
4,,Tue May 24 22:09:56 +0000 2022,,"([], None, [], [], [(3169003537, 3169003537, [...",,,0,False,low,,...,meganbang3,"(None, Mon May 23 11:17:26 +0000 2022, [0, 140...","<a href=""http://twitter.com/download/iphone"" r...",rt @meganbang3: my son was not allowed to walk...,1653430196900,False,My son was not allowed to walk in his high sch...,"(False, Mon Mar 19 02:44:38 +0000 2012, True, ...",,
5,,Tue May 24 22:09:56 +0000 2022,,"([], None, [], [], [(2468096389, 2468096389, [...",,,0,False,low,,...,jewishaction,"(None, Tue May 24 21:54:01 +0000 2022, None, (...","<a href=""http://twitter.com/download/iphone"" r...",rt @jewishaction: we're horrified by the news ...,1653430196899,False,We're horrified by the news of the shooting at...,"(False, Fri May 31 18:32:52 +0000 2013, False,...",,
6,,Tue May 24 22:09:56 +0000 2022,,"([], None, [], [], [(1577522323, 1577522323, [...",,,0,False,low,,...,LRiddickESPN,"(None, Tue May 24 20:50:02 +0000 2022, None, (...","<a href=""http://twitter.com/download/iphone"" r...",rt @lriddickespn: uh…question …wtf are those o...,1653430196992,False,Uh…question …WTF are those of you “in charge” ...,"(False, Fri Jul 29 13:40:33 +0000 2016, True, ...",,
7,,Tue May 24 22:09:57 +0000 2022,,"([], None, [], [], [(1435243987200266241, 1435...",,,0,False,low,,...,Gisele23935327,"(None, Tue May 24 22:08:18 +0000 2022, None, (...","<a href=""https://mobile.twitter.com"" rel=""nofo...",rt @gisele23935327: who did not shoot up a sch...,1653430197678,False,Who did not shoot up a school today?\nA border...,"(False, Thu Jul 29 04:55:18 +0000 2010, False,...",,
8,,Tue May 24 22:09:57 +0000 2022,,"([], None, [], [], [(50317463, 50317463, [3, 1...",,,0,False,low,,...,MichaelSteele,"(None, Tue May 24 20:56:58 +0000 2022, None, (...","<a href=""http://twitter.com/download/iphone"" r...",rt @michaelsteele: 14 children and a teacher h...,1653430197674,False,14 children and a teacher have been killed in ...,"(False, Fri Aug 06 03:28:10 +0000 2010, False,...",,
9,,Tue May 24 22:09:57 +0000 2022,,"([], None, [], [], [(16815644, 16815644, [3, 1...",,,0,False,low,,...,ABCPolitics,"(None, Tue May 24 21:55:33 +0000 2022, [0, 140...","<a href=""http://twitter.com/download/iphone"" r...",rt @abcpolitics: sen. chris murphy delivers re...,1653430197693,False,"Sen. Chris Murphy delivers remarks on Uvalde, ...","(False, Wed Jul 19 18:57:41 +0000 2017, True, ...",,


## Getting Keywords from Wikipedia and fine-tuning the keywords set for filtering

In [9]:
%%time

import os
print(os.getcwd())
os.chdir('/usr/')

%run WikipediaKeyWordsExtraction.ipynb

Loaded Wikipedia Word Extractorfile!
CPU times: user 7.25 s, sys: 513 ms, total: 7.77 s
Wall time: 42.5 s


In [10]:
pages = ['K-12', 'K-12_education_in_the_United_States','No_Child_Left_Behind_Act']

keywords = []
for i in pages:
    words = get_key_words_from_wikipedia_page(i)
    for j in words:
        keywords.append(j)

In [11]:
keywords = list(set(keywords))
print(len(keywords))
print(keywords)

45
['districts', 'parents', 'education', 'progress', 'district', 'children', 'kindergarten', 'student', 'found', 'child', 'grade', 'testing', 'private', 'state', 'public', 'standards', 'scores', 'degree', 'schools', 'teachers', 'american', 'elementary', 'improve', 'level', 'primary', 'educational', 'systems', 'school', 'achievement', 'disabilities', 'grades', 'accountability', 'college', 'community', 'states', 'students', 'standardized', 'classes', 'local', 'requirements', 'united', 'include', 'years', 'secondary', 'commonly']


In [12]:
keywords.remove('student')
keywords.remove('school')
keywords.remove('child')
keywords.remove('testing')
keywords.remove('college')
keywords.remove('american')
keywords.remove('children')
keywords.remove('elementary')
keywords.remove('primary')
keywords.remove('secondary')


usual_keywords = ['math', 'science', 'primary_school', 'elementary_school', 'high_school', 'pre_school', 
                  'middle_school', 'k12', 'k-12', 'books', 'curriculum', 'learning', 'colleges', 'tuition'] ## 30millions tweets have these words

keywords_to_avoid = ['kill', 'die', 'dead', 'shoot', 'gun', 'murder', 'slaughter', 'shot', 'sex', 'fuck', 'porn', 
                     'terror', 'protest', 'violen', 'ukraine', 'crush', 'arrest', 'kidnap', 'victim', 'massacre', 'trans', 'lgbt'] ### Explicit sentences are still visible

for i in usual_keywords:
    if i not in keywords:
        keywords.append(i)

keywords = list(set(keywords))
print(len(keywords))
print("Final Keywords: ", keywords)

49
Final Keywords:  ['districts', 'parents', 'education', 'elementary_school', 'progress', 'district', 'kindergarten', 'found', 'grade', 'private', 'state', 'math', 'public', 'standards', 'scores', 'colleges', 'degree', 'schools', 'teachers', 'improve', 'level', 'educational', 'curriculum', 'k-12', 'learning', 'systems', 'achievement', 'middle_school', 'books', 'disabilities', 'grades', 'tuition', 'accountability', 'community', 'states', 'high_school', 'students', 'standardized', 'classes', 'local', 'requirements', 'science', 'k12', 'pre_school', 'united', 'include', 'primary_school', 'years', 'commonly']


In [13]:
filter_string = 'text like "%' + keywords[0] + '%"'

for i in keywords[1:]:
    filter_string += ' or text like "%' + i + '%"'

In [14]:
sample = df_tweets_master_filtered.limit(2500).cache()
sample.count()

                                                                                

2500

In [15]:
%%time
pd.set_option('display.max_colwidth', 10000)
res = sample.filter(filter_string).select(['text'])
display(res.limit(5).toPandas())
print(res.count())


Unnamed: 0,text
0,rt @ninaturner: the ruling class has made it so a college degree is the most accepted way for the poor and working poor to make it to the m…
1,"rt @zjemptv: the national catholic bioethics center enforces exclusions of trans care in catholic hospital systems via audits of a ""catholi…"
2,"rt @spiffnyy: i feel like companies always play up back-to-school clothing for young students, as if teachers and school staff don’t also p…"
3,"rt @billius27: if you are surprised that governments and public health including the cdc are taking a hands off approach to covid now, reme…"
4,"rt @ucnz: catch uc people speaking at this year’s nz international education conference - grad @theabbasnazari, nzisa pres vikram selvaraj…"


1081
CPU times: user 22 ms, sys: 7.95 ms, total: 30 ms
Wall time: 1.09 s


In [16]:
filter_string_words_to_avoid = "text not like '%" + keywords_to_avoid[0] + "%'"    

for i in keywords_to_avoid[1:]:
    filter_string_words_to_avoid += " and text not like '%" + i + "%'"  
    
print(filter_string_words_to_avoid)
display(res.filter(filter_string_words_to_avoid).select(['text']).limit(5).toPandas())
print(res.filter(filter_string_words_to_avoid).select(['text']).count())


text not like '%kill%' and text not like '%die%' and text not like '%dead%' and text not like '%shoot%' and text not like '%gun%' and text not like '%murder%' and text not like '%slaughter%' and text not like '%shot%' and text not like '%sex%' and text not like '%fuck%' and text not like '%porn%' and text not like '%terror%' and text not like '%protest%' and text not like '%violen%' and text not like '%ukraine%' and text not like '%crush%' and text not like '%arrest%' and text not like '%kidnap%' and text not like '%victim%' and text not like '%massacre%' and text not like '%trans%' and text not like '%lgbt%'


Unnamed: 0,text
0,rt @ninaturner: the ruling class has made it so a college degree is the most accepted way for the poor and working poor to make it to the m…
1,"rt @spiffnyy: i feel like companies always play up back-to-school clothing for young students, as if teachers and school staff don’t also p…"
2,"rt @billius27: if you are surprised that governments and public health including the cdc are taking a hands off approach to covid now, reme…"
3,"rt @ucnz: catch uc people speaking at this year’s nz international education conference - grad @theabbasnazari, nzisa pres vikram selvaraj…"
4,"rt @cfbnerds: colleges ranked by nfl snaps (off &amp; def) in 2021 season:\n1.alabama - 35,224\n2.lsu - 27,503\n3.ohio state - 26,614\n4.georgia -…"


727


In [17]:
from tqdm import tqdm
res = []
for keys in tqdm(keywords):
    filter_string_temp = 'text like "%' + keys + '%"'
    temp = sample.filter(filter_string_temp).count()
    res.append([temp, keys])
    
sorted(res,key=lambda l:l[0], reverse = True)[:10]

100%|██████████| 49/49 [00:07<00:00,  6.16it/s]


[[248, 'elementary_school'],
 [242, 'schools'],
 [125, 'high_school'],
 [94, 'parents'],
 [77, 'students'],
 [69, 'public'],
 [64, 'state'],
 [64, 'years'],
 [47, 'teachers'],
 [33, 'primary_school']]

In [24]:
df_tweets_master_filtered_keywords = df_tweets_master_filtered.filter(filter_string_words_to_avoid)

In [25]:
@F.udf
def get_importance(text):
    global keywords
    words = text.split()
    total_count = 0
    for i in keywords:
        occurance_count = words.count(i)
        total_count += occurance_count

    if(total_count > 1):
        return 1
    else:
        return 0

#On original 3.7million
df_tweets_master_filtered_keywords = df_tweets_master_filtered_keywords.withColumn("important", get_importance("text"))

In [None]:
%%time
df_tweets_master_filtered_keywords_doubleFiltered = df_tweets_master_filtered_keywords.filter("important == 1")
df_tweets_master_filtered_keywords_doubleFiltered.count()

[Stage 124:>                                                        (0 + 1) / 1]

CPU times: user 4.26 s, sys: 1 s, total: 5.26 s
Wall time: 17min 25s


                                                                                

3119357

### Storing intermediate results

In [None]:
%%time 
df_tweets_master_filtered_keywords_doubleFiltered.write.mode("overwrite").\
                            parquet("gs://msca-bdp-students-bucket/shared_data/jasmeetsingh/df_tweets_master_filtered_keywords_doubleFiltered")

                                                                                

CPU times: user 5.03 s, sys: 1.44 s, total: 6.47 s
Wall time: 27min 26s


In [None]:
%%time 
df_tweets_master_filtered_keywords_doubleFiltered_verifying = spark.read.\
                            parquet("gs://msca-bdp-students-bucket/shared_data/jasmeetsingh/df_tweets_master_filtered_keywords_doubleFiltered")
df_tweets_master_filtered_keywords_doubleFiltered_verifying.count()



CPU times: user 134 ms, sys: 40.9 ms, total: 175 ms
Wall time: 49.6 s


                                                                                

3119357

In [None]:
print("--- Total Time to run the notebook: %s seconds ---" % (time.time() - start_time))

--- Total Time to run the notebook: 5383.542690992355 seconds ---
