# BDP Final Assignment: Twitter Education, Part 1 (Data Preprocessing)

# Setup environment and tools

In [1]:
#Ensure we are using the right kernel
spark.version

'3.1.3'

In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
pd.reset_option('display.max_rows')

#import warnings
#warnings.filterwarnings(action='ignore')
#warnings.simplefilter('ignore')

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import SparkSession

GCP Tools and Functions

In [4]:
from google.cloud import storage

In [5]:
# List all files in given COS directory
def list_blobs(bucket_name, folder_name):
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blobs = list(bucket.list_blobs(prefix=folder_name))

    for blob in blobs:
        print(blob.name + '\t' + str(blob.size))

# List all files in given COS directory
def list_blobs_pd(bucket_name, folder_name):
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blobs = list(bucket.list_blobs(prefix=folder_name))

    blob_name = []
    blob_size = []
    
    for blob in blobs:
        blob_name.append(blob.name)
        blob_size.append(blob.size)

    blobs_df = pd.DataFrame(list(zip(blob_name, blob_size)), columns=['Name','Size'])

    blobs_df = blobs_df.style.format({"Size": "{:,.0f}"}) 
    
    return blobs_df        

# Delete folder from COS bucket
def delete_folder(bucket_name, folder_name):
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blobs = list(bucket.list_blobs(prefix=folder_name))

    for blob in blobs:
        blob.delete()

In [6]:
# Reading data from open bucket, avaible to all students
bucket_read = 'msca-bdp-tweets'
folder_read = 'final_project'

In [7]:
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

# Part 1: Loading data

According to the twitter developer documentation, we know that:
* strcture: 
    * tweet info: tweet content, tweet information (quotes, favorites), time created, user info (name, id, location, etc.)
    * retweet (if exists): tweet info from the original tweet (formatted as same as above)
* the info we want: 

## Read full data

In [8]:
all_json_path = 'gs://msca-bdp-tweets/final_project'

In [9]:
%%time

educationJSON_DF = spark.read.json(all_json_path)

22/12/06 10:25:55 WARN org.apache.spark.sql.execution.datasources.SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
                                                                                

CPU times: user 1.27 s, sys: 247 ms, total: 1.51 s
Wall time: 7min 3s


22/12/06 10:31:46 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [10]:
educationJSON_DF.printSchema()

root
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: boolean (nullable = true)
 |    |    |    |   

# Part 2: Data Cleaning

Steps of data cleaning are:
1. filtering out irrelevant tweets
2. 
3. 

## Filtering

In [11]:
#select variables

data = educationJSON_DF.select([educationJSON_DF.created_at,
                                educationJSON_DF.id,
                                educationJSON_DF.lang,
                                educationJSON_DF.text,
                                educationJSON_DF.retweet_count,
                                educationJSON_DF.favorite_count,
                                educationJSON_DF.quote_count,
                                
                                educationJSON_DF.retweeted,
                                #educationJSON_DF.retweeted_status,
                                educationJSON_DF.retweeted_status['favorite_count'].alias('rtstatus_favorite_count'),
                                educationJSON_DF.retweeted_status['retweet_count'].alias('rtstatus_retweet_count'),
                                educationJSON_DF.retweeted_status['quote_count'].alias('rtstatus_quote_count'),
                                educationJSON_DF.retweeted_status.entities.hashtags['text'].alias('rt_hashtags'),
                                
                                educationJSON_DF.retweeted_status.user['id'].alias('source_rt_usr_id'),
                                educationJSON_DF.retweeted_status['id'].alias('source_rt_id'),
                                
                                educationJSON_DF.place['full_name'].alias('location'),
                                educationJSON_DF.place['country'].alias('country'),
                                educationJSON_DF.place['country_code'].alias('country_code'),
                                educationJSON_DF.user['verified'].alias('verified_user'),
                                educationJSON_DF.user['id'].alias("user_id"),
                                educationJSON_DF.user['name'].alias("user_name"),
                                educationJSON_DF.user['followers_count'].alias("followers_count"),
                                educationJSON_DF.user['description'].alias("user_description"),
                                
                                #educationJSON_DF.quoted_status,
                                #educationJSON_DF.quoted_status['favorite_count'].alias('qt_favorite_count'),
                                #educationJSON_DF.quoted_status['retweet_count'].alias('qt_retweet_count'),
                                #educationJSON_DF.quoted_status['quote_count'].alias('qt_quote_count')
                               ])

In [None]:
#data.printSchema()

There are about 99992797 tweets.

```
%%time
data.count()
---
count: 99992797
```

### Now we are ready to filter tweets

Education: Related words (from https://relatedwords.org/relatedto/education)

In [13]:
# a set of topics
# Ref: https://www.teachthought.com/twitter-hashtags-for-teacher/

race_equity = ['blackedu', 'latinoedu', 'nativeedu', 'urbaned', 'nclb', 'occupyeducation', 'diversity', 'heritage']
literaycy = ['literacy', 'multiliteracy', 'infolit', 'homeschooling', 'hiphomeschool'] 
tech_digital = ['digitalcitizenship', 'digcit', 'edtech', 'cyberbullying', 'k12online']
generally_popular = ['elearning', 'ntchat', 'passiondriven', 'elearning']
special_need = ['ece', 'specialneeds','dyslexia','tck']
curricumlum = ['commoncore', 'cchat', 'books']
college = ['ACT', 'SAT', 'scholarship', 'tuition', 'studentloan']
noise = ['primary education', 'secondary education', 'higher education', 'k12', 'teacher', 'parenting']

keywords = race_equity + literaycy + tech_digital + generally_popular + special_need + curricumlum + college + noise

In [14]:
%%time
# convert columns with strings (tweet text) to lowercase
data = data.withColumn("text",F.lower(F.col("text")))

# create dataframes with strings with education-related words
edu_data = data.filter((data.lang == 'en')) \
               .filter(data.text.rlike('|'.join(keywords)))

CPU times: user 3.81 ms, sys: 814 µs, total: 4.63 ms
Wall time: 71.8 ms


In [15]:
%%time
edu_data.count()
# last result=3565699

22/12/06 10:33:44 WARN org.apache.spark.deploy.yarn.YarnAllocator: Container from a bad node: container_1670295291905_0025_01_000007 on host: hub-msca-bdp-dphub-students-backup-hjiang248-sw-zg85.c.msca-bdp-students.internal. Exit status: 143. Diagnostics: [2022-12-06 10:33:44.456]Container killed on request. Exit code is 143
[2022-12-06 10:33:44.488]Container exited with a non-zero exit code 143. 
[2022-12-06 10:33:44.489]Killed by external signal
.
22/12/06 10:33:44 WARN org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 7 for reason Container from a bad node: container_1670295291905_0025_01_000007 on host: hub-msca-bdp-dphub-students-backup-hjiang248-sw-zg85.c.msca-bdp-students.internal. Exit status: 143. Diagnostics: [2022-12-06 10:33:44.456]Container killed on request. Exit code is 143
[2022-12-06 10:33:44.488]Container exited with a non-zero exit code 143. 
[2022-12-06 10:33:44.489]Killed by external signal
.
22/12/0

CPU times: user 1.11 s, sys: 207 ms, total: 1.32 s
Wall time: 6min


                                                                                

7448390

In [15]:
edu_data.printSchema()

root
 |-- created_at: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- text: string (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- quote_count: long (nullable = true)
 |-- retweeted: string (nullable = true)
 |-- rtstatus_favorite_count: long (nullable = true)
 |-- rtstatus_retweet_count: long (nullable = true)
 |-- rtstatus_quote_count: long (nullable = true)
 |-- source_rt_usr_id: long (nullable = true)
 |-- source_rt_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- verified_user: boolean (nullable = true)
 |-- user_id: long (nullable = true)
 |-- user_name: string (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- user_description: string (nullable = true)



In [17]:
# Saving results into individual bucket, students must update to their own bucket `msca-bdp-students-bucket` and use `CNET ID` as a folder prefix
bucket_write = 'msca-bdp-students-bucket'
folder_write = 'shared_data/hjiang248/final_sdf_v9'

In [None]:
%%time
#edu_data.write.format('parquet').\
#mode('overwrite').\
#save('gs://' + bucket_write + '/' + folder_write)

22/12/06 10:49:05 WARN org.apache.spark.deploy.yarn.YarnAllocator: Container from a bad node: container_1670295291905_0025_01_000040 on host: hub-msca-bdp-dphub-students-backup-hjiang248-sw-k0mv.c.msca-bdp-students.internal. Exit status: 143. Diagnostics: [2022-12-06 10:49:04.961]Container killed on request. Exit code is 143
[2022-12-06 10:49:04.961]Container exited with a non-zero exit code 143. 
[2022-12-06 10:49:04.968]Killed by external signal
.
22/12/06 10:49:05 WARN org.apache.spark.deploy.yarn.YarnAllocator: Container from a bad node: container_1670295291905_0025_01_000043 on host: hub-msca-bdp-dphub-students-backup-hjiang248-sw-k0mv.c.msca-bdp-students.internal. Exit status: 143. Diagnostics: [2022-12-06 10:49:04.961]Container killed on request. Exit code is 143
[2022-12-06 10:49:04.969]Container exited with a non-zero exit code 143. 
[2022-12-06 10:49:04.970]Killed by external signal
.
22/12/06 10:49:05 ERROR org.apache.spark.scheduler.cluster.YarnScheduler: Lost executor 40 o

CPU times: user 1.98 s, sys: 340 ms, total: 2.32 s
Wall time: 10min 43s


In [None]:
#list_blobs(bucket_name=bucket_write,
#           folder_name=folder_write)

In [None]:
#delete_folder(bucket_name=bucket_write,
#           folder_name=folder_write)

# Appendix: Don't run codes below

In [29]:
# check the structure of entiteis
#test = data.limit(1000).filter((data.rt_hashtags.isNotNull()) & (F.size('rt_hashtags') > 0)).select(['text', 'rt_hashtags']).toPandas()
#test.head(5)

                                                                                

Unnamed: 0,text,rt_hashtags
0,"RT @AzuWeyane: Dictator Isaias will not stop the #TigrayGenocide. And with the help of @AbiyAhmedAli working to consolidate his power, Erit…",[TigrayGenocide]
1,RT @ChiActivist312: Do you think that @DarrenBaileyIL should drop out of the @ILGOP Gubernatorial Primary if he's unwilling to ban #CRT and…,[CRT]
2,RT @PhenomMixtapes: Menchville (VA) vs Wayne Country Day School highlights at #PhenomTeamCamp #PhenomHoops https://t.co/3v0xVy7o3Q,"[PhenomTeamCamp, PhenomHoops]"
3,RT @JordanWhitney23: 100% committed to the University of Washington! #WeRollin #GoHuskies #DawgD ☔️🐺 @KalenDeBoer @WilliamInge1 @PlayerProM…,"[WeRollin, GoHuskies, DawgD]"
4,RT @narayan_somesh: #MODIJIextendNEETUG\nSir we are poor NEET aspirants\ndon't afford private medical colleges.\nGive some time to secure govt…,[MODIJIextendNEETUG]


In [34]:
#len(test.iloc[0,0])

140

In [35]:
#test.iloc[0,0]

'RT @AzuWeyane: Dictator Isaias will not stop the #TigrayGenocide. And with the help of @AbiyAhmedAli working to consolidate his power, Erit…'

```
edu_data = data.filter((data.lang == 'en')) \
               .filter((data.text.contains("elementary"))\
                          | (data.text.contains("university"))\
                          | (data.text.contains("college"))\
                          | (data.text.contains("primary education"))\
                          | (data.text.contains("secondary education"))\
                          | (data.text.contains("higher education"))\
                          | (data.text.contains("teaching"))\
                          | (data.text.contains("textbook"))\
                          | (data.text.contains("tuition"))\
                          | (data.text.contains("literacy"))\
                          | (data.text.contains("learning"))\
                          | (data.text.contains("curriculum"))\
                          | (data.text.contains("k-12"))\
                          | (data.text.contains("k12"))
                        )
```

```
# a set of topics

race_equity = ['#blackedu', '#latinoedu', '#nativeedu', '#urbaned', '#nclb', '#occupyeducation', '#diversity', '#heritage']
literaycy = ['#literacy', '#multiliteracy', '#infolit', '#homeschooling', '#hiphomeschool'] 
tech_digital = ['#digitalcitizenship', '#digcit', '#edtech', '#cyberbullying', '#k12online']
generally_popular = ['#elearning', '#ntchat', '#passiondriven', '#elearning']
special_need = ['#ece', '#specialneeds','#dyslexia','#tck']
curricumlum = ['#commoncore', '#cchat', '#books']
college = ['#ACT', '#SAT', '#scholarship', 'tuition', '#studentloan']

keywords = race_equity + literaycy + tech_digital + generally_popular + special_need + curricumlum + college
```

### Extract the most frequent keywords in education

For data cleaning, we need to filter out those irrelevant tweets. However, how do we define 'irrelevant'? The method used here is to find out whether a tweet's text contains relevant keywords. Therefore, the question is converted to two parts: (1) find the most frequent keywords in education, and (2) filter tweets based on these keywords.

How do we find keywords?
* extract text from main dataframe and convert to RDD
* for each element of RDD, use `re` and `stopwords` to filter unnecessary words
* use RDD to count word frequency and sort by value

In [11]:
#!pip uninstall -y nltk
#!pip install nltk --upgrade --no-cache-dir

Found existing installation: nltk 3.6.4
Uninstalling nltk-3.6.4:
  Successfully uninstalled nltk-3.6.4
[0mCollecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (772 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.3/772.3 kB[0m [31m270.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: regex, nltk
  Attempting uninstall: regex
    Found existing installation: regex 2021.4.4
    Uninstalling regex-2021.4.4:
      Successfully uninstalled regex-2021.4.4
Successfully installed nltk-3.7 regex-2022.10.31
[0m

In [12]:
#import nltk
#nltk.download('stopwords')
#from nltk.corpus import stopwords

#import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
#text_df = data #.limit(1000)

# convert the tweet text to RDD
#text = text_df.rdd.map(lambda x : x['text']).filter(lambda x: x is not None)

# filter and regulate text rdd
#StopWords = stopwords.words("english")

#tokens = text\
             .map( lambda document: document.strip().lower())\
             .map( lambda document: re.split(" ", document))\
             .map( lambda word: [x for x in word if x.isalnum()])\
             .map( lambda word: [x for x in word if x not in StopWords])\
             .map( lambda word: [x for x in word if len(x) > 3] ) \
             .map( lambda word: ' '.join(word))

In [20]:
#tokens.take(5)

In [None]:
#%%time

#wordCounts = tokens.flatMap(lambda text: text.split(' ')) \
#                    .map(lambda word: (word, 1)) \
#                    .reduceByKey(lambda a, b: a+b)

#wordCountsSorted = wordCounts.map(lambda x:(x[1],x[0])).sortByKey(ascending=False)
#wordCountsSorted.take(50)

The result has an almost infinite length of warnings. I copied it down as below:
```
CPU times: user 7.57 s, sys: 2.46 s, total: 10 s
Wall time: 21min 52s
                                                                                
[(35704754, 'school'),
 (9900037, 'college'),
 (7681303, 'high'),
 (7494595, 'university'),
 (5473403, 'schools'),
 (4173993, 'students'),
 (3820340, 'like'),
 (3012269, 'kids'),
 (2831975, 'people'),
 (2669057, 'professor'),
 (2444927, 'back'),
 (2419163, 'children'),
 (2357668, 'first'),
 (2336066, 'student'),
 (2202714, 'public'),
 (2129934, 'year'),
 (2062591, 'would'),
 (2017622, 'time'),
 (1933909, 'know'),
 (1874478, 'going'),
 (1837505, 'want'),
 (1757324, 'went'),
 (1747276, 'years'),
 (1735947, 'need'),
 (1665897, 'football'),
 (1597627, 'help'),
 (1589832, 'state'),
 (1578455, 'every'),
 (1547869, 'make'),
 (1532546, 'teacher'),
 (1511755, 'today'),
 (1450172, 'education'),
 (1439676, 'think'),
 (1427540, 'elementary'),
 (1414859, 'live'),
 (1399213, 'never'),
 (1391406, 'even'),
 (1379058, 'still'),
 (1369890, 'good'),
 (1364244, 'middle'),
 (1359632, 'teachers'),
 (1328999, 'last'),
 (1315163, 'parents'),
 (1243790, 'take'),
 (1240966, 'texas'),
 (1235432, 'said'),
 (1231646, 'many'),
 (1209927, 'really'),
 (1206415, 'class'),
 (1182268, 'free')]
 
 ```