In [78]:
import pandas as pd
import os
import csv
import json
import re
import numpy as np
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import StructType, StructField, BooleanType, IntegerType, StringType, LongType, DoubleType
import nltk
import twarc
from tableschema import Table

In [44]:
spark = SparkSession.builder.master('local').appName('local').getOrCreate()

In [71]:
NUM_TWEETS_PER_FILE = 1000
DATA_FULL_TWEETS_DIRECTORY   = 'full-tweets'
DATA_FULL_TWEETS_FILE_PREFIX = 'tweets-'

DATA_RAW_TWEETS_PATH       = '../data/raw'
DATA_PROCESSED_TWEETS_PATH = '../data/processed'

LABELED_PATH = '../data/processed/full-tweets-labeled/'

LDA_PATH = '../data/lda/'
LDA_FILE = 'LDA_Output.csv'

In [72]:
df_schema = StructType([
    StructField('id', LongType(), True),
    StructField('prediction', DoubleType(), True),
    StructField('weighted_label', DoubleType(), True),
])

lda_schema = StructType([
    StructField('id', LongType(), True),
    StructField('topic_index', StringType(), True),
    StructField('topic_label', StringType(), True),
])

tableau_schema = StructType([
    StructField('id', LongType(), True),
    StructField('date', StringType(), True),
    StructField('label', IntegerType(), True),
   
    StructField('score_naive', DoubleType(), True),
    StructField('score_evo', DoubleType(), True),
    
    StructField('topic', StringType(), True),
    StructField('topic_lda', StringType(), True),
    
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    
    StructField("isUS", BooleanType(), True),
    StructField("state", StringType(), True),
])

In [73]:
df_all_tweet_ids = spark.read.csv('../data/processed/all_tweet_ids.csv', header=True)

df_all_tweet_ids.show()
total_num_tweets = df_all_tweet_ids.count()

+-------------------+
|           tweet_id|
+-------------------+
|1240728065983959040|
|1240728187136610306|
|1240728221986906113|
|1240728361556750338|
|1240728639358017536|
|1240728647524323338|
|1240728942664794112|
|1240728970368253953|
|1240729290427248645|
|1240729372363014147|
|1240729785883639808|
|1240729879525679105|
|1240730058974621696|
|1240731018220404736|
|1240731410614358021|
|1240732013029502981|
|1240732830411501568|
|1240732896958115841|
|1240733109584347136|
|1240733265461272578|
+-------------------+
only showing top 20 rows



In [47]:
df = spark.read.csv(LABELED_PATH + '/' + '*.csv', header=True, schema=df_schema)
df_lda = spark.read.csv(LDA_PATH + '/' + '*.csv', header=False, schema=lda_schema)
df.printSchema()
df_lda.printSchema()

df_tableau = df.join(lda, on=['id'])

df_tableau.show()

root
 |-- id: long (nullable = true)
 |-- prediction: double (nullable = true)
 |-- weighted_label: double (nullable = true)

root
 |-- id: long (nullable = true)
 |-- topic_index: string (nullable = true)
 |-- topic_label: string (nullable = true)

+-------------------+----------+-------------------+-----------+---------------+
|                 id|prediction|     weighted_label|topic_index|    topic_label|
+-------------------+----------+-------------------+-----------+---------------+
|1246892082888945666|       0.0|0.34590212711893237|    Topic 1|          Other|
|1246892725158449152|       1.0| 0.9765624810792524|    Topic 1|          Other|
|1246894604307312640|       1.0| 0.9851553133346029|    Topic 2|Social Distance|
|1246894744174759950|       0.0| 0.2760483246626915|    Topic 3|          Masks|
|1246895626236964873|       0.0|0.14001506904275743|    Topic 4|       Lockdown|
|1246895919536246785|       0.0|0.17449963481495315|    Topic 1|          Other|
|1246897071942250497|

In [108]:
tweet_ids = [list(row) for row in df_all_tweet_ids.collect()]

path = DATA_PROCESSED_TWEETS_PATH + '/' + DATA_FULL_TWEETS_DIRECTORY + '/' + '*.jsonl'
df_tweets = spark.read.json(path, multiLine=True)

df_tweets = df_tweets.select('id', 'place', 'coordinates', 'created_at')

In [109]:
df_tableau = df_tableau.join(df_tweets, on=['id'])
df_tableau.printSchema()
df_tableau.first()

root
 |-- id: long (nullable = true)
 |-- prediction: double (nullable = true)
 |-- weighted_label: double (nullable = true)
 |-- topic_index: string (nullable = true)
 |-- topic_label: string (nullable = true)
 |-- place: struct (nullable = true)
 |    |-- bounding_box: struct (nullable = true)
 |    |    |-- coordinates: array (nullable = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |    |-- element: double (containsNull = true)
 |    |    |-- type: string (nullable = true)
 |    |-- contained_within: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- country: string (nullable = true)
 |    |-- country_code: string (nullable = true)
 |    |-- full_name: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- place_type: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- co

Row(id=1283997638673801216, prediction=0.0, weighted_label=0.355967552135683, topic_index='success fit day time lockdown slip manga pull average step goal forgot capture night long walk collect', topic_label=None, place=Row(bounding_box=Row(coordinates=[[[-2.353808, 52.0924673], [-2.286305, 52.0924673], [-2.286305, 52.1462783], [-2.353808, 52.1462783]]], type='Polygon'), contained_within=[], country='United Kingdom', country_code='GB', full_name='Great Malvern, England', id='1cea65b77bc560b3', name='Great Malvern', place_type='city', url='https://api.twitter.com/1.1/geo/id/1cea65b77bc560b3.json'), coordinates=Row(coordinates=[-2.31667, 52.1167], type='Point'), created_at='Fri Jul 17 05:31:07 +0000 2020', place=Row(bounding_box=Row(coordinates=[[[-2.353808, 52.0924673], [-2.286305, 52.0924673], [-2.286305, 52.1462783], [-2.353808, 52.1462783]]], type='Polygon'), contained_within=[], country='United Kingdom', country_code='GB', full_name='Great Malvern, England', id='1cea65b77bc560b3', n