In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, DoubleType
from pyspark.sql.functions import lit, isnan, size, col

In [2]:
# Raw STS CSV files (not referenced by the cells visible in this notebook).
STS_RAW_TRAIN_PATH = '../data/sts/training.1600000.processed.noemoticon.csv'
# NOTE(review): unlike the training path, this one has no 'sts/' directory
# ('ststestdata...'); possibly a missing '/' -- confirm against the data folder.
STS_RAW_TEST_PATH = '../data/ststestdata.manual.2009.06.14.csv'

# Directories holding the processed STS train / test CSV files.
STS_PROCESED_TRAIN_PATH = '../data/processed/sts/sts_train'
STS_PROCESED_TEST_PATH = '../data/processed/sts/sts_test'

# Directory holding the processed (unlabeled) COVID tweet CSV files.
COVID_PROCESSED_PATH = '../data/processed/full-tweets-sanitized/tweets-santized'

In [3]:
# Create (or reuse) a Spark session; 'local' is used both as the master URL
# and as the application name.
spark = (
    SparkSession.builder
    .master('local')
    .appName('local')
    .getOrCreate()
)

# Load Data

In [4]:
# Schema of the processed STS CSV files: id, text, label (all nullable).
sts_processed_schema = (
    StructType()
    .add('id', LongType(), True)
    .add('text', StringType(), True)
    .add('label', IntegerType(), True)
)

# Schema of the processed COVID CSV files: same id/text columns, no label.
covid_processed_schema = (
    StructType()
    .add('id', LongType(), True)
    .add('text', StringType(), True)
)

In [5]:
# Load every '*.csv' file from each processed directory (no header row, explicit
# schema) and tag the rows with a 'type' column so the datasets can be separated
# again after they are unioned.
df_sts_processed_train = (
    spark.read
    .csv(STS_PROCESED_TRAIN_PATH + '/' + '*.csv', header=False, schema=sts_processed_schema)
    .withColumn('type', lit('train'))
)
df_sts_processed_test = (
    spark.read
    .csv(STS_PROCESED_TEST_PATH + '/' + '*.csv', header=False, schema=sts_processed_schema)
    .withColumn('type', lit('test'))
)
df_covid_processed = (
    spark.read
    .csv(COVID_PROCESSED_PATH + '/' + '*.csv', header=False, schema=covid_processed_schema)
    # The COVID tweets have no label. Cast the NULL placeholder to the STS label
    # type so this frame carries a typed integer column on its own, rather than
    # an untyped NULL column that only gets a type through union coercion.
    .select('id', 'text', lit(None).cast(IntegerType()).alias('label'))
    .withColumn('type', lit('covid'))
)

In [6]:
# Stack the three tagged frames into one. union() pairs columns by position,
# so this relies on every input having the (id, text, label, type) layout.
df_all = (
    df_sts_processed_train
    .union(df_sts_processed_test)
    .union(df_covid_processed)
)

In [7]:
# Keep only the rows whose text column is populated
df_all = df_all.where(col("text").isNotNull())

# Build Pipeline
1. Tokenize Words
2. Remove Stop Words
3. Build Feature Vector

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer

In [9]:
# Hyperparameters handed to the CountVectorizer stage
VOCAB_SIZE = 10000  # passed as vocabSize
MIN_DF = 5  # passed as minDF

# Spark's built-in English stop-word list
default_stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [10]:
# Stage 1: split the text column on non-word characters (regex \W).
tokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="tokenized_text",
    pattern="\\W",
)
# Stage 2: drop the default English stop words from the token lists.
stop_words_remover = StopWordsRemover(
    inputCol="tokenized_text",
    outputCol="filtered_text",
    stopWords=default_stop_words,
)
# Stage 3: turn the remaining tokens into term-count feature vectors.
vectorizer = CountVectorizer(
    inputCol="filtered_text",
    outputCol="features",
    vocabSize=VOCAB_SIZE,
    minDF=MIN_DF,
)

pipeline = Pipeline(stages=[tokenizer, stop_words_remover, vectorizer])

In [11]:
# Fit the feature pipeline on the combined frame and featurize the same rows.
# NOTE(review): df_all also contains the 'test' and 'covid' rows, so the
# vocabulary is learned from them as well -- confirm this is intended.
pipeline_fit = pipeline.fit(df_all)
df_all_fit = pipeline_fit.transform(df_all)

In [12]:
# Preview the featurized rows (20 rows, long cell values truncated).
df_all_fit.show(n=20, truncate=True)

+----------+--------------------+-----+-----+--------------------+--------------------+--------------------+
|        id|                text|label| type|      tokenized_text|       filtered_text|            features|
+----------+--------------------+-----+-----+--------------------+--------------------+--------------------+
|1467810369|awww that is a bu...|    0|train|[awww, that, is, ...|[awww, bummer, sh...|(10000,[2,13,105,...|
|1467810672|is upset that he ...|    0|train|[is, upset, that,...|[upset, update, f...|(10000,[7,72,147,...|
|1467810917|i dived many time...|    0|train|[i, dived, many, ...|[dived, many, tim...|(10000,[6,151,188...|
|1467811184|my whole body fee...|    0|train|[my, whole, body,...|[whole, body, fee...|(10000,[5,340,402...|
|1467811193|no it is not beha...|    0|train|[no, it, is, not,...|[behaving, m, mad...|(10000,[0,23,607]...|
|1467811372|  not the whole crew|    0|train|[not, the, whole,...|       [whole, crew]|(10000,[340,1924]...|
|1467811592|       

# Train Naive Bayes
Train a Naive Bayes classifier on the STS training rows, then score the STS test rows.

In [13]:
from pyspark.ml.classification import NaiveBayes

In [14]:
# Separate the featurized rows back into their source datasets via the 'type' tag.
df_train = df_all_fit.where(col('type') == 'train')
df_test = df_all_fit.where(col('type') == 'test')
df_covid = df_all_fit.where(col('type') == 'covid')

In [15]:
# Fit Naive Bayes on the training rows, reading the 'features' and 'label' columns.
# NOTE(review): the test preview in this notebook shows label 4 next to
# prediction 1.0, so raw labels and predicted class values do not appear to share
# an encoding -- confirm before comparing the two columns directly.
nb = NaiveBayes(featuresCol='features', labelCol='label')
model = nb.fit(df_train)

In [16]:
# Score the held-out STS test rows with the fitted model.
preds = model.transform(dataset=df_test)

In [17]:
# Preview the test predictions (20 rows, long cell values truncated).
preds.show(n=20, truncate=True)

+---+--------------------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| id|                text|label|type|      tokenized_text|       filtered_text|            features|       rawPrediction|         probability|prediction|
+---+--------------------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  3|i loooooooovvvvvv...|    4|test|[i, loooooooovvvv...|[loooooooovvvvvve...|(10000,[42,133,10...|[-34.201567574022...|[0.09307668114179...|       1.0|
|  4|reading my kindle...|    4|test|[reading, my, kin...|[reading, kindle,...|(10000,[3,8,248,4...|[-47.781892805679...|[0.04470161962106...|       1.0|
|  5|ok first assesmen...|    4|test|[ok, first, asses...|[ok, first, asses...|(10000,[63,110,59...|[-42.262571262719...|[0.25596519931714...|       1.0|
|  6|you you will love...|    4|test|[you, you, will, ...|[love, kindle, ve.

# Labeling COVID Dataset

In [24]:
# Output directory for the labeled COVID tweets written at the end of the notebook.
COVID_LABELED_PATH = "../data/processed/full-tweets-labeled"

In [25]:
# Label the (unlabeled) COVID rows with the model trained on STS.
covid_preds = model.transform(dataset=df_covid)

In [26]:
# Preview the COVID predictions (20 rows, long cell values truncated).
covid_preds.show(n=20, truncate=True)

+-------------------+--------------------+-----+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                 id|                text|label| type|      tokenized_text|       filtered_text|            features|       rawPrediction|         probability|prediction|
+-------------------+--------------------+-----+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|1246892082888945666|my location at mo...| null|covid|[my, location, at...|[location, modi, ...|(10000,[1,44,207,...|[-166.84942564826...|[0.40145002817456...|       1.0|
|1246892725158449152|corona covid figh...| null|covid|[corona, covid, f...|[corona, covid, f...|(10000,[1,44,60,1...|[-125.12106286087...|[0.32846164192647...|       1.0|
|1246894604307312640|covid digital pai...| null|covid|[covid, digital, ...|[covid, digital, ...|(10000,[1,44,463,...|[-79.079529527734...|[0.1508

In [27]:
from pyspark.sql.functions import udf

def _probability_at_index_one(v):
    """Return element 1 of the given probability vector as a Python float."""
    return float(v[1])


# Column function that extracts the index-1 probability as a double.
weighted_prob = udf(_probability_at_index_one, DoubleType())

# Keep only the tweet id, the predicted class and the extracted probability.
covid_preds_final = (
    covid_preds
    .withColumn("weighted_label", weighted_prob("probability"))
    .select("id", "prediction", "weighted_label")
)

In [28]:
# Write the labeled COVID tweets as CSV under COVID_LABELED_PATH.
# repartition(1) gathers the rows into a single partition before writing.
# mode('overwrite') replaces any previous export; with the default save mode the
# write raises when the path already exists, so re-running the notebook would fail.
(
    covid_preds_final
    .repartition(1)
    .write
    .mode('overwrite')
    .csv(COVID_LABELED_PATH)
)