In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, DoubleType
from pyspark.sql.functions import lit, isnan, size, col

In [43]:
STS_RAW_TRAIN_FILE = '../data/sts/training.1600000.processed.noemoticon.csv'
STS_RAW_TEST_FILE  = '../data/sts/testdata.manual.2009.06.14.csv'

STS_PROCESED_TRAIN_PATH = '../data/processed/sts/sts_train'
STS_PROCESED_TEST_PATH  = '../data/processed/sts/sts_test'

COVID_PROCESSED_PATH = '../data/processed/full-tweets-sanitized/tweets-sanitized'

In [4]:
spark = SparkSession.builder.master('local').appName('local').getOrCreate()

# Load Data

In [68]:
sts_raw_schema = StructType([
    StructField('label', IntegerType(), True),
    StructField('id', LongType(), True),
    StructField('date', StringType(), True),
    StructField('query', StringType(), True),
    StructField('user', StringType(), True),
    StructField('text', StringType(), True),
])

sts_processed_schema = StructType([
    StructField('id', LongType(), True),
    StructField('text', StringType(), True),
])

covid_processed_schema = StructType([
    StructField('id', LongType(), True),
    StructField('text', StringType(), True),
])

In [105]:
# Google Drive file name     - md5sum 
# Final_preprocessed_sts.csv - ec4e0de0560e2ce9a3c11055b6f41894
# Test_data_processed.csv    - ee4e572acdbb6dc129ca397f7d3f37bc
# 
# Recover the labels from the raw data necessary for training and testing
# 
df_sts_processed_train = spark.read.csv(STS_PROCESED_TRAIN_PATH + '/' + '*.csv', header=False, schema=sts_processed_schema).withColumn('type', lit('train'))
df_sts_processed_test  = spark.read.csv(STS_PROCESED_TEST_PATH + '/' + '*.csv', header=False, schema=sts_processed_schema).withColumn('type', lit('test'))

df_sts_raw_train = spark.read.csv(STS_RAW_TRAIN_FILE, header=False, schema=sts_raw_schema).select('id', 'text', 'label').withColumnRenamed('text', 'raw_text')
df_sts_processed_train_with_labels = df_sts_processed_train.join(df_sts_raw_train, on=['id']).select('id', 'text', 'label', 'type')

df_sts_raw_test = spark.read.csv(STS_RAW_TEST_FILE, header=False, schema=sts_raw_schema).select('id', 'text', 'label').withColumnRenamed('text', 'raw_text')
df_sts_processed_test_with_labels = df_sts_processed_test.join(df_sts_raw_test, on=['id']).select('id', 'text', 'label', 'type')

In [106]:
df_covid_processed     = spark.read.csv(COVID_PROCESSED_PATH + '/' + '*.csv', header=False, schema=covid_processed_schema).select('id', 'text', lit(None).alias('label')).withColumn('type', lit('covid'))

In [107]:
df_all = df_sts_processed_train_with_labels.union(df_sts_processed_test_with_labels).union(df_covid_processed)

In [108]:
# Clean up null rows on text column
df_all = df_all.filter(~ col("text").isNull())

# Build Pipeline
1. Tokenize Words
2. Build Feature Vector

In [109]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer

In [110]:
# Hyper Parameters
VOCAB_SIZE = 10000
MIN_DF     = 5

# Default Stop Words
default_stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [111]:
tokenizer          = RegexTokenizer(inputCol="text", outputCol="tokenized_text", pattern="\\W")
stop_words_remover = StopWordsRemover(inputCol="tokenized_text", outputCol="filtered_text").setStopWords(default_stop_words)
vectorizer         = CountVectorizer(inputCol="filtered_text", outputCol="features", vocabSize=VOCAB_SIZE, minDF=MIN_DF)

pipeline = Pipeline(stages=[tokenizer, stop_words_remover, vectorizer])

In [112]:
pipeline_fit = pipeline.fit(df_all)
df_all_fit   = pipeline_fit.transform(df_all)

In [113]:
df_all_fit.show()
df_all.groupBy('type').count().show()

+----------+--------------------+-----+-----+--------------------+--------------------+--------------------+
|        id|                text|label| type|      tokenized_text|       filtered_text|            features|
+----------+--------------------+-----+-----+--------------------+--------------------+--------------------+
|1467860144|hate limit letter...|    0|train|[hate, limit, let...|[hate, limit, let...|(10000,[22,78,87,...|
|1467862225|website fyi pit w...|    4|train|[website, fyi, pi...|[website, fyi, pi...|(10000,[13,103,15...|
|1467889791|call hillsong sai...|    0|train|[call, hillsong, ...|[call, hillsong, ...|(10000,[3,15,24,3...|
|1467898027|         thought mac|    4|train|      [thought, mac]|      [thought, mac]|(10000,[138,668],...|
|1467904302|www nicki like ha...|    0|train|[www, nicki, like...|[www, nicki, like...|(10000,[5,60,71,1...|
|1467928749|tire ddd want sle...|    0|train|[tire, ddd, want,...|[tire, ddd, want,...|(10000,[13,32,35,...|
|1467946810|  mine 

# Train Naive Bayes
Train the NB model

In [114]:
from pyspark.ml.classification import NaiveBayes

In [115]:
df_train = df_all_fit.filter(df_all_fit.type == 'train')
df_test  = df_all_fit.filter(df_all_fit.type == 'test')
df_covid = df_all_fit.filter(df_all_fit.type == 'covid')

In [116]:
nb = NaiveBayes()
model = nb.fit(df_train)

In [117]:
preds = model.transform(df_test)

In [118]:
preds.show()

+---+--------------------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| id|                text|label|type|      tokenized_text|       filtered_text|            features|       rawPrediction|         probability|prediction|
+---+--------------------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  3|kind cool fantasy...|    4|test|[kind, cool, fant...|[kind, cool, fant...|(10000,[39,124,28...|[-29.738939664474...|[0.08647603750995...|       1.0|
|  4|read kind love le...|    4|test|[read, kind, love...|[read, kind, love...|(10000,[3,4,40,28...|[-48.318688106123...|[0.06833971544105...|       1.0|
|  5|passes kind fuck ...|    4|test|[passes, kind, fu...|[passes, kind, fu...|(10000,[189,272,2...|[-22.292437079434...|[0.39840010410599...|       1.0|
|  6|love kind ive mon...|    4|test|[love, kind, ive,...|[love, kind, ive,.

# Testing Naive Bayes Model

In [133]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
e = evaluator.evaluate(preds.filter(preds['prediction'] == 0))

print("Accuracy:", "{:.2%}".format(e))

Accuracy: 52.10%


# Labeling COVID Dataset

In [119]:
COVID_LABELED_PATH = '../data/processed/full-tweets-labeled'

In [120]:
covid_preds = model.transform(df_covid)

In [121]:
covid_preds.show()

+-------------------+--------------------+-----+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                 id|                text|label| type|      tokenized_text|       filtered_text|            features|       rawPrediction|         probability|prediction|
+-------------------+--------------------+-----+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|               null|     fully_processed| null|covid|   [fully_processed]|   [fully_processed]|       (10000,[],[])|[-0.6927803369175...|[0.50018345546886...|       0.0|
|1246892082888945666|local modi sunday...| null|covid|[local, modi, sun...|[local, modi, sun...|(10000,[43,166,29...|[-150.28050776985...|[0.65409787288106...|       0.0|
|1246892725158449152|corona vid fight ...| null|covid|[corona, vid, fig...|[corona, vid, fig...|(10000,[1,43,58,6...|[-157.36454688188...|[0.0234

In [122]:
from pyspark.sql.functions import udf

weighted_prob = udf(lambda v: float(v[1]), DoubleType())
covid_preds_final = covid_preds.withColumn("weighted_label", weighted_prob("probability")).select("id", "prediction", "weighted_label")

In [123]:
covid_preds_final.repartition(1).write.csv(COVID_LABELED_PATH)

In [128]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(preds)

0.25175702811244977