#### This file contains the sentiment analysis model that will classify events as ```success``` or ```failure```

In [102]:
import time

## Sklearn Implementation

In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer

In [104]:
sentiment140_sklearn = pd.read_parquet('../data-processed/sentiment140_model_data.parquet')

In [105]:
sentiment140_sklearn.head()

Unnamed: 0,text,label
0,love health4uandpet u guy r best,1
1,im meet one besti tonight cant wait girl talk,1
2,darealsunisakim thank twitter add sunisa got m...,1
3,sick realli cheap hurt much eat real food plu ...,1
4,lovesbrooklyn2 effect everyon,1


#### Preparing word embeddings

In [106]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentiment140_sklearn['text'])
y = sentiment140_sklearn['label']

In [107]:
Xtr, Xts, ytr, yts = train_test_split(
    X,
    y,
    test_size=0.3, 
    random_state=42
)

In [108]:
start_time_sklearn = time.time()
model_sklearn = LogisticRegression(random_state=42, max_iter=100).fit(Xtr.astype(int), ytr)

In [109]:
yhat = model_sklearn.predict(Xts.astype(int))

In [110]:
time_sklearn = time.time() - start_time_sklearn

In [111]:
mse_sklearn = mean_squared_error(yts, yhat)

## Pyspark Implementation

In [112]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
import os
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import CountVectorizer

In [113]:
conf = pyspark.SparkConf()
# conf.set('spark.ui.proxyBase', '/user/' + os.environ['JUPYTERHUB_USER'] + '/proxy/4041')
conf.set('spark.driver.memory','8g')
conf.set('spark.ui.showConsoleProgress', False)
try:
    sc = pyspark.SparkContext(conf=conf)
    spark = pyspark.SQLContext.getOrCreate(sc)
except:
    print('Spark context already exists, continuing with', sc)

Spark context already exists, continuing with <SparkContext master=local[*] appName=pyspark-shell>


In [114]:
sentiment140_pyspark = spark.read.parquet('../data-processed/sentiment140_model_data.parquet')

#### Preparing word embeddings

In [115]:
sentiment140_pyspark = sentiment140_pyspark.withColumn('text', F.split(F.col('text'), ' '))

In [116]:
cv = CountVectorizer(inputCol='text', outputCol='embeddings')

In [117]:
model = cv.fit(sentiment140_pyspark)

In [118]:
sentiment140_pyspark = model.transform(sentiment140_pyspark)

In [119]:
sentiment140_pyspark = sentiment140_pyspark.withColumn('label', F.col('label').cast(T.IntegerType()))

In [120]:
sentiment140_pyspark.show(10)

+--------------------+-----+--------------------+
|                text|label|          embeddings|
+--------------------+-----+--------------------+
|[love, health4uan...|    1|(3216,[5,34,125,1...|
|[im, meet, one, b...|    1|(3216,[0,9,20,46,...|
|[darealsunisakim,...|    1|(3216,[12,15,39,5...|
|[sick, realli, ch...|    1|(3216,[24,28,37,6...|
|[lovesbrooklyn2, ...|    1|(3216,[133,1149,1...|
|[productoffear, t...|    1|(3216,[12,24,37,5...|
|[rkeithhil, than,...|    1|(3216,[55,102,265...|
|[keepinupwkri, je...|    1|(3216,[5,6,7,31,5...|
|[tommcfli, ah, co...|    1|(3216,[39,123,230...|
|[e4voip, respond,...|    1|(3216,[111,360,52...|
+--------------------+-----+--------------------+
only showing top 10 rows



In [121]:
train, test = sentiment140_pyspark.randomSplit([0.7, 0.3], seed=42)

In [122]:
model_pyspark = LogisticRegression(maxIter=100, featuresCol='embeddings', labelCol='label')

In [123]:
start_time_pyspark = time.time()
model_fitted_pyspark = model_pyspark.fit(train)

In [124]:
model_predicted_pyspark = model_fitted_pyspark.transform(test)

In [125]:
time_pyspark = time.time() - start_time_pyspark

In [126]:
model_predicted_pyspark.withColumn('prediction', F.col('prediction').cast(T.IntegerType()))\
.select(['text', 'label', 'prediction'])\
.show(10)

+--------------------+-----+----------+
|                text|label|prediction|
+--------------------+-----+----------+
|[21, day, till, c...|    1|         0|
|[a5hleyf, im, spe...|    0|         0|
|[aaronrva, bathro...|    0|         0|
|[across, univers,...|    1|         0|
|[adriman, roflmao...|    1|         0|
|           [aghsnow]|    0|         0|
|[ahh, tedium, fix...|    1|         0|
|[albinla, think, ...|    1|         1|
|[alicayaba, cuuut...|    0|         1|
|[allanatrogu, tha...|    1|         1|
+--------------------+-----+----------+
only showing top 10 rows



In [127]:
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='mse')
mse_pyspark = evaluator.evaluate(model_predicted_pyspark)

#### Comparing the performance of models

In [128]:
from tabulate import tabulate

In [129]:
print(
  tabulate(
    [['Sklearn', time_sklearn, mse_sklearn],
    ['PySpark', time_pyspark, mse_pyspark]],
    headers=['Model type', 'Training and inference time', 'Mean Squared Error']
  )
)

Model type      Training and inference time    Mean Squared Error
------------  -----------------------------  --------------------
Sklearn                           0.0217271              0.373333
PySpark                           0.881626               0.421875
