In [19]:
#Importing pyspark session
import pyspark

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
#Importing pyspark package
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col,trim,round, length
from pyspark import SparkContext

import pyspark.sql.functions as F
from pyspark.sql.types import *
spark=SparkSession.builder.appName('drug_dataset').getOrCreate()


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Clean and Prepare the Data

In [21]:
#Importing train data from S3
df_train = spark.read.csv('s3://capstone-drug-dataset/captsone-drug-dataset/train_raw.csv',inferSchema=True, header=True,quote='"',escape= "\"",multiLine=True)
columnmap = {}
for column in df_train.columns:
  if column.endswith("\r"):
    columnmap[column] = column.rstrip()
for c in columnmap.keys():
  df_train = df_train.withColumn(columnmap[c], F.col(c))
  df_train = df_train.drop(c)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
#Importing test data from S3
df_test = spark.read.csv('s3://capstone-drug-dataset/captsone-drug-dataset/test_raw.csv',inferSchema=True, header=True,quote='"',escape= "\"",multiLine=True)
for column in df_test.columns:
  if column.endswith("\r"):
    columnmap[column] = column.rstrip()
for c in columnmap.keys():
  df_test = df_test.withColumn(columnmap[c], F.col(c))
  df_test = df_test.drop(c)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
#Test data samples
df_test.show(10)
df_train.printSchema
df_train = df_train.withColumn("usefulCount",round(df_train["usefulCount"]).cast('integer'))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+---------------+--------------------+--------------------+------+---------+-----------+
|uniqueID|       drugName|           condition|              review|rating|     date|usefulCount|
+--------+---------------+--------------------+--------------------+------+---------+-----------+
|  163740|    Mirtazapine|          Depression|"I&#039;ve tried ...|    10|28-Feb-12|       22.0|
|  206473|     Mesalamine|Crohn's Disease, ...|"My son has Crohn...|     8|17-May-09|       17.0|
|  159672|        Bactrim|Urinary Tract Inf...|"Quick reduction ...|     9|29-Sep-17|        3.0|
|   39293|       Contrave|         Weight Loss|"Contrave combine...|     9| 5-Mar-17|       35.0|
|   97768|Cyclafem 1 / 35|       Birth Control|"I have been on t...|     9|22-Oct-15|        4.0|
|  208087|        Zyclara|           Keratosis|"4 days in on fir...|     4| 3-Jul-14|       13.0|
|  215892|         Copper|       Birth Control|"I&#039;ve had th...|     6| 6-Jun-16|        1.0|
|  169852|  Amitript

In [24]:
#Joining train and test data set
df = df_train.join(df_test, on=['uniqueID', 'drugName', 'condition','review','rating','date','usefulCount'], how='left_outer')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
#Computing setniment column based on rating
sentiment = when(col("rating")<=5, 0).otherwise(1)

df = df.withColumn("sentiment",sentiment)
df = df.withColumn('length',length(df['review']))



VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Feature Transformation

In [26]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover, CountVectorizer,IDF,StringIndexer

tokenizer = Tokenizer(inputCol="review", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
pos_neg = StringIndexer(inputCol='sentiment',outputCol='label')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [27]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
clean_up = VectorAssembler(inputCols=['tf_idf','length'],outputCol='features')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier

from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import OneVsRest


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Building pipeline and fit model

In [30]:
from pyspark.ml import Pipeline
data_prep_pipe = Pipeline(stages=[pos_neg,tokenizer,stopremove,count_vec,idf,clean_up])
cleaner = data_prep_pipe.fit(df)
cleaner

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

PipelineModel_b8d612344e61

In [31]:
clean_data = cleaner.transform(df)
clean_data = clean_data.select(['label','features'])
clean_data.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(169991,[31,100,1...|
|  0.0|(169991,[3,9,26,2...|
|  0.0|(169991,[0,1,2,6,...|
|  0.0|(169991,[0,2,3,6,...|
|  0.0|(169991,[0,2,3,14...|
|  0.0|(169991,[1,2,6,9,...|
|  0.0|(169991,[1,3,16,1...|
|  0.0|(169991,[1,2,4,7,...|
|  1.0|(169991,[0,13,14,...|
|  0.0|(169991,[0,2,3,7,...|
|  0.0|(169991,[3,6,17,2...|
|  0.0|(169991,[2,10,27,...|
|  0.0|(169991,[1,3,15,3...|
|  0.0|(169991,[6,13,25,...|
|  0.0|(169991,[1,7,10,1...|
|  1.0|(169991,[24,28,32...|
|  1.0|(169991,[1,4,7,8,...|
|  1.0|(169991,[9,17,70,...|
|  0.0|(169991,[4,8,12,2...|
|  0.0|(169991,[1,2,3,28...|
+-----+--------------------+
only showing top 20 rows

## Random Forest Model Estimator and Training the data

In [32]:
(training,testing) = clean_data.randomSplit([0.7,0.3])
rf = RandomForestClassifier(featuresCol='features', labelCol='label')
rf_model = rf.fit(clean_data)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Prediction on training data

In [33]:
pred_training_rf = rf_model.transform(training)
show_columns = ['features', 'label', 'prediction', 'rawPrediction','probability']
pred_training_rf.select(show_columns).show(5, truncate=True)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-----+----------+--------------------+--------------------+
|            features|label|prediction|       rawPrediction|         probability|
+--------------------+-----+----------+--------------------+--------------------+
|(169991,[0,1,2,3,...|  0.0|       0.0|[14.0788132806948...|[0.70394066403474...|
|(169991,[0,1,2,3,...|  0.0|       0.0|[14.0556573799784...|[0.70278286899892...|
|(169991,[0,1,2,3,...|  0.0|       0.0|[14.3401020538502...|[0.71700510269251...|
|(169991,[0,1,2,3,...|  0.0|       0.0|[14.2122993879656...|[0.71061496939828...|
|(169991,[0,1,2,3,...|  0.0|       0.0|[13.8535607590820...|[0.69267803795410...|
+--------------------+-----+----------+--------------------+--------------------+
only showing top 5 rows

## Evaluator


In [34]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('Accuracy on training data (areaUnderROC): ', evaluator.setMetricName('areaUnderROC').evaluate(pred_training_rf))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Accuracy on training data (areaUnderROC):  0.6948504254324511

## Prediction on test data

In [35]:
pred_testing_rf = rf_model.transform(testing)
pred_testing_rf.select(show_columns).show(5, truncate=True)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-----+----------+--------------------+--------------------+
|            features|label|prediction|       rawPrediction|         probability|
+--------------------+-----+----------+--------------------+--------------------+
|(169991,[0,1,2,3,...|  0.0|       0.0|[14.0604897865933...|[0.70302448932966...|
|(169991,[0,1,2,3,...|  0.0|       0.0|[13.9840785177487...|[0.69920392588743...|
|(169991,[0,1,2,3,...|  0.0|       0.0|[14.0100682370540...|[0.70050341185270...|
|(169991,[0,1,2,4,...|  0.0|       0.0|[14.2741341098856...|[0.71370670549428...|
|(169991,[0,1,2,5,...|  0.0|       0.0|[14.2006748490103...|[0.71003374245051...|
+--------------------+-----+----------+--------------------+--------------------+
only showing top 5 rows

In [36]:
print('Accuracy on testing data (areaUnderROC): ', evaluator.setMetricName('areaUnderROC').evaluate(pred_testing_rf))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Accuracy on testing data (areaUnderROC):  0.6886114797811672

## Confusion Matrix

In [37]:
label_pred_train = pred_training_rf.select('label', 'prediction')
label_pred_train.rdd.zipWithIndex().countByKey()


label_pred_test = pred_testing_rf.select('label', 'prediction')
label_pred_test.rdd.zipWithIndex().countByKey()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

defaultdict(<class 'int'>, {Row(label=0.0, prediction=0.0): 33915, Row(label=1.0, prediction=0.0): 14514})

## Accuracy of the model

In [39]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
acc_eval = BinaryClassificationEvaluator()
acc = acc_eval.evaluate(pred_testing_rf)
print("Accuracy of model at predicting sentiment was: {}".format(acc))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Accuracy of model at predicting sentiment was: 0.6886114797811672