## Please run this notebook in google colab

## Setup environment

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
% cd //content/drive/MyDrive/cse6250

/content/drive/MyDrive/cse6250


In [None]:
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

%load_ext google.colab.data_table
# GCP proejct
project_id = 'nlp-332020'
project_number = '1054321893028'

%env GCLOUD_PROJECT=project_id
# authenticate colab notebook
import sys
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()

from google.cloud import bigquery
client = bigquery.Client(project=project_id)

Authenticated
env: GCLOUD_PROJECT=project_id


In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import SQLContext
spark = SparkSession.builder \
    .master("local[1]") \
    .appName("Readmit") \
    .getOrCreate()

sc = spark.sparkContext
sqlContext = SQLContext(sc)
base = "/content/drive/My Drive/cse6250"

## ETL, text clean and data balance

In [None]:
import os
import etl, utils  # code doing ETL and text cleaning

In [None]:
psc = etl.readmission_etl(spark, client, nDays=30, s=11000)
etl.save_sparkDF(spark, psc, 30)
# psc = sqlContext.read.parquet( os.path.join(base,"readmit_30.parquet"))

In [None]:
psc.groupBy('LABEL').count().show()

In [None]:
from pyspark.sql import functions as F
psc.withColumn('length', length(psc['TEXT'])).select('LABEL','length').groupBy('LABEL').agg(F.min('length'), F.max('length'), F.mean('length') ).show()

In [None]:
df = utils.segment_text(spark, sqlContext, psc)

In [None]:
# balance the data
df_train, df_val, df_test = utils.class_balance(df)

In [None]:
base = "/content/drive/MyDrive/cse6250/"
path = 'readmit_30days'

In [None]:
# save data for modeling
utils.save_data(df_train, path, 'train.csv')
utils.save_data(df_test, path, 'test.csv')
utils.save_data(df_val, path, 'val.csv')

## Modeling with PySpark

### Read in csv file to spark dataframe for modeling with pyspark

In [None]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer, StringIndexer, VectorAssembler, StopWordsRemover, RegexTokenizer, Word2Vec
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, NaiveBayes, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator 
from pyspark.ml.tuning import CrossValidator,  TrainValidationSplit, ParamGridBuilder 
from pyspark.sql.functions import countDistinct

In [None]:
# from pyspark.sql import SparkSession
# spark = SparkSession.builder \
#     .master("local[1]") \
#     .appName("Readmit") \
#     .getOrCreate()

In [None]:
base = "/content/drive/MyDrive/cse6250/"
path = 'readmit_30'
train_set = spark.read.csv(os.path.join(base, path, "train.csv"), inferSchema=True, header = True)
val_set = spark.read.csv(os.path.join(base, path, "val.csv"), inferSchema=True, header = True)
test_set = spark.read.csv(os.path.join(base, path, "test.csv"), inferSchema=True, header = True)

In [None]:
test_set.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Label: integer (nullable = true)
 |-- TEXT: string (nullable = true)



In [None]:
test_set.select(countDistinct("Label")).show()

+---------------------+
|count(DISTINCT Label)|
+---------------------+
|                    2|
+---------------------+



### Build classification models on TF-IDF tokens

In [None]:
# get tf-idf tokens
# ref: https://github.com/adarsh-tyagi/Apache-Spark-ML/blob/master/NLP_Code_Along.ipynb
tokenizer = Tokenizer(inputCol="TEXT", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='words')
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
tokenPipeline = Pipeline(stages = [tokenizer, stopremove, cv, idf])
token = tokenPipeline.fit(train_set)

#### Define functions to do prediction and evaluation

In [None]:
def get_pred(token, labelcol, classifier, train_set, val_set, test_set ):
  label_stringIdx = StringIndexer(inputCol = labelcol, outputCol = "label")
  pipeline = Pipeline(stages=[token, label_stringIdx, classifier])
  model = pipeline.fit(train_set)
  tr_df = model.transform(train_set)
  val_df = model.transform(val_set)
  te_df = model.transform(test_set)
  # print(te_df.groupBy('label', 'prediction').count().show())
  print('train set metric')
  get_metric(tr_df)
  print('val set metric')
  get_metric(val_df)
  print('test set metric')
  get_metric(te_df)
  return model

In [None]:
eva  = BinaryClassificationEvaluator()
# ref: https://shihaojran.com/distributed-machine-learning-using-pyspark/
def get_metric(predictions):
  # calculate AUC
  auc = eva.evaluate(predictions, {eva.metricName: 'areaUnderROC'})
  print('AUROC: %0.3f' % auc)
  aucpr = eva.evaluate(predictions, {eva.metricName: 'areaUnderPR'})
  print('AUCPR: %0.3f' % aucpr)
  # compute TN, TP, FN, and FP
  predictions.groupBy('label', 'prediction').count().show()
  # Calculate the elements of the confusion matrix
  TN = predictions.filter('prediction = 0 AND label = prediction').count()
  TP = predictions.filter('prediction = 1 AND label = prediction').count()
  FN = predictions.filter('prediction = 0 AND label <> prediction').count()
  FP = predictions.filter('prediction = 1 AND label <> prediction').count()
  # calculate accuracy, precision, recall, and F1-score
  accuracy = (TN + TP) / (TN + TP + FN + FP)
  precision = TP / (TP + FP)
  recall = TP / (TP + FN)
  F =  2 * (precision*recall) / (precision + recall)
  print('precision: %0.3f' % precision)
  print('recall: %0.3f' % recall)
  print('accuracy: %0.3f' % accuracy)
  print('F1 score: %0.3f' % F)
  print('\n')

 

#### Apply logistic regression on TF-IDF tokens and use CV to do hyper-parameter tuning

In [None]:
lr = LogisticRegression(maxIter=20)
lr.setRegParam(0.0001)
get_pred(token, 'Label', lr, train_set, val_set, test_set )

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  902|
|  0.0|       1.0|  556|
|  1.0|       0.0|  674|
|  0.0|       0.0|  876|
+-----+----------+-----+

None
start get metric
AUROC: 0.783
AUCPR: 0.778
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 8995|
|  0.0|       1.0| 3607|
|  1.0|       0.0| 3980|
|  0.0|       0.0| 9494|
+-----+----------+-----+

precision: 0.714
recall: 0.693
accuracy: 0.709
F1 score: 0.703
AUROC: 0.607
AUCPR: 0.613
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  836|
|  0.0|       1.0|  545|
|  1.0|       0.0|  720|
|  0.0|       0.0|  848|
+-----+----------+-----+

precision: 0.605
recall: 0.537
accuracy: 0.571
F1 score: 0.569
AUROC: 0.613
AUCPR: 0.619
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  902|
|  0.0|       1.0|  556|
|  1.0|       0.0|  674|
|  0.0|       0.0|  8

In [None]:
label_stringIdx = StringIndexer(inputCol = "Label", outputCol = "label")
lr = LogisticRegression()
params = ParamGridBuilder()
params = params.addGrid(lr.regParam, [1e-5, 0.00005, .0001, .0005, 0.001, 0.005, 0.01]).addGrid(lr.maxIter, [10, 20]).addGrid(lr.elasticNetParam, [0, .5, 1])
          #.0005, 0.001, 0.005, 0.01]) 
params = params.build()

f1eva = MulticlassClassificationEvaluator(metricName='fMeasureByLabel', metricLabel=1, beta=1.0) 
pipeline = Pipeline(stages = [token, label_stringIdx, lr])
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=params,
                    evaluator= f1eva,
                    numFolds=5)

In [None]:
f1eva.evaluate(cvModel.transform(test_set))

0.5964678440519826

In [None]:
cvModel = cv.fit(train_set)
bestModel = cvModel.bestModel

In [None]:
print(cvModel.avgMetrics[0])

5 0.617183798744118


In [None]:
# model_path = os.path.join(base + "LGmodel_cv")
# cvModel.write().save(model_path)

In [None]:
pred = bestModel.transform(test_set)
get_metric(pred)

AUROC: 0.630
AUCPR: 0.629
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  895|
|  0.0|       1.0|  530|
|  1.0|       0.0|  681|
|  0.0|       0.0|  902|
+-----+----------+-----+

precision: 0.628
recall: 0.568
accuracy: 0.597
F1 score: 0.596




In [None]:
## get best param
cvModel.getEstimatorParamMaps()[ np.argmax(cvModel.avgMetrics) ]
# 0.628637	reg0.00500	iter10	elesticNetParam0.5

{Param(parent='LogisticRegression_49cc9e0386f5', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5,
 Param(parent='LogisticRegression_49cc9e0386f5', name='maxIter', doc='max number of iterations (>= 0).'): 10,
 Param(parent='LogisticRegression_49cc9e0386f5', name='regParam', doc='regularization parameter (>= 0).'): 0.005}

In [None]:
## use code from https://stackoverflow.com/questions/51230726/extract-results-from-crossvalidator-with-paramgrid-in-pyspark
params = [{p.name: v for p, v in m.items()} for m in cvModel.getEstimatorParamMaps()]
pd.DataFrame.from_dict([
    {cvModel.getEvaluator().getMetricName(): metric, **ps} 
    for ps, metric in zip(params, cvModel.avgMetrics)
])

Unnamed: 0,fMeasureByLabel,regParam,maxIter,elasticNetParam
0,0.617184,1e-05,10,0.0
1,0.616692,1e-05,10,0.5
2,0.616972,1e-05,10,1.0
3,0.611275,1e-05,20,0.0
4,0.606526,1e-05,20,0.5
5,0.60689,1e-05,20,1.0
6,0.617413,5e-05,10,0.0
7,0.616781,5e-05,10,0.5
8,0.616795,5e-05,10,1.0
9,0.612459,5e-05,20,0.0


#### Apply random forest on tf-idf tokens and tried different hyper-parameters

In [None]:
rf = RandomForestClassifier( numTrees=100, maxDepth=20, subsamplingRate=0.88)
get_pred(token, 'Label', rf, train_set, val_set, test_set ) #take 12mins

train set metric
AUROC: 0.902
AUCPR: 0.911
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 9898|
|  0.0|       1.0| 1643|
|  1.0|       0.0| 3077|
|  0.0|       0.0|11458|
+-----+----------+-----+

precision: 0.858
recall: 0.763
accuracy: 0.819
F1 score: 0.807


val set metric
AUROC: 0.638
AUCPR: 0.638
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  854|
|  0.0|       1.0|  519|
|  1.0|       0.0|  702|
|  0.0|       0.0|  874|
+-----+----------+-----+

precision: 0.622
recall: 0.549
accuracy: 0.586
F1 score: 0.583


test set metric
AUROC: 0.642
AUCPR: 0.642
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  941|
|  0.0|       1.0|  531|
|  1.0|       0.0|  635|
|  0.0|       0.0|  901|
+-----+----------+-----+

precision: 0.639
recall: 0.597
accuracy: 0.612
F1 score: 0.617




In [None]:
rf = RandomForestClassifier( numTrees=100, maxDepth=20 )
rf100 = get_pred(token, 'Label', rf, train_set, val_set, test_set )

train set metric
AUROC: 0.905
AUCPR: 0.914
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 9707|
|  0.0|       1.0| 1390|
|  1.0|       0.0| 3268|
|  0.0|       0.0|11711|
+-----+----------+-----+

precision: 0.875
recall: 0.748
accuracy: 0.821
F1 score: 0.806


val set metric
AUROC: 0.621
AUCPR: 0.625
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  838|
|  0.0|       1.0|  500|
|  1.0|       0.0|  718|
|  0.0|       0.0|  893|
+-----+----------+-----+

precision: 0.626
recall: 0.539
accuracy: 0.587
F1 score: 0.579


test set metric
AUROC: 0.635
AUCPR: 0.632
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  918|
|  0.0|       1.0|  528|
|  1.0|       0.0|  658|
|  0.0|       0.0|  904|
+-----+----------+-----+

precision: 0.635
recall: 0.582
accuracy: 0.606
F1 score: 0.608




In [None]:
rf = RandomForestClassifier( numTrees=100, maxDepth=30 )
rf100_30 = get_pred(token, 'Label', rf, train_set, val_set, test_set )

train set metric
AUROC: 0.975
AUCPR: 0.978
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|11010|
|  0.0|       1.0|  373|
|  1.0|       0.0| 1965|
|  0.0|       0.0|12728|
+-----+----------+-----+

precision: 0.967
recall: 0.849
accuracy: 0.910
F1 score: 0.904


val set metric
AUROC: 0.630
AUCPR: 0.631
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  881|
|  0.0|       1.0|  524|
|  1.0|       0.0|  675|
|  0.0|       0.0|  869|
+-----+----------+-----+

precision: 0.627
recall: 0.566
accuracy: 0.593
F1 score: 0.595


test set metric
AUROC: 0.639
AUCPR: 0.636
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  963|
|  0.0|       1.0|  556|
|  1.0|       0.0|  613|
|  0.0|       0.0|  876|
+-----+----------+-----+

precision: 0.634
recall: 0.611
accuracy: 0.611
F1 score: 0.622




In [None]:
rf = RandomForestClassifier( numTrees=200, maxDepth=20 )
rf200 = get_pred(token, 'Label', rf, train_set, val_set, test_set ) #take 12mins

train set metric
AUROC: 0.914
AUCPR: 0.922
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 9875|
|  0.0|       1.0| 1273|
|  1.0|       0.0| 3100|
|  0.0|       0.0|11828|
+-----+----------+-----+

precision: 0.886
recall: 0.761
accuracy: 0.832
F1 score: 0.819


val set metric
AUROC: 0.634
AUCPR: 0.632
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  845|
|  0.0|       1.0|  510|
|  1.0|       0.0|  711|
|  0.0|       0.0|  883|
+-----+----------+-----+

precision: 0.624
recall: 0.543
accuracy: 0.586
F1 score: 0.581


test set metric
AUROC: 0.646
AUCPR: 0.642
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  912|
|  0.0|       1.0|  529|
|  1.0|       0.0|  664|
|  0.0|       0.0|  903|
+-----+----------+-----+

precision: 0.633
recall: 0.579
accuracy: 0.603
F1 score: 0.605




#### Also tried NaiveBayes and GBT but the performance is not good and GBT took long very time to train

In [None]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
nbmodel = get_pred(token, 'Label', nb, train_set, val_set, test_set )

train set metric
AUROC: 0.490
AUCPR: 0.485
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 9809|
|  0.0|       1.0| 3444|
|  1.0|       0.0| 3166|
|  0.0|       0.0| 9657|
+-----+----------+-----+

precision: 0.740
recall: 0.756
accuracy: 0.747
F1 score: 0.748


val set metric
AUROC: 0.480
AUCPR: 0.513
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  924|
|  0.0|       1.0|  563|
|  1.0|       0.0|  632|
|  0.0|       0.0|  830|
+-----+----------+-----+

precision: 0.621
recall: 0.594
accuracy: 0.595
F1 score: 0.607


test set metric
AUROC: 0.504
AUCPR: 0.516
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 1012|
|  0.0|       1.0|  573|
|  1.0|       0.0|  564|
|  0.0|       0.0|  859|
+-----+----------+-----+

precision: 0.638
recall: 0.642
accuracy: 0.622
F1 score: 0.640




In [None]:
gbt = GBTClassifier(maxIter=5, maxDepth=20,  seed=42, leafCol="leafId")
gbtmodel = get_pred(token, 'Label', gbt, train_set, val_set, test_set )
# take 48min

train set metric
AUROC: 0.905
AUCPR: 0.911
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 9684|
|  0.0|       1.0| 1560|
|  1.0|       0.0| 3291|
|  0.0|       0.0|11541|
+-----+----------+-----+

precision: 0.861
recall: 0.746
accuracy: 0.814
F1 score: 0.800


val set metric
AUROC: 0.564
AUCPR: 0.572
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  778|
|  0.0|       1.0|  537|
|  1.0|       0.0|  778|
|  0.0|       0.0|  856|
+-----+----------+-----+

precision: 0.592
recall: 0.500
accuracy: 0.554
F1 score: 0.542


test set metric
AUROC: 0.556
AUCPR: 0.565
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  807|
|  0.0|       1.0|  556|
|  1.0|       0.0|  769|
|  0.0|       0.0|  876|
+-----+----------+-----+

precision: 0.592
recall: 0.512
accuracy: 0.560
F1 score: 0.549




### Build classification models on word2vec embeddings

In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, StringIndexer, VectorAssembler, HashingTF, IDF, Word2Vec

## build word2vec tokens 
regex_tokenizer = RegexTokenizer(pattern='\\W')\
                  .setInputCol("TEXT")\
                  .setOutputCol("tokens")
 
extra_stopwords = ['the',  'and', 'to', 'of', 'was', 'with', 'a', 'on', 'mg', 'in', 'for', 'tablet', 'no', 'is', 'po', 'patient', 's', 'he', 'blood', 'at', 'daily', 'sig', 'or', 'as', 'one',
 'she', 'day', 'discharge', 'his', 'left', 'history', 'am', 'her', 'were', 'you', 'right', 'by', 'your', 'not', 'pm', 'be', 'had', 'pt', 'pain', 'this', 'q', 'from', 'p', 'that', 'an']
stopwords_remover = StopWordsRemover()\
                    .setInputCol('tokens')\
                    .setOutputCol('filtered_words')\
                    .setStopWords(extra_stopwords)

word2Vec = Word2Vec(vectorSize=1000, minCount=10)\
           .setInputCol("filtered_words")\
           .setOutputCol("features")


In [None]:
# label_string_idx = StringIndexer(inputCol = 'Label', outputCol = "label")
lr = LogisticRegression(maxIter=10, regParam=0.0001)
# pipeline_wv_lr = Pipeline().setStages([word2Vec, label_string_idx, lr])
pipeline_wv = Pipeline().setStages([regex_tokenizer, stopwords_remover, word2Vec])
wvtoken = pipeline_wv.fit(train_set)
wvmodel = get_pred(wvtoken, 'Label', lr, train_set, val_set, test_set )

# pipeline_wv_lr = Pipeline().setStages([wvtoken, label_string_idx, lr])
# model_wv_lr = pipeline_wv_lr.fit(train_set)
# predictions_wv_lr = model_wv_lr.transform(test_set)
# # take 8 min with vectorsize 1000

train set metric
AUROC: 0.665
AUCPR: 0.644
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 8007|
|  0.0|       1.0| 4960|
|  1.0|       0.0| 4968|
|  0.0|       0.0| 8141|
+-----+----------+-----+

precision: 0.617
recall: 0.617
accuracy: 0.619
F1 score: 0.617


val set metric
AUROC: 0.625
AUCPR: 0.629
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  896|
|  0.0|       1.0|  543|
|  1.0|       0.0|  660|
|  0.0|       0.0|  850|
+-----+----------+-----+

precision: 0.623
recall: 0.576
accuracy: 0.592
F1 score: 0.598


test set metric
AUROC: 0.634
AUCPR: 0.637
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  942|
|  0.0|       1.0|  565|
|  1.0|       0.0|  634|
|  0.0|       0.0|  867|
+-----+----------+-----+

precision: 0.625
recall: 0.598
accuracy: 0.601
F1 score: 0.611




In [None]:
rf = RandomForestClassifier( numTrees=100, maxDepth=20, subsamplingRate=0.88)
wvmodel_rf = get_pred(wvtoken, 'Label', rf, train_set, val_set, test_set )
# take 14mins to run

train set metric
AUROC: 1.000
AUCPR: 1.000
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|12975|
|  0.0|       0.0|13101|
+-----+----------+-----+

precision: 1.000
recall: 1.000
accuracy: 1.000
F1 score: 1.000


val set metric
AUROC: 0.618
AUCPR: 0.625
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  894|
|  0.0|       1.0|  574|
|  1.0|       0.0|  662|
|  0.0|       0.0|  819|
+-----+----------+-----+

precision: 0.609
recall: 0.575
accuracy: 0.581
F1 score: 0.591


test set metric
AUROC: 0.613
AUCPR: 0.613
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  939|
|  0.0|       1.0|  616|
|  1.0|       0.0|  637|
|  0.0|       0.0|  816|
+-----+----------+-----+

precision: 0.604
recall: 0.596
accuracy: 0.583
F1 score: 0.600




#### reduce maxSentenceLength from default 1000 to 512

In [None]:
word2Vec1 = Word2Vec(vectorSize=1000, minCount=10, maxSentenceLength=512)\
           .setInputCol("filtered_words")\
           .setOutputCol("features")
           
# default Word2Vec(*, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000
lr = LogisticRegression(maxIter=10, regParam=0.0001)
pipeline_wv1 = Pipeline().setStages([regex_tokenizer, stopwords_remover, word2Vec1])
wvtoken1 = pipeline_wv1.fit(train_set)
wvmodel1 = get_pred(wvtoken1, 'Label', lr, train_set, val_set, test_set )

train set metric
AUROC: 0.662
AUCPR: 0.642
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 7876|
|  0.0|       1.0| 4851|
|  1.0|       0.0| 5099|
|  0.0|       0.0| 8250|
+-----+----------+-----+

precision: 0.619
recall: 0.607
accuracy: 0.618
F1 score: 0.613


val set metric
AUROC: 0.625
AUCPR: 0.630
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  859|
|  0.0|       1.0|  512|
|  1.0|       0.0|  697|
|  0.0|       0.0|  881|
+-----+----------+-----+

precision: 0.627
recall: 0.552
accuracy: 0.590
F1 score: 0.587


test set metric
AUROC: 0.636
AUCPR: 0.637
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  931|
|  0.0|       1.0|  546|
|  1.0|       0.0|  645|
|  0.0|       0.0|  886|
+-----+----------+-----+

precision: 0.630
recall: 0.591
accuracy: 0.604
F1 score: 0.610




In [None]:
lr1 = LogisticRegression(maxIter=10, regParam=0.005, elasticNetParam=0.5)
wvmodel2 = get_pred(wvtoken1, 'Label', lr1, train_set, val_set, test_set )

train set metric
AUROC: 0.657
AUCPR: 0.635
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 8165|
|  0.0|       1.0| 5214|
|  1.0|       0.0| 4810|
|  0.0|       0.0| 7887|
+-----+----------+-----+

precision: 0.610
recall: 0.629
accuracy: 0.616
F1 score: 0.620


val set metric
AUROC: 0.623
AUCPR: 0.629
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  911|
|  0.0|       1.0|  577|
|  1.0|       0.0|  645|
|  0.0|       0.0|  816|
+-----+----------+-----+

precision: 0.612
recall: 0.585
accuracy: 0.586
F1 score: 0.599


test set metric
AUROC: 0.629
AUCPR: 0.629
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  974|
|  0.0|       1.0|  587|
|  1.0|       0.0|  602|
|  0.0|       0.0|  845|
+-----+----------+-----+

precision: 0.624
recall: 0.618
accuracy: 0.605
F1 score: 0.621




In [None]:
rf1 = RandomForestClassifier(numTrees=50, maxDepth=10)
wvmodel_rf1 = get_pred(wvtoken1, 'Label', rf1, train_set, val_set, test_set )

train set metric
AUROC: 0.922
AUCPR: 0.921
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|11141|
|  0.0|       1.0| 2347|
|  1.0|       0.0| 1834|
|  0.0|       0.0|10754|
+-----+----------+-----+

precision: 0.826
recall: 0.859
accuracy: 0.840
F1 score: 0.842


val set metric
AUROC: 0.610
AUCPR: 0.618
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  863|
|  0.0|       1.0|  584|
|  1.0|       0.0|  693|
|  0.0|       0.0|  809|
+-----+----------+-----+

precision: 0.596
recall: 0.555
accuracy: 0.567
F1 score: 0.575


test set metric
AUROC: 0.620
AUCPR: 0.624
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  947|
|  0.0|       1.0|  601|
|  1.0|       0.0|  629|
|  0.0|       0.0|  831|
+-----+----------+-----+

precision: 0.612
recall: 0.601
accuracy: 0.591
F1 score: 0.606




In [None]:
rf2 = RandomForestClassifier(numTrees=50, maxDepth=20)
wvmodel_rf2 = get_pred(wvtoken1, 'Label', rf2, train_set, val_set, test_set )

train set metric
AUROC: 1.000
AUCPR: 1.000
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|12974|
|  0.0|       1.0|    1|
|  1.0|       0.0|    1|
|  0.0|       0.0|13100|
+-----+----------+-----+

precision: 1.000
recall: 1.000
accuracy: 1.000
F1 score: 1.000


val set metric
AUROC: 0.606
AUCPR: 0.615
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  866|
|  0.0|       1.0|  566|
|  1.0|       0.0|  690|
|  0.0|       0.0|  827|
+-----+----------+-----+

precision: 0.605
recall: 0.557
accuracy: 0.574
F1 score: 0.580


test set metric
AUROC: 0.609
AUCPR: 0.616
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  931|
|  0.0|       1.0|  619|
|  1.0|       0.0|  645|
|  0.0|       0.0|  813|
+-----+----------+-----+

precision: 0.601
recall: 0.591
accuracy: 0.580
F1 score: 0.596




#### reduce maxSentenceLength from default 1000 to 320

In [None]:
word2Vec2 = Word2Vec(vectorSize=1000, minCount=10, maxSentenceLength=320)\
           .setInputCol("filtered_words")\
           .setOutputCol("features")
           
# default Word2Vec(*, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000
pipeline_wv2 = Pipeline().setStages([regex_tokenizer, stopwords_remover, word2Vec2])


lr = LogisticRegression(maxIter=10, regParam=0.0001)
wvtoken2 = pipeline_wv2.fit(train_set)
wvmodel2 = get_pred(wvtoken2, 'Label', lr, train_set, val_set, test_set )

train set metric
AUROC: 0.665
AUCPR: 0.644
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 8024|
|  0.0|       1.0| 4945|
|  1.0|       0.0| 4951|
|  0.0|       0.0| 8156|
+-----+----------+-----+

precision: 0.619
recall: 0.618
accuracy: 0.620
F1 score: 0.619


val set metric
AUROC: 0.628
AUCPR: 0.633
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  899|
|  0.0|       1.0|  536|
|  1.0|       0.0|  657|
|  0.0|       0.0|  857|
+-----+----------+-----+

precision: 0.626
recall: 0.578
accuracy: 0.595
F1 score: 0.601


test set metric
AUROC: 0.632
AUCPR: 0.636
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  931|
|  0.0|       1.0|  577|
|  1.0|       0.0|  645|
|  0.0|       0.0|  855|
+-----+----------+-----+

precision: 0.617
recall: 0.591
accuracy: 0.594
F1 score: 0.604


