In [0]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from elasticsearch import Elasticsearch, helpers
from pyspark.sql.functions import col, split, randn
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np 
from pyspark.mllib.random import RandomRDDs

# Elasticsearch connection used by the notebook.
ES_HOST = 'da2020w-0016.eastus.cloudapp.azure.com'  # VM server
# NOTE(review): elasticsearch-py interprets `timeout` in seconds; confirm that
# a value of 60000 is really intended.
es = Elasticsearch(hosts=[{'host': ES_HOST}], timeout=60000)

In [0]:
# Explicit schema for the raw bus JSON records, declared as (name, type) pairs
# in file order. Every field is nullable.
def _string_map():
  """Return a fresh nullable-valued string -> string map type."""
  return MapType(StringType(), StringType(), True)


_loc_struct = StructType([
    StructField('coordinates', ArrayType(DoubleType(), True), True),
    StructField('type', StringType(), True),
])

_bus_fields = [
    ("_id", _string_map()),
    ("actualDelay", LongType()),
    ("angle", DoubleType()),
    ("anomaly", BooleanType()),
    ("areaId", LongType()),
    ("areaId1", LongType()),
    ("areaId2", LongType()),
    ("areaId3", LongType()),
    ("atStop", BooleanType()),
    ("busStop", LongType()),
    ("calendar", _string_map()),
    ("congestion", BooleanType()),
    ("currentHour", LongType()),
    ("dateType", LongType()),
    ("dateTypeEnum", StringType()),
    ("delay", LongType()),
    ("direction", LongType()),
    ("distanceCovered", DoubleType()),
    ("ellapsedTime", LongType()),
    ("filteredActualDelay", LongType()),
    ("gridID", StringType()),
    ("journeyPatternId", StringType()),
    ("justLeftStop", BooleanType()),
    ("justStopped", BooleanType()),
    ("latitude", DoubleType()),
    ("lineId", StringType()),
    ("loc", _loc_struct),
    ("longitude", DoubleType()),
    ("poiId", LongType()),
    ("poiId2", LongType()),
    ("probability", DoubleType()),
    ("systemTimestamp", DoubleType()),
    ("timestamp", _string_map()),
    ("vehicleId", LongType()),
    ("vehicleSpeed", LongType()),
]

schema_structfields = [
    StructField(field_name, field_type, True)
    for field_name, field_type in _bus_fields
]
schema = StructType(schema_structfields)

# Read the raw JSON with the explicit schema instead of letting Spark infer one.
raw_df = spark.read.json(
    '/mnt/dacoursedatabricksstg/dacoursedatabricksdata/busFile',
    schema=schema,
)

# display(raw_df)

In [0]:
# Keep only the columns used downstream.
raw_df = raw_df.select(
    'busStop', 'lineId', 'areaId1', 'delay', 'congestion', 'timestamp',
    'atStop', 'longitude', 'latitude', 'vehicleSpeed', "journeyPatternId",
)

# Cleaning and feature engineering, expressed as one chain. Step order matters:
# de-duplication happens before row_id is assigned.
fixed_df = (
    raw_df
    # `timestamp` is read as a map; keep only its first value.
    .withColumn('timestamp', F.map_values(F.col('timestamp'))[0])
    # Scale the epoch value down by 1000 and convert it to a Spark timestamp.
    .withColumn('timestamp', F.to_timestamp(F.from_unixtime(F.col('timestamp') / (1000))))
    # Remove duplicate rows, then rows in which every column is null.
    .dropDuplicates()
    .na.drop(how="all")
    # Delay expressed in minutes (delay / 60).
    .withColumn('mins_delay', F.col('delay') / 60)
    # 1 when the bus stop is in the hard-coded list of "center" stops, else 0.
    .withColumn("binary_isInCenter", F.expr(
        """IF (busStop IN (278, 281, 4724, 274, 279, 4725, 4508, 272, 277, 270, 6059, 271, 7402, 1184, 4717, 288, 289, 7591, 299, 298, 297, 302, 301, 273, 315, 7622, 334, 335, 336, 340, 317, 319, 325, 7392, 328, 345, 346, 7588, 404, 405, 406, 320, 1359, 7582, 1358, 7581, 1279, 1278, 4522, 4521, 494, 495, 792, 793) ,1, 0)"""
    ))
    # Calendar features derived from the timestamp.
    .withColumn('hour', F.hour(F.col('timestamp')))
    .withColumn('year', F.year(F.col('timestamp')))
    # Spark's dayofweek returns 1 for Sunday and 7 for Saturday.
    .withColumn("isWeekend", F.dayofweek(F.col('timestamp')).isin([1, 7]).cast("int"))
    # Unique (not consecutive) row identifier.
    .withColumn("row_id", F.monotonically_increasing_id())
    # Leading part of journeyPatternId with leading zeros stripped.
    .withColumn('line_num', F.col('journeyPatternId').substr(0, 4))
    .withColumn('line_num', F.regexp_replace('line_num', '^0+', ''))
    # First character of the substring that starts at position 5.
    .withColumn('direction', F.col('journeyPatternId').substr(5, 5).substr(0, 1))
)
# display(fixed_df)

In [0]:
# Keep only the records reported while the bus is at a stop.
fixed_df = fixed_df.filter(F.col('atStop') == True)
# print(f'len of fixed_df: {fixed_df.count()}') #len of fixed_df: 50599721

In [0]:
# lr_full_df=run_lr(fixed_df, is_imputed='full_data') 

In [0]:
# def calc_outliers(df, col_name):
#   df_stats = df.select(
#      F.mean(F.col(col_name)).alias('mean'),
#      F.stddev(F.col(col_name)).alias('std')
#   ).collect()

#   mean = df_stats[0]['mean']
#   std = df_stats[0]['std']

#   print(f'mean of {col}: {mean}, std of {col}: {std}')
#   return mean, std 

In [0]:
# Hard-coded mean / standard deviation, divided by 60 like mins_delay.
# NOTE(review): presumably these came from an earlier run of
# calc_outliers(fixed_df, ...) on the full data; confirm before reuse.
# mean_delay, std_delay = calc_outliers(fixed_df, 'mins_delay')
mean_delay = 187.79413873308675 / 60  # 3.1299023122181127
std_delay = 420.2325261387322 / 60    # 7.003875435645537

# Outlier band: two standard deviations on either side of the mean.
mean_plus_std = mean_delay + 2 * std_delay
mean_minus_std = mean_delay - 2 * std_delay

# null_mins_delay equals mins_delay, with values outside the band set to null.
_outlier_rule = f"""IF ((mins_delay > {mean_plus_std}) or (mins_delay < {mean_minus_std}) ,null, mins_delay)"""
fixed_df = fixed_df.withColumn("null_mins_delay", F.expr(_outlier_rule))
# display(fixed_df)

In [0]:
# null_congestion equals congestion, except that contradictory rows become null:
# congestion flagged with a negative delay, or no congestion with a delay above
# 10 minutes.
# NOTE(review): the original cell carried a trailing "#24 mins" remark although
# the rule uses 10; confirm which threshold is intended.
_congestion_rule = """IF ((congestion=True and mins_delay < 0) or (congestion=False and mins_delay > 10) ,null, congestion)"""
fixed_df = fixed_df.withColumn("null_congestion", F.expr(_congestion_rule))

# display(fixed_df)

In [0]:
# 0/1 encodings of the congestion flags. The null_* variant keeps nulls as null.
fixed_df = (
    fixed_df
    .withColumn('binary_congestion', F.expr("""IF (congestion=True, 1, 0)"""))
    .withColumn(
        'null_binary_congestion',
        F.expr("""IF (null_congestion=True, 1, IF(null_congestion=False, 0, null))"""),
    )
)

In [0]:
# for c in ['null_mins_delay', 'null_congestion']:
#     print(f"{c} has: {sample.where(F.col(c).isNull()).count()} null values")

# on entire df: 
# null_mins_delay has: 2294463 null values
# null_congestion has: 5507430 null values

In [0]:
# code is modified from: 
# https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa

In [0]:
def prepare_data_for_lr(is_imputed):
  """Build the list of ML pipeline stages for the requested run mode.

  `is_imputed` selects the feature columns and the label source:
    True                -> 'filled_*' columns, label from 'binary_isInCenter'
    False               -> 'null_*' columns, label from 'binary_isInCenter'
    'full_data'         -> original columns, label from 'binary_isInCenter'
    'binary_congestion' -> MICE step for congestion: delay is a feature,
                           label from 'filled_binary_congestion'
    anything else       -> treated as the MICE step for mins_delay:
                           congestion is a feature and no label stage is added

  Returns a list holding one StringIndexer per categorical column, then the
  label StringIndexer (when there is one), then a VectorAssembler that writes
  the 'features' column. Categorical columns are passed to the assembler as
  plain indices; no one-hot encoding is applied.
  """
  def keep_invalid_indexer(source_col, target_col):
    # Indexer that keeps unseen/invalid values instead of failing on them.
    return StringIndexer(inputCol = source_col, outputCol = target_col).setHandleInvalid("keep")

  label_source = None

  if is_imputed == True:
    cat_cols = ['filled_binary_congestion', 'areaId1', 'lineId']
    num_cols = ['filled_mins_delay', 'longitude', 'latitude']
    label_source = 'binary_isInCenter'
  elif is_imputed == False:
    cat_cols = ['null_binary_congestion', 'areaId1', 'lineId']
    num_cols = ['null_mins_delay', 'longitude', 'latitude']
    label_source = 'binary_isInCenter'
  elif is_imputed == 'full_data':
    cat_cols = ['binary_congestion', 'areaId1', 'lineId']
    num_cols = ['mins_delay', 'longitude', 'latitude']
    label_source = 'binary_isInCenter'
  elif is_imputed == 'binary_congestion':
    # MICE: predicting congestion, so the (filled) delay is an explanatory column.
    cat_cols = ['areaId1', 'lineId', 'hour', 'isWeekend']
    num_cols = ['filled_mins_delay', 'longitude', 'latitude', 'vehicleSpeed']
    label_source = 'filled_' + is_imputed
  else:
    # MICE: predicting mins_delay, so the (filled) congestion is an explanatory column.
    cat_cols = ['filled_binary_congestion', 'areaId1', 'lineId', 'hour', 'isWeekend']
    num_cols = ['longitude', 'latitude', 'vehicleSpeed']

  # Stage order: categorical indexers, optional label indexer, assembler.
  pipeline_stages = [keep_invalid_indexer(c, c + 'Index') for c in cat_cols]

  if label_source is not None:
    pipeline_stages.append(keep_invalid_indexer(label_source, 'label'))

  feature_inputs = [c + "Index" for c in cat_cols] + num_cols
  pipeline_stages.append(VectorAssembler(inputCols=feature_inputs, outputCol="features"))

  return pipeline_stages

In [0]:
def pipeline_prepare_data(stages, input_df, is_delay=False):
  """Fit a Pipeline of `stages` on `input_df` and return the transformed frame.

  The result keeps every original column of `input_df`, preceded by
  'label' and 'features', or by 'features' only when `is_delay` is set
  (that mode has no label column to select).
  """
  fitted_pipeline = Pipeline(stages = stages).fit(input_df)
  transformed = fitted_pipeline.transform(input_df)

  leading_cols = ['label', 'features'] if is_delay == False else ['features']
  return transformed.select(leading_cols + input_df.columns)

In [0]:
def create_lr_model(train):
  """Fit a logistic regression (10 iterations max) on `train`.

  `train` must contain the 'features' and 'label' columns. Returns the
  fitted model.
  """
  estimator = LogisticRegression(
      featuresCol = 'features',
      labelCol = 'label',
      maxIter=10,
  )
  return estimator.fit(train)

In [0]:
def run_lr(df, is_imputed):
  """Prepare `df`, train a logistic regression on the 2017 rows and save it.

  `is_imputed` is forwarded to prepare_data_for_lr to choose the feature and
  label columns. The fitted model is written to "lrm_model.model" and returned.
  Saving fails if that path already exists.
  """
  stages = prepare_data_for_lr(is_imputed)
  df = pipeline_prepare_data(stages, df)
  # Train only on the rows whose timestamp falls in 2017.
  train_df = df[df["year"] == 2017]
  lrModel = create_lr_model(train_df)
  # pyspark.ml models are saved with save(path). The (sc, path) signature
  # belongs to the older pyspark.mllib models and raises TypeError here.
  lrModel.save("lrm_model.model")
  return lrModel

In [0]:
##########################################################################################################################################################################################################################

In [0]:
# Complete-case data: drop every row that has a null in any column.
Complete_case_df = fixed_df.na.drop()

In [0]:
stages_no_OHE = prepare_data_for_lr(is_imputed=False)
train_df_no_OHE=Complete_case_df[Complete_case_df["year"]== 2017]
df_after_pipeline_no_OHE = pipeline_prepare_data(stages_no_OHE, train_df_no_OHE)
display(df_after_pipeline_no_OHE)

In [0]:
# Fit the logistic regression on the prepared 2017 complete-case data.
lr_no_OHE = LogisticRegression(
    featuresCol = 'features',
    labelCol = 'label',
    maxIter=10,
)
lrModel_no_OHE = lr_no_OHE.fit(df_after_pipeline_no_OHE)

In [0]:
# Persist the fitted model. Saving fails if the target path already exists.
lrModel_no_OHE.write().save("lrm_model_task2.model")

In [0]:
from pyspark.sql.functions import row_number, monotonically_increasing_id
from pyspark.sql import Window


def from_catCol_to_index(train_df,col_name):
  """Map each distinct value of `col_name` to a frequency-ranked index.

  The most frequent value gets index 0, the next one 1, and so on. Ties in
  frequency are broken by the value itself (ascending) so the mapping is
  reproducible between runs.

  The mapping (columns `<col_name>_index`, `<col_name>`) is written as a CSV
  with a header to '<col_name>_index.csv' and also returned as a DataFrame.
  Writing fails if that path already exists.
  """
  index_col = col_name + "_index"
  counts = train_df.groupBy(col_name).count()
  # Rank directly on the count instead of relying on
  # monotonically_increasing_id() following a previous sort.
  by_frequency = Window.orderBy(col("count").desc(), col(col_name).asc())
  counts = counts.withColumn(index_col, row_number().over(by_frequency) - 1)
  # The original selected the undefined name `colname`, raising NameError.
  counts = counts.select(index_col, col_name)
  counts.write.csv(col_name+'_index.csv',header=True)
  return counts


# Build and save a value -> index mapping for each categorical column.
# NOTE(review): no visible cell defines `train_df` at notebook scope (it is
# only a local inside run_lr), and none creates the 'source' / 'dest' columns.
# Confirm which DataFrame is meant here.
col_names=["line_num",'source','dest']
for col_name in col_names:
  # The helper defined above is from_catCol_to_index; the original called the
  # undefined name from_line_to_index.
  counts=from_catCol_to_index(train_df,col_name)
#counts = counts.select("lineId_index","lineId")
