In [1]:
# imports
import pyspark.sql.functions as F
import csv
import os
import sys
# Spark imports
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, count, isnull, upper, substring, to_timestamp, unix_timestamp, lit, avg
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator, Evaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.regression import GBTRegressor

def init_spark():
    spark = SparkSession \
        .builder \
        .appName("NYC 311 Data Analysis") \
        .config('spark.sql.codegen.wholeStage', 'false') \
        .getOrCreate()
    return spark

In [2]:
def dataPreparation(csvPath="data/New_Data/311_Cleaned_Data_2019.csv"):
    spark = init_spark()
    nyc_311_df_2019 = spark.read.csv(csvPath, inferSchema=True, header=True)
    nyc_2019 = nyc_311_df_2019.drop('created_date')
    
    # Code to make categorical data into columns (Agenct, Borough, Complaint Type, Channel) 
    agencies = nyc_2019.select("Agency").distinct().rdd.flatMap(lambda x: x).collect()
    boroughs = nyc_2019.select("Borough").distinct().rdd.flatMap(lambda x: x).collect()
    complain_types = nyc_2019.select("complaint_type").distinct().rdd.flatMap(lambda x: x).collect()
    open_data_channel_types = nyc_2019.select("open_data_channel_type").distinct().rdd.flatMap(lambda x: x).collect()
    
    # filling new column with value 1 if belong to particular category
    agencies_expr = [F.when(F.col("Agency") == ty, 1).otherwise(0).alias("e_AGENCY_" + ty) for ty in agencies]
    boroughs_expr = [F.when(F.col("Borough") == code, 1).otherwise(0).alias("e_BOROUGH_" + code) for code in boroughs]
    complain_types_expr = [F.when(F.col("complaint_type") == ty, 1).otherwise(0).alias("e_COMPLAIN_TYPE_" + ty) for ty in complain_types]
    open_data_channel_types_expr = [F.when(F.col("open_data_channel_type") == code, 1).otherwise(0).alias("e_CHANNEL_TYPE_" + code) for code in open_data_channel_types]
    
    nyc_2019_new = nyc_2019.select("Creation_Month", "Creation_Day", "Creation_Hour", 'time_to_resolve_in_hrs', *agencies_expr+boroughs_expr+complain_types_expr+open_data_channel_types_expr)
    
    nyc_2019_new.cache()
    # Save new csv for prepared data to be used in model Learning
    nyc_2019_new.coalesce(1).write.format('com.databricks.spark.csv').save('311 Learning Data.csv',header = 'true')

In [3]:

spark = init_spark()
nyc_311_df_2019 = spark.read.csv("data/New_Data/311 Learning Data.csv", inferSchema=True, header=True)
nyc_311_df_2019.printSchema()

root
 |-- Creation_Month: integer (nullable = true)
 |-- Creation_Day: integer (nullable = true)
 |-- Creation_Hour: integer (nullable = true)
 |-- time_to_resolve_in_hrs: double (nullable = true)
 |-- e_AGENCY_HPD: integer (nullable = true)
 |-- e_AGENCY_NYPD: integer (nullable = true)
 |-- e_AGENCY_DEP: integer (nullable = true)
 |-- e_AGENCY_DSNY: integer (nullable = true)
 |-- e_AGENCY_DOITT: integer (nullable = true)
 |-- e_BOROUGH_UNSPECIFIED: integer (nullable = true)
 |-- e_BOROUGH_QUEENS: integer (nullable = true)
 |-- e_BOROUGH_BROOKLYN: integer (nullable = true)
 |-- e_BOROUGH_BRONX: integer (nullable = true)
 |-- e_BOROUGH_MANHATTAN: integer (nullable = true)
 |-- e_BOROUGH_STATEN ISLAND: integer (nullable = true)
 |-- e_COMPLAIN_TYPE_UNSANITARY CONDITION: integer (nullable = true)
 |-- e_COMPLAIN_TYPE_Illegal Parking: integer (nullable = true)
 |-- e_COMPLAIN_TYPE_Noise - Residential: integer (nullable = true)
 |-- e_COMPLAIN_TYPE_Noise - Commercial: integer (nullable = tr

In [4]:
# Feature vector
def getLearningDataWithFeatureVector(learningData= "data/New_Data/311 Learning Data.csv"):
    nyc_311_df_2019 = spark.read.csv(learningData, inferSchema=True, header=True)
    assembler = VectorAssembler(inputCols=['Creation_Month', 'Creation_Day','Creation_Hour','e_AGENCY_HPD', 'e_AGENCY_NYPD', 'e_AGENCY_DEP', 'e_AGENCY_DSNY', 'e_AGENCY_DOITT', 'e_BOROUGH_UNSPECIFIED', 'e_BOROUGH_BROOKLYN', 'e_BOROUGH_BRONX', 'e_BOROUGH_MANHATTAN', 'e_BOROUGH_STATEN ISLAND', 'e_COMPLAIN_TYPE_UNSANITARY CONDITION', 'e_COMPLAIN_TYPE_Illegal Parking', 'e_COMPLAIN_TYPE_Noise - Residential', 'e_COMPLAIN_TYPE_Noise - Commercial', 'e_COMPLAIN_TYPE_Water System', 'e_COMPLAIN_TYPE_Blocked Driveway', 'e_COMPLAIN_TYPE_HEAT/HOT WATER', 'e_COMPLAIN_TYPE_PAINT/PLASTER', 'e_COMPLAIN_TYPE_PAINT/PLASTER', 'e_COMPLAIN_TYPE_Noise', 'e_COMPLAIN_TYPE_Request Large Bulky Item Collection', 'e_COMPLAIN_TYPE_PLUMBING', 'e_COMPLAIN_TYPE_WATER LEAK', 'e_COMPLAIN_TYPE_Noise - Street/Sidewalk', 'e_CHANNEL_TYPE_MOBILE', 'e_CHANNEL_TYPE_UNKNOWN', 'e_CHANNEL_TYPE_OTHER', 'e_CHANNEL_TYPE_PHONE', 'e_CHANNEL_TYPE_ONLINE'],
    outputCol="features")
    output = assembler.transform(nyc_311_df_2019)
    X = output.select("features", "time_to_resolve_in_hrs").withColumnRenamed("time_to_resolve_in_hrs","label")
    return X

In [5]:
def trainWithLinearRegressor(X):
    train, test = X.randomSplit([0.8, 0.2], seed=12345)

    lr = LinearRegression()

    # We use a ParamGridBuilder to construct a grid of parameters to search over.
    # TrainValidationSplit will try all combinations of values and determine best model using
    # the evaluator.
    paramGrid = ParamGridBuilder()\
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .addGrid(lr.fitIntercept, [False, True])\
        .addGrid(lr.maxIter, [100, 150, 200])\
        .build()

    # In this case the estimator is simply the linear regression.
    # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    tvs = CrossValidator(estimator=lr,
                               estimatorParamMaps=paramGrid,
                               evaluator=RegressionEvaluator(),)

    # Run TrainValidationSplit, and choose the best set of parameters.
    model = tvs.fit(train)

    # Make predictions on test data. model is the model with combination of parameters
    # that performed best.
    model.transform(test)\
        .select("features", "label", "prediction")\
        .show()
    print("Linear Regression Result")
    print("RootMeanSquare:")
    print(model.bestModel.summary.rootMeanSquaredError)
    print("R2:")
    print(model.bestModel.summary.r2)

In [6]:
def startTrainingLinearRegressor():
    trainWithLinearRegressor(getLearningDataWithFeatureVector())

In [7]:
startTrainingLinearRegressor()

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|(32,[0,1,2,3,9,13...| 749.1591666666667|420.03701095705264|
|(32,[0,1,2,3,9,13...|1039.5022222222221| 419.2431538561919|
|(32,[0,1,2,3,9,13...|          312.7975| 419.5869840435109|
|(32,[0,1,2,3,9,13...| 144.8177777777778|421.99379535474367|
|(32,[0,1,2,3,9,13...|            429.79|422.33762554206265|
|(32,[0,1,2,3,9,13...|374.62583333333333| 422.6814557293816|
|(32,[0,1,2,3,9,13...|20.548333333333332|423.02528591670057|
|(32,[0,1,2,3,9,13...|153.44361111111112| 424.0567764786575|
|(32,[0,1,2,3,9,13...| 464.6483333333333|419.13695712996906|
|(32,[0,1,2,3,9,13...| 98.28472222222223| 421.1999382538829|
|(32,[0,1,2,3,9,13...| 91.24972222222222|421.88759862852083|
|(32,[0,1,2,3,9,13...|305.48555555555555|422.57525900315875|
|(32,[0,1,2,3,9,13...|             85.68|422.91908919047773|
|(32,[0,1,2,3,9,13...| 3

In [8]:
def trainWithRandomForest(X):
    train, test = X.randomSplit([0.8, 0.2], seed=12345)

    rf = RandomForestRegressor()

    # We use a ParamGridBuilder to construct a grid of parameters to search over.
    # TrainValidationSplit will try all combinations of values and determine best model using
    # the evaluator.
    paramGrid = ParamGridBuilder()\
        .addGrid(rf.numTrees, [35, 50]) \
        .addGrid(rf.maxDepth, [7, 10])\
        .build()

    # In this case the estimator is simply the linear regression.
    # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    tvs = CrossValidator(estimator=rf,
                               estimatorParamMaps=paramGrid,
                               evaluator=RegressionEvaluator())

    # Run TrainValidationSplit, and choose the best set of parameters.
    model = tvs.fit(train)

    # Make predictions on test data. model is the model with combination of parameters
    # that performed best.
    predictions = model.transform(test)
    predictions.select("features", "label", "prediction")\
        .show()
    # return model
    #print("Linear Regression Result")
    #print("RootMeanSquare:")
    #print(model.bestModel.summary.rootMeanSquaredError)
    #print("R2:")
    #print(model.bestModel.summary.r2)
    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print("R2 (R2) on test data = %g" % r2)
    

In [9]:
def startTrainingRandomForest():
    trainWithRandomForest(getLearningDataWithFeatureVector())

In [None]:
temp = startTrainingRandomForest()

In [None]:
# temp.bestModel.summary

In [12]:
def trainWithGBT(X):
    train, test = X.randomSplit([0.8, 0.2], seed=12345)

    rf = GBTRegressor()

    # We use a ParamGridBuilder to construct a grid of parameters to search over.
    # TrainValidationSplit will try all combinations of values and determine best model using
    # the evaluator.
    paramGrid = ParamGridBuilder()\
        .addGrid(rf.maxIter, [50, 100]) \
        .addGrid(rf.maxDepth, [5])\
        .build()

    # In this case the estimator is simply the linear regression.
    # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    tvs = CrossValidator(estimator=rf,
                               estimatorParamMaps=paramGrid,
                               evaluator=RegressionEvaluator())

    # Run TrainValidationSplit, and choose the best set of parameters.
    model = tvs.fit(train)

    # Make predictions on test data. model is the model with combination of parameters
    # that performed best.
    predictions = model.transform(test)
    predictions.select("features", "label", "prediction")\
        .show()
    # return model
    #print("Linear Regression Result")
    #print("RootMeanSquare:")
    #print(model.bestModel.summary.rootMeanSquaredError)
    #print("R2:")
    #print(model.bestModel.summary.r2)
    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print("R2 (R2) on test data = %g" % r2)

In [13]:
def startTrainingGBT():
    trainWithGBT(getLearningDataWithFeatureVector())

In [None]:
startTrainingGBT()