In [25]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, IndexToString
import pyspark.sql.functions as f
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

In [27]:
# Create (or reuse) the Spark session, registering the gcs-connector jar
# via "spark.jars" so that gs:// paths can be read.
spark = (
    SparkSession.builder
    .config("spark.jars", "/path/to/gcs-connector-hadoop2-latest.jar")
    .getOrCreate()
)

# Load the training CSV; the first row supplies the column names.
# NOTE: no "inferSchema" option is set on the reader.
csv_reader = spark.read.format("com.databricks.spark.csv").option("header", "true")
df = csv_reader.load("gs://bdl2021_final_project/nyc_tickets_train.csv")

In [28]:
# Columns discarded before feature engineering.
dropped_columns = [
    'Time First Observed',
    'Intersecting Street',
    'Law Section',
    'Violation Legal Code',
    'From Hours In Effect',
    'To Hours In Effect',
    'Unregistered Vehicle?',
    'Meter Number',
    'Violation Description',
    'No Standing or Stopping Violation',
    'Hydrant Violation',
    'Double Parking Violation',
    'Latitude',
    'Longitude',
    'Community Board',
    'Community Council',
    'Census Tract',
    'BIN',
    'BBL',
    'NTA',
]
df2 = df.drop(*dropped_columns)

Categorical Encoding

In [29]:
# Columns treated as categorical: each one is string-indexed and then
# one-hot encoded further down.
cat_cols = [
    'Registration State',
    'Plate Type',
    'Violation Code',
    'Vehicle Body Type',
    'Vehicle Make',
    'Issuing Agency',
    'Issuer Code',
    'Issuer Command',
    'Issuer Squad',
    'Violation Time',
    'Violation In Front Of Or Opposite',
    'House Number',
    'Street Name',
    'Sub Division',
    'Days Parking In Effect',
    'Vehicle Color',
    'Vehicle Year',
    'Feet From Curb',
    'Violation Post Code',
]

In [30]:
# Names of the derived columns: "<col>_Index" is the StringIndexer output and
# "<col>_Onehot" is the one-hot encoder output for each categorical column.
# These are built as real lists rather than with map(): on Python 3 map()
# returns a lazy, single-use iterator that cannot be concatenated to a list
# and is exhausted after its first traversal.
cat_cols_indexed = [name + '_Index' for name in cat_cols]
cat_cols_onehot = [name + '_Onehot' for name in cat_cols]

In [None]:
# One StringIndexer per categorical column, writing to "<col>_Index".
# handleInvalid is set to "keep" on every indexer.
featureIndexers = [
    StringIndexer(inputCol=name, outputCol=name + '_Index').setHandleInvalid("keep")
    for name in cat_cols
]

In [40]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator
# One-hot encode every "<col>_Index" column into "<col>_Onehot".
# The column-name lists are derived directly from cat_cols so the encoder
# always receives concrete lists; a lazy map() object (what Python 3 produces)
# is not a list and may already have been consumed by another cell.
OHE = OneHotEncoderEstimator(
    inputCols=[name + '_Index' for name in cat_cols],
    outputCols=[name + '_Onehot' for name in cat_cols],
)

In [41]:
from pyspark.ml.feature import VectorAssembler
# Candidate columns considered for the model input.
candidate_columns = [
    'Summons Number', 'Plate ID', 'Registration State', 'Plate Type',
    'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
    'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3',
    'Vehicle Expiration Date', 'Issuer Code', 'Issuer Command',
    'Issuer Squad', 'Violation Time', 'Violation_County', 'Violation In Front Of Or Opposite',
    'House Number', 'Street Name', 'Date First Observed', 'Sub Division',
    'Days Parking In Effect', 'Vehicle Color', 'Vehicle Year', 'Feet From Curb', 'Violation Post Code',
]

label_column = 'Violation_County'  # feature to predict

# Columns that are neither categorical features nor the label and are left
# out of the feature vector.
non_feature_columns = [
    'Summons Number', 'Plate ID', 'Vehicle Expiration Date',
    'Issue Date', 'Street Code1', 'Street Code2', 'Street Code3',
    'Date First Observed',
]

# Raw categorical columns are replaced by their one-hot counterparts; the
# label and the non-feature columns are excluded outright.
excluded_columns = set(cat_cols) | set(non_feature_columns) | {label_column}

# The one-hot names are derived from cat_cols as a concrete list so the
# concatenation works even when cat_cols_onehot is a lazy map() object
# (Python 3), which cannot be added to a list.
columns = [name for name in candidate_columns if name not in excluded_columns]
columns += [name + '_Onehot' for name in cat_cols]
columns

['Registration State_Onehot',
 'Plate Type_Onehot',
 'Violation Code_Onehot',
 'Vehicle Body Type_Onehot',
 'Vehicle Make_Onehot',
 'Issuing Agency_Onehot',
 'Issuer Code_Onehot',
 'Issuer Command_Onehot',
 'Issuer Squad_Onehot',
 'Violation Time_Onehot',
 'Violation In Front Of Or Opposite_Onehot',
 'House Number_Onehot',
 'Street Name_Onehot',
 'Sub Division_Onehot',
 'Days Parking In Effect_Onehot',
 'Vehicle Color_Onehot',
 'Vehicle Year_Onehot',
 'Feet From Curb_Onehot',
 'Violation Post Code_Onehot']

In [42]:
# Concatenate all feature columns into a single column named 'vector'.
assembler = VectorAssembler().setInputCols(columns).setOutputCol('vector')

In [46]:
# Index the target column into a numeric "label" column. The indexer is
# fitted on df2 here (ahead of the pipeline) so that its `labels` are
# available for mapping predictions back to strings.
labelIndexer = StringIndexer(
    inputCol='Violation_County',
    outputCol="label",
    handleInvalid="skip",
).fit(df2)

In [47]:
# Random forest over the assembled 'vector' column, predicting 'label'.
# A fixed seed is supplied so repeated runs of the notebook train the same
# forest; without it the result changes from run to run.
classifier = RandomForestClassifier(featuresCol='vector', labelCol='label', seed=42)

In [48]:
# Map the numeric "prediction" column back to the original county strings,
# using the label ordering from the fitted labelIndexer.
outputLabel = IndexToString(
    inputCol="prediction",
    outputCol='Violation_County_Prediction',
    labels=labelIndexer.labels,
)

In [49]:
# Stage order: per-column string indexers, one-hot encoder, vector assembler,
# label indexer, classifier, then conversion of predictions back to strings.
pipeline = Pipeline().setStages(
    featureIndexers + [OHE, assembler, labelIndexer, classifier, outputLabel]
)

In [None]:
# Fit the whole pipeline on df2; `model` is the resulting fitted pipeline.
model = pipeline.fit(df2)