In [115]:
import seaborn as sn
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
from pprint import pprint
import random
from sklearn import preprocessing
from __future__ import division
import pickle
from pyproj import Proj, transform
from math import sqrt 

# Seed Python's built-in RNG so anything drawing from `random` is repeatable
random.seed(42)

# Input CSV: the "uncorrupted" part of the parking citations dataset
filename = '../data/parking_citations_uncorrupted.csv'

In [34]:
# Get headers and data types
# Only the header row is read here; spaces in the raw column names are
# replaced with underscores so every column has one consistent identifier.
with open(filename) as f:
    reader = csv.reader(f)
    # next(reader) works on Python 2 and 3; reader.next() is Python 2 only
    columns = next(reader)

names = [col.replace(" ", "_") for col in columns]

# Explicit dtypes, keyed by the sanitized names in `names`.
# The keys must match those names exactly: the previous 'Issue_Time',
# 'Body Style' and 'Violation_Code' keys did not match the columns the rest
# of the notebook uses ('Issue_time', 'Body_Style', 'Violation_code'), so
# those columns were left to type inference.
# NOTE(review): this relies on read_csv skipping dtype keys that match no
# column - confirm against the pandas version in use.
dtypes = {
    'Ticket_number': 'unicode' ,
    'Issue_Date': 'unicode' ,
    # Numeric, not text: a median is taken over this column further down
    'Issue_time': np.float64 ,
    'Meter_Id': 'unicode',
    'Marked_Time': 'unicode' ,
    'RP_State_Plate': 'unicode',
    'Plate_Expiry_Date': 'unicode' ,
    'VIN': 'unicode' ,
    'Make': 'unicode' ,
    'Body_Style': 'unicode' ,
    'Color': 'unicode' ,
    'Location': 'unicode' ,
    'Route': 'unicode' ,
    'Agency': 'unicode' ,
    'Violation_code': 'unicode' ,
    'Violation_Description': 'unicode' ,
    'Fine_amount': np.float64 ,
    'Latitude': np.float64 ,
    'Longitude': np.float64 ,
}

In [63]:
# Parse the whole CSV: the file's own header row (row 0) is replaced by the
# sanitized column names, and the explicit dtypes are applied on read
print("Reading data from file " + filename)

read_options = dict(header=0, names=names, dtype=dtypes)
citations = pd.read_csv(filename, **read_options)

In [65]:
# Label dataset by top 25 makes
# Citation count per make, most frequent first
top_25_makes = citations.groupby(['Make']).size()
top_25_makes = top_25_makes.sort_values(ascending=False)
make_names = set(top_25_makes.index[:25])

# Binary target stored as a categorical: 1 when the row's make is one of the
# 25 most frequent, 0 otherwise (rows with a missing make get 0)
is_top_make = citations['Make'].isin(make_names)
citations['top_25_makes'] = is_top_make.astype('int64').astype('category')

In [74]:
print("Formatting and transforming data")
# Date formatting and conversion to whole days since the Unix epoch
unix_epoch = pd.Timestamp('1970-01-01')

citations['Issue_Date'] = pd.to_datetime(citations['Issue_Date'])

# Plate_Expiry_Date is read as text; missing values become '' and the last
# two characters are dropped before parsing as year + month.
# NOTE(review): the dropped suffix is presumably a trailing ".0" - confirm.
# The format must use %m (month). The previous %M means *minute*, which
# parsed the month digits as minutes and pinned every date to January.
# Values that do not parse are coerced to NaT.
citations['Plate_Expiry_Date'] = citations['Plate_Expiry_Date'].fillna('')
citations['Plate_Expiry_Date'] = pd.to_datetime(
    citations['Plate_Expiry_Date'].str[:-2],
    format='%Y%m',
    errors='coerce',
)

# Subtracting the epoch yields a timedelta explicitly, instead of relying on
# pd.to_timedelta reinterpreting a datetime column.
citations['Issue_Date'] = (citations['Issue_Date'] - unix_epoch).dt.days

# Unparseable / missing expiry dates (NaT) are mapped to 0 days
citations['Plate_Expiry_Date'] = (
    (citations['Plate_Expiry_Date'] - unix_epoch)
    .fillna(pd.Timedelta(0))
    .dt.days
)

# Remove the columns judged to have too many NA's (modifies `citations` in place)
drop_vars = ['VIN', 'Marked_Time', 'Meter_Id']
citations.drop(labels=drop_vars, axis='columns', inplace=True)

# Continuous variables: replace missing values with the column median
for numeric_col in ('Issue_time', 'Fine_amount'):
    col_median = citations[numeric_col].median()
    citations[numeric_col] = citations[numeric_col].fillna(col_median)
# Plate_Expiry_Date is not imputed in this step.

# Categorical variables: cast to pandas 'category' dtype, then replace
# missing values with the column's most frequent value
categorical_vars = [
    'RP_State_Plate',
    'Body_Style',
    'Color',
    'Route',
    'Agency',
    'Violation_code',
    'Violation_Description',
    'Location',
]

for name in categorical_vars:
    as_category = citations[name].astype('category')
    # mode() can return several values on ties; the first one is used
    most_common = as_category.mode().values[0]
    citations[name] = as_category.fillna(most_common)

  import sys
  


RP_State_Plate
Body_Style
Color
Route
Agency
Violation_code
Violation_Description
Location


In [131]:
# Convert latitude/longitude into distance from center of LA
# The coordinates use a US-feet projection, so the plain Cartesian
# (Euclidean) distance formula should apply
# http://www.earthpoint.us/StatePlane.aspx, (34.0522,-118.2437)
la_lat = 6487847
la_lon = 1841468

lat_offset = citations['Latitude'] - la_lat
lon_offset = citations['Longitude'] - la_lon

# 5280 ft per mile, so the result is in miles assuming the inputs are in feet
citations['distance_from_la'] = np.sqrt(lat_offset**2 + lon_offset**2) / 5280

In [103]:
# High-cardinality categoricals are label-encoded (one integer code per
# distinct value) - this worked better than one-hot encoding
categorical_vars = [
    'Agency',
    'Color',
    'Route',
    'Violation_code',
    'Violation_Description',
    'Location',
    'Body_Style',
    'RP_State_Plate',
]

for name in categorical_vars:
    # A fresh encoder per column; fitting and transforming happen in one call
    le = preprocessing.LabelEncoder()
    citations[name] = le.fit_transform(citations[name])


  result = method(y)


Agency
Color
Route
Violation_code
Violation_Description
Location
Body_Style
RP_State_Plate


In [179]:
# Model inputs, listed in the column order used to build the feature matrix
feature_cols = [
    'Issue_Date', 'Issue_time', 'Plate_Expiry_Date',
    'Color', 'Location', 'Route', 'Agency',
    'Violation_code', 'Violation_Description',
    'Fine_amount', 'Body_Style', 'RP_State_Plate',
    'distance_from_la',
]

# Prediction target column(s)
target_cols = ['top_25_makes']

In [172]:
# Target array. Selecting with a list of columns produces a 2-D (n, k) array;
# with a single target column it is flattened to shape (n,), the 1-D form
# scikit-learn classifiers expect for y.
# NOTE(review): the stray warning text after the training cell looks like
# the column-vector-y DataConversionWarning - confirm it disappears.
labels = np.array(citations[target_cols])
if labels.ndim == 2 and labels.shape[1] == 1:
    labels = labels.ravel()

# Feature matrix, one column per entry of feature_cols (same order)
features = np.array(citations[feature_cols])


In [177]:
print("Training Model")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
split = train_test_split(features, labels, test_size=0.25, random_state=42)
train_features, test_features, train_labels, test_labels = split

# Random forest of 100 decision trees, each limited to depth 20
rf_params = dict(n_estimators=100, random_state=42, max_depth=20)
rf = RandomForestClassifier(**rf_params)

# Fit on the training portion only
rf.fit(train_features, train_labels)

  del sys.path[0]


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [None]:
print("Saving Model")

# Serialize the fitted classifier with pickle (binary mode) under ../app/models
model_path = '../app/models/random_forest_classifier.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(rf, model_file)