In [13]:
import pandas as pd
import numpy as np
import csv
from pprint import pprint
import random
from sklearn import preprocessing
from __future__ import division
import pickle
import h2o

filename = '../data/parking_citations.corrupted.csv'
random.seed(42)

In [2]:
# Get headers and data types
columns = []
with open(filename) as f:
    reader = csv.reader(f)
    columns = reader.next()
    
names = [col.replace(" ","_") for col in columns]

dtypes = {
    'Ticket_number': 'unicode' ,
    'Issue_Date': 'unicode' ,
    'Issue_Time':'unicode'  ,
    'Meter_Id': 'unicode',
    'Marked_Time': 'unicode' ,
    'RP_State_Plate': 'unicode',
    'Plate_Expiry_Date': 'unicode' ,
    'VIN': 'unicode' ,
    'Make': 'unicode' ,
    'Body Style': 'unicode' ,
    'Color': 'unicode' ,
    'Location': 'unicode' ,
    'Route': 'unicode' ,
    'Agency': 'unicode' ,
    'Violation_Code': 'unicode' ,
    'Violation_Description': 'unicode' ,
    'Fine_amount': np.float64 ,
    'Latitude': np.float64 ,
    'Longitude': np.float64 ,
}

parse_dates = [
    'Issue_Date' ,
]

In [3]:
# Load data from file
citations = pd.read_csv(
    filename ,
    header = 0 ,
    names = names ,
    dtype = dtypes ,
    parse_dates = parse_dates ,
)

citations = citations[citations.Make.notnull()]

In [4]:
# Optional: take a random sample for exploration purposes
citations = citations.sample(frac = 0.1)

In [5]:
# Label dataset by top 25 makes
top_25_makes = citations.groupby(['Make']).size().sort_values(ascending=False)
make_names = set(top_25_makes.index[:25])

citations['top_25_makes'] = citations['Make'].apply(lambda x: (x in make_names)*1)
citations['top_25_makes'] = citations['top_25_makes'].astype('category')

# Convert date to days since epoch
citations['Issue_Date_Days'] = (citations['Issue_Date'] - pd.datetime(1970,1,1)).dt.days



In [9]:
citations.describe(include='all')

Unnamed: 0,Ticket_number,Issue_Date,Issue_time,RP_State_Plate,Plate_Expiry_Date,Make,Body_Style,Color,Location,Route,Agency,Violation_code,Violation_Description,Fine_amount,Latitude,Longitude,top_25_makes,Issue_Date_Days
count,4357544.0,4357544,4357544.0,4357544.0,4357544.0,4357544,4357544.0,4357544.0,4357395,4357544.0,4357544.0,4357544.0,4357544.0,4357544.0,4357543.0,4357543.0,4357544.0,4357544.0
unique,4357544.0,1724,,,710.0,1477,,,1047757,,,,,,,,2.0,
top,4254111606.0,2016-01-19 00:00:00,,,201702.0,TOYT,,,1301 ELECTRIC AVE,,,,,,,,1.0,
freq,1.0,4998,,,396547.0,721411,,,4719,,,,,,,,3986214.0,
first,,2010-01-09 00:00:00,,,,,,,,,,,,,,,,
last,,2019-01-10 00:00:00,,,,,,,,,,,,,,,,
mean,,,1203.869,8.997933,,,78.75941,37.823,,911.9001,21.06232,162.5721,481.5551,70.10722,5503485.0,1587400.0,,17153.37
std,,,472.2119,9.383043,,,11.89833,27.60509,,1483.717,3.879709,52.71173,73.33203,32.04423,3025596.0,2098413.0,,418.7181
min,,,0.0,0.0,,,0.0,0.0,,0.0,0.0,0.0,0.0,10.0,99999.0,99999.0,,14618.0
25%,,,912.0,7.0,,,77.0,7.0,,249.0,19.0,143.0,465.0,63.0,6421524.0,1821596.0,,16793.0


In [88]:
# Check null counts for each column
cols = citations.columns
for c in cols:
    print(c)
    print(citations[c].isnull().sum())

Ticket_number
0
Issue_Date
0
Issue_time
0
Meter_Id
3223695
Marked_Time
4212622
RP_State_Plate
391
Plate_Expiry_Date
396547
VIN
4349402
Make
0
Body_Style
3990
Color
1514
Location
149
Route
31677
Agency
6
Violation_code
0
Violation_Description
431
Fine_amount
3195
Latitude
1
Longitude
1
top_25_makes
0
Issue_Date_Days
0


In [6]:
# Drop vars with too many NA's
drop_vars = [
    'VIN' ,
    'Marked_Time' ,
    'Meter_Id'
]
try:
    citations.drop(drop_vars , axis=1, inplace=True)
except KeyError:
    print("Already dropped columns.")

# Fill in nulls as appropriate
citations['Issue_time'] = citations['Issue_time'].fillna(citations['Issue_time'].median())
citations['Fine_amount'] = citations['Fine_amount'].fillna(citations['Fine_amount'].median())
citations['Plate_Expiry_Date'] = citations['Plate_Expiry_Date'].fillna(citations['Plate_Expiry_Date'].median())

# Fill in nulls for categorical variables
categorical_vars = [
    'RP_State_Plate' ,
    'Body_Style' ,
    'Color' ,
    'Route' ,
    'Agency' ,
    'Violation_code' ,
    'Violation_Description' ,
    'Location'
]

for v in categorical_vars:
    print(v)
    citations[v] = citations[v].astype('category')
    citations[v] = citations[v].fillna(citations[v].mode().values[0])

RP_State_Plate
Body_Style
Color
Route
Agency
Violation_code
Violation_Description
Location


In [7]:
# Encode categorical variables
#for v in categorical_vars:
#    le = preprocessing.LabelEncoder()
#    le.fit(citations[v])
#    citations[v] = le.transform(citations[v])

In [11]:
feature_cols = [
    'Issue_Date_Days' ,
    'Issue_time' ,
    'RP_State_Plate' ,
    'Plate_Expiry_Date' ,
    'Body_Style' ,
    'Color' ,
    'Location' ,
    'Route' ,
    'Agency' ,
    'Violation_code' ,
    'Violation_Description' ,
    'Fine_amount' ,
    #'Latitude' , # These numbers too large for float32 and break RF classifier.
    #'Longitude'
]

target_cols = [
    'top_25_makes' ,
]

all_cols = feature_cols + target_cols
citations = citations[all_cols]

In [19]:
from sklearn.model_selection import train_test_split
from h2o.estimators import H2ORandomForestEstimator
h2o.init()

train, test = train_test_split(citations, test_size=0.25)
train = h2o.H2OFrame(train)

model = H2ORandomForestEstimator(ntrees=100,max_depth=20,nfolds=10)
model.train(x=feature_cols , y=target_cols[0], training_frame=train)


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,49 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.2
H2O cluster version age:,22 days
H2O cluster name:,H2O_from_python_ec2_user_n098qo
H2O cluster total nodes:,1
H2O cluster free memory:,1.670 Gb
H2O cluster total cores:,2
H2O cluster allowed cores:,2


Parse progress: |█████████████████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%


In [21]:
test = h2o.H2OFrame(test)
performance = model.model_performance(test_data=test)

print performance

Parse progress: |█████████████████████████████████████████████████████████| 100%

ModelMetricsRegression: drf
** Reported on test data. **

MSE: 0.0689917752794
RMSE: 0.262662854777
MAE: 0.136890989669
RMSLE: 0.182661552012
Mean Residual Deviance: 0.0689917752794



In [26]:
# Save H2O Model
h2o.save_model(model, "../app/models/")

u'/home/ec2-user/grainger_data_science/app/models/DRF_model_python_1566103591348_1'

In [None]:
h2o.s

In [141]:
# Fill in 99999.0 values for longitude; replace with most common lat/lon pair
# Actually probably don't want to do this...missing lat/lon is enough information on its own.
"""
citations['coord'] = citations.Latitude.map(str) + "," + citations.Longitude.map(str)

coord_mode = citations.loc[citations['Latitude']!=99999.0].coord.mode()
coord_mode_lat = float(coord_mode.values[0].split(",")[0])
coord_mode_lon = float(coord_mode.values[0].split(",")[0])

citations.loc[citations['Latitude'] == 99999.0 , ['Latitude']] = coord_mode_lat
citations.loc[citations['Longitude'] == 99999.0 , ['Longitude']] = coord_mode_lon
"""

In [None]:
# Check cardinality of categorical variables
for v in categorical_vars:
    print(v)
    print(len(citations[v].unique()))

In [17]:
# Check skew of labels
split = (citations['top_25_makes'].value_counts()) / citations.shape[0]
print(split)

1    0.914785
0    0.085215
Name: top_25_makes, dtype: float64
