In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc,roc_auc_score
from catboost import CatBoostClassifier,Pool, cv
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from datetime import time
sns.set()
%matplotlib inline

import geopandas
from geopandas import GeoDataFrame
from shapely.geometry import Point

import warnings
warnings.filterwarnings("ignore")

**1) This part is based on prior feature engineering notebook**

https://github.com/hackforla/lucky-parking/blob/master/notebooks/LuckyParkingAnalysis.ipynb

In [3]:
df = pd.read_csv('../input/parking-citation-data/Parking_Citations_After_July_1_2015.csv')
print(df.shape)

(8706412, 18)


In [4]:
df.head()

Unnamed: 0,Ticket number,Issue Date,Issue time,Meter Id,Marked Time,RP State Plate,VIN,Make,Body Style,Color,Location,Route,Agency,Violation code,Violation Description,Fine amount,Latitude,Longitude
0,4361370075,2019/09/17 12:00:00 AM,817.0,,,LA,,MERC,PA,RD,1324 84TH PL W,551,55.0,80.69BS,NO PARK/STREET CLEAN,73,6471017.0,1808559.0
1,4361370064,2019/09/17 12:00:00 AM,811.0,VM48,,CA,,BUIC,PA,GN,1000 85TH ST W,551,55.0,88.13B+,METER EXP.,63,6473192.0,1808298.0
2,4361370053,2019/09/17 12:00:00 AM,809.0,,,CA,,TOYT,PA,GY,1000 85TH ST W,551,55.0,80.69BS,NO PARK/STREET CLEAN,73,6473192.0,1808298.0
3,4361370042,2019/09/17 12:00:00 AM,805.0,,,CA,,NISS,PA,BK,8021 VERMONT AVE,551,55.0,80.69BS,NO PARK/STREET CLEAN,73,6473164.0,1810388.0
4,4361370031,2019/09/17 12:00:00 AM,727.0,,,CA,,HOND,PA,GY,1473 45TH ST W,551,55.0,22500E,BLOCKING DRIVEWAY,68,6469917.0,1823537.0


In [5]:
df = df[df['Latitude']!=99999]
df = df[df['Longitude']!=99999]
print(df['Latitude'].min(), df['Latitude'].max())
print(df['Longitude'].min(), df['Longitude'].max())

6359739.30402891 6514332.07372465
1715555.176091 1942668.996228


In [6]:
x = (1850000<df['Longitude'])&(df['Longitude']<1875000)& (6490000<df['Latitude'])&(df['Latitude']<6520000)
sdf = df[x]
print(sdf.shape)

(192383, 18)


**2) Only include the following variables for further processing**

In [7]:
final_sdf=sdf[['Issue Date', 'Issue time', 'RP State Plate', 'Agency',
               'Location', 'Fine amount', 'Violation Description', 'Latitude', 'Longitude']]
print(final_sdf.shape)

(192383, 9)


In [8]:
final_sdf.head()

Unnamed: 0,Issue Date,Issue time,RP State Plate,Agency,Location,Fine amount,Violation Description,Latitude,Longitude
152,2019/09/17 12:00:00 AM,2103.0,CA,56.0,5033 LINCOLN AVE,58,OFF STR/OVERTIME/MTR,6499186.0,1866284.0
153,2019/09/17 12:00:00 AM,2102.0,CA,56.0,5033 LINCOLN AVE,58,OFF STR/OVERTIME/MTR,6499186.0,1866284.0
154,2019/09/17 12:00:00 AM,2100.0,CA,56.0,5033 LINCOLN AVE,58,OFF STR/OVERTIME/MTR,6499186.0,1866284.0
155,2019/09/17 12:00:00 AM,2050.0,CA,56.0,123 AVE 57 N,58,OFF STR/OVERTIME/MTR,6503242.0,1862387.0
156,2019/09/17 12:00:00 AM,2049.0,CA,56.0,123 AVE 57 N,58,OFF STR/OVERTIME/MTR,6503242.0,1862387.0


1. **3) Feature Mapping for "RP State Plate" and create new variable "State"**

In [9]:
#Create a list to store the data
state_plate = []

#For each row in the column,
for row in final_sdf['RP State Plate']:
    if row == 'MX':
        state_plate.append('Others')
    elif row == 'VN':
        state_plate.append('Others')
    elif row == 'BC':
        state_plate.append('Others')
    elif row == 'AB':
        state_plate.append('Others')
    elif row == 'ON':
        state_plate.append('Others')
    elif row == 'NB':
        state_plate.append('Others')
    elif row == 'QU':
        state_plate.append('Others')
    elif row == 'XX':
        state_plate.append('Others')
    elif row == 'FN':
        state_plate.append('Others')
    elif row == 'CN':
        state_plate.append('Others')
    elif row == 'CZ':
        state_plate.append('Others')
    elif row == 'MB':
        state_plate.append('Others')
    elif row == 'AS':
        state_plate.append('Others')
    elif row == 'US':
        state_plate.append('Others')
    elif row == 'NS':
        state_plate.append('Others')
    elif row == 'SA':
        state_plate.append('Others')
    else:
        state_plate.append(row)

final_sdf['State'] = state_plate
final_sdf['State'] = final_sdf['State'].astype('object')

**4) Another Feature Mapping to group into Meter Expired Citation vs Meter Occupied Citation**

In [10]:
#Create a list to store the data
meter_exp = []

#For each row in the column,
for row in final_sdf['Violation Description']:
    if row == 'METER EXP.':
        meter_exp.append(1)
    elif row == 'PARKED OVER TIME LIMIT':
        meter_exp.append(1)
    elif row == 'PARKED OVER TIME LIM':
        meter_exp.append(1)
    elif row == 'METER EXPIRED':
        meter_exp.append(1)
    elif row == 'COMM VEH OVER TIME LIMIT':
        meter_exp.append(1)
    elif row == 'OFF STR MTR/OUT LINE':
        meter_exp.append(1)
    elif row == 'NO EVIDENCE OF REG':
        meter_exp.append(1)
    elif row == 'OUTSIDE LINES/METER':
        meter_exp.append(1)
    elif row == 'TIME LIMIT/CITY LOT':
        meter_exp.append(1)
    elif row == 'EXPIRED TAGS':
        meter_exp.append(1)
    elif row == 'COMM VEH OVER TIME L':
        meter_exp.append(1)
    elif row == 'PKD OVER TIME LIMIT':
        meter_exp.append(1)
    elif row == 'EXCEED 72 HOURS':
        meter_exp.append(1)
    elif row == 'COMVEH RES/OV TM B-2':
        meter_exp.append(1)
    elif row == 'COMVEH RES/OV TM LMT':
        meter_exp.append(1)
    elif row == 'COMVEH RES/OV TM LMT ':
        meter_exp.append(1)
    else:
        meter_exp.append(0)
        
final_sdf['Meter Exp'] = meter_exp
final_sdf['Meter Exp']=final_sdf['Meter Exp'].astype('int8')

**5) Time features processing with a new variable added "Total Minutes"**

In [11]:
final_sdf['Time'] = pd.to_datetime(final_sdf['Issue time'], format= '%H%M', errors= "coerce").dt.time 
final_sdf['Hour'] = final_sdf['Time'].apply(lambda x: x.hour)
final_sdf['Minute'] = final_sdf['Time'].apply(lambda x: x.minute)
final_sdf['Hour'].fillna(8.0, inplace=True)
final_sdf['Minute'].fillna(5.0, inplace=True)
final_sdf['Total_Minutes'] = ((final_sdf['Hour']*60)+final_sdf['Minute']).astype('float32')

**6) This part is also from Breeze's notebook**

In [12]:
final_sdf['Fine amount'] = pd.to_numeric(final_sdf['Fine amount'], errors="coerce") 
final_sdf['Issue Date'] = pd.to_datetime(final_sdf['Issue Date']) 

#create day of the week
weekdays=('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
final_sdf['DayofWeek'] = final_sdf['Issue Date'].apply(lambda x : weekdays[x.weekday()])
final_sdf['Year'] = final_sdf['Issue Date'].apply(lambda x : x.year)
final_sdf['Month'] = final_sdf['Issue Date'].apply(lambda x : x.month)

**7) Add "Weekend" feature. Fill in NaN values with mode.**

In [13]:
final_sdf['Weekend'] = (np.where(np.logical_or(final_sdf['DayofWeek']=='Saturday',
                                              final_sdf['DayofWeek']=='Sunday'), 1, 0)).astype('int8')

In [14]:
final_sdf['Fine amount'].fillna(68.0, inplace=True)
final_sdf['State'].fillna('CA', inplace=True)

In [15]:
cols = ['Agency', 'Year', 'Month']
for c in cols:
    final_sdf[c]=final_sdf[c].astype('int8')
cols = ['Fine amount', 'Hour', 'Latitude', 'Longitude']
for c in cols:
    final_sdf[c]=final_sdf[c].astype('float32')

**8) Drop some features**

In [16]:
final_sdf.drop(['RP State Plate', 'Violation Description',
                'Issue time', 'Time', 'Minute'], axis=1, inplace=True)
print(final_sdf.shape)

(192383, 14)


**9) Feature Engineering for:** 

**a) Time/Day vs Location vs Citation**

**b) Location vs Citation**

**c) Time/Day vs Location**

In [17]:
# Time/Day Vs Location Vs Citation Analysis
final_sdf['Average Hourly Fine by Location'] = (final_sdf.groupby(['Hour', 'Location'])['Fine amount'].transform('mean')).astype('float32')
final_sdf['Min Hourly Fine by Location'] = (final_sdf.groupby(['Hour', 'Location'])['Fine amount'].transform('min')).astype('float32')
final_sdf['Average TMinutes Fine by Location'] = (final_sdf.groupby(['Total_Minutes', 'Location'])['Fine amount'].transform('mean')).astype('float32')
final_sdf['Min TMinutes Fine by Location'] = (final_sdf.groupby(['Total_Minutes', 'Location'])['Fine amount'].transform('min')).astype('float32')
final_sdf['AverageDay FineAmount by Location'] = (final_sdf.groupby(['DayofWeek','Location'])['Fine amount'].transform('mean')).astype('float32')
final_sdf['MinDay FineAmount by Location'] = (final_sdf.groupby(['DayofWeek','Location'])['Fine amount'].transform('min')).astype('float32')

# Location vs Citation Analysis
final_sdf['Average FineAmount by StateAgency'] = (final_sdf.groupby(['Agency', 'State'])['Fine amount'].transform('mean')).astype('float32')
final_sdf['Min FineAmount by StateAgency'] = (final_sdf.groupby(['Agency', 'State'])['Fine amount'].transform('min')).astype('float32')
final_sdf['Average FineAmount by Location'] = (final_sdf.groupby(['Location'])['Fine amount'].transform('mean')).astype('float32')
final_sdf['Min FineAmount by Location'] = (final_sdf.groupby(['Location'])['Fine amount'].transform('min')).astype('float32')

# Time/Day vs Location Analysis
final_sdf['Location by DayTMinutes'] = (final_sdf.groupby(['Total_Minutes','DayofWeek'])['Location'].transform('count')).astype('float32')
final_sdf['Location by DayHour'] = (final_sdf.groupby(['Hour','DayofWeek'])['Location'].transform('count')).astype('float32')
final_sdf['Location by MonthDayHour'] = (final_sdf.groupby(['Hour','DayofWeek', 'Month'])['Location'].transform('count')).astype('float32')
final_sdf['Location by StateHour'] = (final_sdf.groupby(['Hour','State'])['Location'].transform('count')).astype('float32')

final_sdf['State Agency'] = (final_sdf['State'].astype('str')+'_'+final_sdf['Agency'].astype('str')).astype('object')

**10) Convert into Numerical Values**

In [18]:
cols = [c for c in final_sdf.columns if final_sdf[c].dtypes=='object']
for c in cols:
    le = LabelEncoder()
    final_sdf[c] = (le.fit_transform(final_sdf[c])).astype('int8')

In [19]:
#cat_cols = []
#I did not include any categorical variables here; however, it depends on problem structures and one own's preferences.

**11) Sort values by Issue Date**

In [20]:
final_sdf.sort_values(by=['Issue Date'], inplace=True, ascending=True)

**12) Separate into train and validation sets.** 

**Model used: LGBMClassifier; Metrics: binary_logloss for binary classification**

In [21]:
y = pd.DataFrame(final_sdf['Meter Exp'])
X = final_sdf.drop(['Issue Date', 'Meter Exp'], axis=1)

**13) Slice Dataframe into train and validation sets based on ascending Issue Date**

In [22]:
X_train = X.iloc[:153913]
X_val = X.iloc[153913:]
y_train = y.iloc[:153913]
y_val = y.iloc[153913:]
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(153913, 27) (38470, 27) (153913, 1) (38470, 1)


In [23]:
# You can also split by randon train_test_split as well
#X_train, X_val, y_train, y_val = train_test_split(X, y, 
#            test_size = 0.2, random_state = 42, stratify = y)

In [24]:
lgb_clf = LGBMClassifier(n_estimators=2800,
                      learning_rate=0.01,
                      feature_fraction=0.2,
                      bagging_fraction=0.2,
                      min_data_in_leaf=13,
                      max_depth=-1,
                      num_leaves=20,
                      early_stopping_rounds=100,
                      bagging_freq=5,
                      random_state=42,
                     )

lgb_clf.fit(X_train, y_train,
      eval_set = [(X_train, y_train),(X_val, y_val.values)],
      eval_metric = 'binary_logloss', 
      early_stopping_rounds = 100,
      verbose = 100
    )
lgb_pred = lgb_clf.predict_proba(X_val)[:, -1]

Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.252939	valid_1's binary_logloss: 0.258299
[200]	training's binary_logloss: 0.163114	valid_1's binary_logloss: 0.178487
[300]	training's binary_logloss: 0.122455	valid_1's binary_logloss: 0.142738
[400]	training's binary_logloss: 0.101337	valid_1's binary_logloss: 0.124365
[500]	training's binary_logloss: 0.0897798	valid_1's binary_logloss: 0.114939
[600]	training's binary_logloss: 0.0797086	valid_1's binary_logloss: 0.106687
[700]	training's binary_logloss: 0.0739067	valid_1's binary_logloss: 0.102168
[800]	training's binary_logloss: 0.0693409	valid_1's binary_logloss: 0.0989626
[900]	training's binary_logloss: 0.0664882	valid_1's binary_logloss: 0.0970771
[1000]	training's binary_logloss: 0.0633594	valid_1's binary_logloss: 0.0949262
[1100]	training's binary_logloss: 0.0607454	valid_1's binary_logloss: 0.0934033
[1200]	training's binary_logloss: 0.0587462	valid_1's binary_logloss: 0.092119