In [2]:
import pandas as pd
import numpy as np
import datetime

In [3]:
# Load the four CSV inputs from the local data/ directory.
# Only the training file is read with an explicit ISO-8859-1 encoding;
# the other three use the pandas default.
train = pd.read_csv('data/train.csv', encoding="ISO-8859-1")
test, addresses, latlons = (
    pd.read_csv(csv_path)
    for csv_path in ('data/test.csv', 'data/addresses.csv', 'data/latlons.csv')
)

FileNotFoundError: File b'data/train.csv' does not exist

In [None]:
# Keep only tickets with a known outcome: rows whose 'compliance' is NULL are removed,
# which leaves a binary classification problem.
train = train.dropna(subset=['compliance'])

# Columns removed before modelling, grouped by reason:
#   - collection_status, compliance_detail, payment_amount, payment_date, payment_status,
#     balance_due: payment/outcome information, removed to avoid data leakage.
#   - violator_name: unlikely to provide much generalizable information.
#   - violation_street_number, violation_street_name, violation_zip_code: violation
#     location, replaced by latitude/longitude.
#   - fine_amount, admin_fee, state_fee: already rolled into judgment_amount
#     (late_fee, discount_amount and clean_up_cost are kept; clean_up_cost may go later).
#   - mailing_address_str_name, zip_code, non_us_str_code: mailing-address details.
#   - violation_description: should overlap with violation_code.
#   - city: one-hot encoding it was killing the kernel.
#   - grafitti_status: the entire column was NaN.
droplist = ['violator_name', 'violation_street_number', 'violation_street_name',
            'violation_zip_code', 'fine_amount', 'admin_fee', 'state_fee',
            'payment_amount', 'payment_date', 'payment_status', 'balance_due',
            'collection_status', 'compliance_detail',
            'mailing_address_str_name', 'zip_code', 'non_us_str_code',
            'violation_description', 'city', 'grafitti_status']

# Reassign rather than mutate in place, and tolerate columns that are already gone, so
# re-running this cell is a no-op instead of a KeyError.
# NOTE(review): errors='ignore' also hides a misspelled name in droplist - double-check
# the list against train.columns when editing it.
train = train.drop(columns=droplist, errors='ignore')

In [None]:
# Merge the address table with the lat/lon table on their shared 'address' key,
# then merge the result into the main ticket frame on 'ticket_id'.
addFull = pd.merge(addresses, latlons, on='address')

# Remove address as the info there is already covered by lat and lon.
X = pd.merge(train, addFull, on='ticket_id').drop(columns='address')

# One-hot encode categorical variables.
categoricalCols = ['agency_name', 'state', 'country', 'disposition',
                   'violation_code', 'inspector_name']

# Passing `columns=` makes pandas prefix every indicator column with the name of the
# feature it came from ('<column>_<value>'). Without the prefix, a value that occurs in
# two categorical features would yield duplicate column names, and the feature
# importances could not be traced back to their source column.
X = pd.get_dummies(X, columns=categoricalCols)

# New feature: seconds elapsed between ticket issue and hearing. Subtracting the parsed
# datetimes directly gives the same number as differencing two seconds-since-epoch
# values, without the intermediate conversion.
dateCols = ['ticket_issued_date', 'hearing_date']
issued, hearing = (pd.to_datetime(X[col]) for col in dateCols)
X['time_to_hearing'] = (hearing - issued).dt.total_seconds()
X = X.drop(columns=dateCols)

# Final dropna: discard every row that still has a missing value in any column
# (this includes rows whose time_to_hearing could not be computed).
X = X.dropna()

# Separate the target from the features. ticket_id is removed as well because it is
# likely not informative for future cases.
y = X['compliance']
X = X.drop(columns=['compliance', 'ticket_id'])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split the training data. Won't do this for the actual function, but it is useful
# here for evaluation.
# Both the split and the forest are randomized; a fixed seed keeps the scores printed
# below identical across kernel restarts.
RANDOM_STATE = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)
rfc = RandomForestClassifier(max_features=8, n_estimators=10,
                             random_state=RANDOM_STATE).fit(X_train, y_train)

# Actually looks surprisingly good!!
# NOTE(review): plain accuracy can look flattering when the target classes are
# imbalanced - confirm against per-class metrics before trusting it.
print('Training accuracy: {}'.format(rfc.score(X_train, y_train)))
print('Test accuracy: {}'.format(rfc.score(X_test, y_test)))

In [None]:
from sklearn.metrics import classification_report, roc_curve, auc

# Per-class precision / recall / F1 on the held-out split.
y_pred = rfc.predict(X_test)
report = classification_report(y_test, y_pred,
                               target_names=['non-compliant', 'compliant'])
print(report)

# Area under the ROC curve, scored with the second column of predict_proba.
positive_scores = rfc.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(np.asarray(y_test), positive_scores)
roc_auc = auc(fpr, tpr)
print('AUC: {}'.format(roc_auc))

In [1]:
# Rank the fitted forest's feature importances (labelled by training column name)
# from most to least important, then display the ten highest.
feature_importances = pd.DataFrame({'importance': rfc.feature_importances_},
                                   index=X_train.columns)
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
feature_importances.head(10)

NameError: name 'pd' is not defined