In [88]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score

# End-to-end cell: load the ticket data, build the feature matrices for the
# train and test sets, then grid-search a gradient-boosting classifier on AUC.
start = time.time()

# --- Load raw data -----------------------------------------------------------
df = pd.read_csv('train.csv', encoding='ISO-8859-1', low_memory=False, index_col=['ticket_id'])
test_df = pd.read_csv('test.csv', encoding='ISO-8859-1', low_memory=False, index_col=['ticket_id'])
address = pd.read_csv('addresses.csv')
latlons = pd.read_csv('latlons.csv')
# Attach lat/lon to each address row; the result is indexed by address and
# keeps ticket_id as a regular column.
address = address.set_index('address').join(latlons.set_index('address'), how='left')
# Same table re-keyed by ticket_id so it can be joined onto the ticket frames.
ticket_geo = address.set_index('ticket_id')

# --- Data cleaning and first feature selection -------------------------------
sFeatures = ['agency_name',
 'inspector_name',
 'violator_name',
 'zip_code',
 'ticket_issued_date',
 'hearing_date',
 'violation_code',
 'violation_description',
 'disposition',
 'fine_amount',
 'judgment_amount',
 'compliance']

# Training rows with any missing value in the selected columns are discarded.
df = df[sFeatures].dropna(axis=0, how='any')
# Time between ticket issue and hearing, in seconds. total_seconds() avoids the
# platform-dependent timedelta -> int cast and yields NaN (not an error or a
# sentinel integer) when a date is missing. The unit is irrelevant downstream
# because every feature is min-max scaled.
df['date_delta'] = (pd.to_datetime(df['hearing_date'])
                    - pd.to_datetime(df['ticket_issued_date'])).dt.total_seconds()
# Drop the raw date columns *before* joining the coordinates (the original
# rebuilt xdf from df and silently discarded the drop).
xdf = df.drop(['ticket_issued_date', 'hearing_date'], axis=1).join(ticket_geo)

test_df['date_delta'] = (pd.to_datetime(test_df['hearing_date'])
                         - pd.to_datetime(test_df['ticket_issued_date'])).dt.total_seconds()
# Test rows are not filtered with dropna, so a missing date can leave NaN here;
# impute with the training median so the classifier never sees NaN.
test_df['date_delta'] = test_df['date_delta'].fillna(df['date_delta'].median())
testdf = test_df.join(ticket_geo)

# --- Select model features; extract the target value into y ------------------
xFeatures = ['agency_name', 'disposition', 'judgment_amount', 'lat', 'lon', 'date_delta', 'compliance']
y = xdf['compliance']
X = xdf[xFeatures].drop(['compliance'], axis=1)
tFeatures = xFeatures.copy()
tFeatures.remove('compliance')
Xt = testdf[tFeatures].copy()

# --- Label-encode the categorical agency_name and disposition columns --------
# Each encoder is fitted on the union of train and test categories so that
# transform() cannot fail on a label that appears in only one of the two sets.
le_agency = LabelEncoder().fit(pd.concat([X['agency_name'], Xt['agency_name']]))
le_disposition = LabelEncoder().fit(pd.concat([X['disposition'], Xt['disposition']]))

X['agency_name'] = le_agency.transform(X['agency_name'])
X['disposition'] = le_disposition.transform(X['disposition'])
Xt['agency_name'] = le_agency.transform(Xt['agency_name'])
Xt['disposition'] = le_disposition.transform(Xt['disposition'])

# --- Fill missing lat/lon values ---------------------------------------------
# Forward-fill from the previous row, then back-fill so a gap at the very top
# of the frame is covered too. Assigning the result back (instead of an
# in-place fill on an attribute-accessed column) guarantees the frame itself
# is updated.
for frame in (X, Xt):
    frame[['lat', 'lon']] = frame[['lat', 'lon']].ffill().bfill()

# --- Scale features to [0, 1] using the training range -----------------------
y_train = y
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X)
X_test = scaler.transform(Xt)

# --- Grid search for the best parameters -------------------------------------
# GridSearchCV clones the estimator for every candidate, so fitting gbdt up
# front would only cost time; the refitted winner is available afterwards as
# grid_clf_auc.best_estimator_. A fixed random_state keeps runs repeatable.
gbdt = GradientBoostingClassifier(random_state=0)
grid_values = {'n_estimators': [100, 200], 'max_depth': [4, 6, 8]}
grid_clf_auc = GridSearchCV(gbdt, param_grid=grid_values, scoring='roc_auc')
grid_clf_auc.fit(X_train, y_train)
print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
print('Grid best score (AUC): ', grid_clf_auc.best_score_)

end = time.time()
print('time spend: %.4f sec' %(end - start))

Grid best parameter (max. AUC):  {'max_depth': 4, 'n_estimators': 100}
Grid best score (AUC):  0.7860258872639522
time spend: 262.6252 sec


In [7]:
# Report (rows, columns) for each loaded table, one per line, in the order:
# train tickets, test tickets, address table, lat/lon table.
for frame in (df, test_df, address, latlons):
    print(frame.shape)

(250306, 33)
(61001, 26)
(311307, 3)
(121769, 3)


In [8]:
# Display the column labels of df.
# NOTE(review): the recorded output lists 33 columns, whereas cell In [88]
# narrows df to sFeatures plus date_delta -- this output appears to predate
# that cell; re-run after a kernel restart to confirm.
df.columns

Index(['agency_name', 'inspector_name', 'violator_name',
       'violation_street_number', 'violation_street_name',
       'violation_zip_code', 'mailing_address_str_number',
       'mailing_address_str_name', 'city', 'state', 'zip_code',
       'non_us_str_code', 'country', 'ticket_issued_date', 'hearing_date',
       'violation_code', 'violation_description', 'disposition', 'fine_amount',
       'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
       'clean_up_cost', 'judgment_amount', 'payment_amount', 'balance_due',
       'payment_date', 'payment_status', 'collection_status',
       'grafitti_status', 'compliance_detail', 'compliance'],
      dtype='object')

In [61]:
# Second grid search: hold n_estimators at 200 and vary only the tree depth,
# scoring each candidate by cross-validated ROC AUC.
# Relies on gbdt, X_train and y_train created by the training cell.
grid_values = {
    'n_estimators': [200],
    'max_depth': [4, 6, 8],
}
grid_clf_auc = GridSearchCV(estimator=gbdt, param_grid=grid_values, scoring='roc_auc')
grid_clf_auc.fit(X_train, y_train)

# Print the winning parameter combination followed by its mean AUC.
for label, value in (
    ('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_),
    ('Grid best score (AUC): ', grid_clf_auc.best_score_),
):
    print(label, value)


Grid best parameter (max. AUC):  {'n_estimators': 200, 'max_depth': 4}
Grid best score (AUC):  0.7912309160880067
