In [3]:
import numpy as np
import pandas as pd
import time

In [5]:
start = time.time()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score

df = pd.read_csv('train.csv',encoding = 'ISO-8859-1', low_memory=False, index_col=['ticket_id'])
test_df = pd.read_csv('test.csv',encoding = 'ISO-8859-1', low_memory=False, index_col=['ticket_id'])
address =  pd.read_csv('addresses.csv')
latlons = pd.read_csv('latlons.csv')
address = address.set_index('address').join(latlons.set_index('address'), how='left')

# data cleaning and first SFeatures selection
sFeatures = ['agency_name',
 'inspector_name',
 'violator_name',
 'zip_code',
 'violation_street_name',
 'violation_zip_code',             
 'ticket_issued_date',
 'hearing_date',
 'violation_code',
 'violation_description',
 'disposition',
 'fine_amount',
 'judgment_amount',
 'compliance'] 

# df = df[sFeatures].dropna(axis=0, how='any')
xdf = df.join(address.set_index('ticket_id'))
testdf = test_df.join(address.set_index('ticket_id'))

# Select X_train features for train and extract target vaule to y
xFeatures = ['agency_name',
 'disposition',
 'judgment_amount',
 'lat','lon',            
 'violation_code',
 'compliance']

X = xdf[xFeatures].dropna(axis=0, how='any')
tFeatures = list(X.columns)
tFeatures.remove('compliance')
Xt = testdf[tFeatures].copy()
Xt.lat.fillna(method='pad', inplace=True)
Xt.lon.fillna(method='pad', inplace=True)

# LabelEncoder categorical angecy_name and disposition
le_agency = LabelEncoder().fit(np.array(X['agency_name'].tolist()))
le_disposition = LabelEncoder().fit(np.array(Xt['disposition'].tolist()))
X['agency_name'] = le_agency.transform(X['agency_name'])
X['disposition'] = le_disposition.transform(X['disposition'])
Xt['agency_name'] = le_agency.transform(Xt['agency_name'])
Xt['disposition'] = le_disposition.transform(Xt['disposition'])

# Label Encoder for violation_code
xv = pd.Series(X['violation_code'].unique())
vlist =  Xt['violation_code'].unique()
X = X[X['violation_code'].isin(xv[xv.isin(vlist)])]
le_violation_code = LabelEncoder().fit(np.array(Xt['violation_code'].tolist()))
X['violation_code'] = le_violation_code.transform(X['violation_code'])
Xt['violation_code'] = le_violation_code.transform(Xt['violation_code'])

# Create X_train, y_train, X_test
y_train = X['compliance']
X = X.drop(['compliance'],axis=1)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X)
X_test = scaler.transform(Xt)

# Gradient Boosting Classification
gbdt = GradientBoostingClassifier(n_estimators=100, max_depth=4).fit(X_train, y_train)
score = gbdt.score(X_train, y_train)
yp_X = gbdt.predict(X_train)
aucscore = roc_auc_score(y_train,yp_X)
yp_Xt = gbdt.predict_proba(X_test)
res = pd.Series(yp_Xt[:,1], index = Xt.index)
print('X_train score = ',score)
print('feature_importances = ', gbdt.feature_importances_)
print('roc_auc = ',aucscore)
print('yp_X', yp_X[0:10])
print('yp_Xt', yp_Xt[0:10])
print('res', res)

end = time.time()
print('time spend: %.4f sec' %(end - start))

X_train score =  0.9377395056540582
feature_importances =  [0.02371498 0.33119462 0.2622822  0.13846083 0.16672614 0.07762124]
roc_auc =  0.5766020050996976
yp_X [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
yp_Xt [[0.90532806 0.09467194]
 [0.98643976 0.01356024]
 [0.94415613 0.05584387]
 [0.9335586  0.0664414 ]
 [0.90874566 0.09125434]
 [0.9335586  0.0664414 ]
 [0.92756958 0.07243042]
 [0.22914921 0.77085079]
 [0.98150259 0.01849741]
 [0.9732478  0.0267522 ]]
res ticket_id
284932    0.094672
285362    0.013560
285361    0.055844
285338    0.066441
285346    0.091254
285345    0.066441
285347    0.072430
285342    0.770851
285530    0.018497
284989    0.026752
285344    0.070648
285343    0.022443
285340    0.022625
285341    0.071193
285349    0.091254
285348    0.066441
284991    0.026752
285532    0.025043
285406    0.020632
285001    0.038163
285006    0.026533
285405    0.013891
285337    0.019983
285496    0.057582
285497    0.053856
285378    0.013755
285589    0.021612
285585    0.048852
285