In [3]:
import numpy as np
import pandas as pd
import time

In [4]:
start = time.time()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score

df = pd.read_csv('train.csv',encoding = 'ISO-8859-1', low_memory=False, index_col=['ticket_id'])
test_df = pd.read_csv('test.csv',encoding = 'ISO-8859-1', low_memory=False, index_col=['ticket_id'])
address =  pd.read_csv('addresses.csv')
latlons = pd.read_csv('latlons.csv')
address = address.set_index('address').join(latlons.set_index('address'), how='left')

# data cleaning and first SFeatures selection
sFeatures = ['agency_name',
 'inspector_name',
 'violator_name',
 'zip_code',
 'violation_street_name',
 'violation_zip_code',             
 'ticket_issued_date',
 'hearing_date',
 'violation_code',
 'violation_description',
 'disposition',
 'fine_amount',
 'judgment_amount',
 'compliance'] 

# df = df[sFeatures].dropna(axis=0, how='any')
xdf = df.join(address.set_index('ticket_id'))
testdf = test_df.join(address.set_index('ticket_id'))

# Select X_train features for train and extract target vaule to y
xFeatures = ['agency_name',
 'disposition',
 'judgment_amount',
 'lat','lon',            
 'violation_code',
 'compliance']

X = xdf[xFeatures].dropna(axis=0, how='any')
tFeatures = list(X.columns)
tFeatures.remove('compliance')
Xt = testdf[tFeatures].copy()
Xt.lat.fillna(method='pad', inplace=True)
Xt.lon.fillna(method='pad', inplace=True)

# LabelEncoder categorical angecy_name and disposition
le_agency = LabelEncoder().fit(np.array(X['agency_name'].tolist()))
le_disposition = LabelEncoder().fit(np.array(Xt['disposition'].tolist()))
X['agency_name'] = le_agency.transform(X['agency_name'])
X['disposition'] = le_disposition.transform(X['disposition'])
Xt['agency_name'] = le_agency.transform(Xt['agency_name'])
Xt['disposition'] = le_disposition.transform(Xt['disposition'])

# Label Encoder for violation_code
xv = pd.Series(X['violation_code'].unique())
vlist =  Xt['violation_code'].unique()
X = X[X['violation_code'].isin(xv[xv.isin(vlist)])]
le_violation_code = LabelEncoder().fit(np.array(Xt['violation_code'].tolist()))
X['violation_code'] = le_violation_code.transform(X['violation_code'])
Xt['violation_code'] = le_violation_code.transform(Xt['violation_code'])

# Create X_train, y_train, X_test
y_train = X['compliance']
X = X.drop(['compliance'],axis=1)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X)
X_test = scaler.transform(Xt)

# Gradient Boosting Classification
gbdt = GradientBoostingClassifier(n_estimators=100, max_depth=6).fit(X_train, y_train)
score = gbdt.score(X_train, y_train)
yp_X = gbdt.predict(X_train)
aucscore = roc_auc_score(y_train,yp_X)
yp_Xt = gbdt.predict_proba(X_test)
res = pd.Series(yp_Xt[:,1], index = Xt.index)
print('X_train score = ',score)
print('feature_importances = ', gbdt.feature_importances_)
print('roc_auc = ',aucscore)
print('yp_X', yp_X[0:10])
print('yp_Xt', yp_Xt[0:10])
print('res', res)

end = time.time()
print('time spend: %.4f sec' %(end - start))

X_train score =  0.9400538123341884
feature_importances =  [0.01908131 0.20674616 0.20204926 0.2476169  0.26431922 0.06018715]
roc_auc =  0.593572284412829
yp_X [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
yp_Xt [[0.91696204 0.08303796]
 [0.98851492 0.01148508]
 [0.9491718  0.0508282 ]
 [0.93714042 0.06285958]
 [0.91543522 0.08456478]
 [0.93714042 0.06285958]
 [0.9233772  0.0766228 ]
 [0.13480573 0.86519427]
 [0.98582126 0.01417874]
 [0.97914331 0.02085669]]
res ticket_id
284932    0.083038
285362    0.011485
285361    0.050828
285338    0.062860
285346    0.084565
285345    0.062860
285347    0.076623
285342    0.865194
285530    0.014179
284989    0.020857
285344    0.069131
285343    0.015381
285340    0.016922
285341    0.071007
285349    0.084565
285348    0.062860
284991    0.020857
285532    0.012142
285406    0.016264
285001    0.020403
285006    0.014532
285405    0.012516
285337    0.018334
285496    0.056085
285497    0.049221
285378    0.012185
285589    0.019865
285585    0.048589
2855