In [7]:
import numpy as np
import pandas as pd
import time

def blight_model():
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.metrics import roc_curve, auc
    from sklearn.metrics import roc_auc_score
    
    df = pd.read_csv('train.csv',encoding = 'ISO-8859-1', low_memory=False, index_col=['ticket_id'])
    test_df = pd.read_csv('test.csv',encoding = 'ISO-8859-1', low_memory=False, index_col=['ticket_id'])
    address =  pd.read_csv('addresses.csv')
    latlons = pd.read_csv('latlons.csv')
    address = address.set_index('address').join(latlons.set_index('address'), how='left')
    # data cleaning and first SFeatures selection
    sFeatures = ['agency_name',
     'inspector_name',
     'violator_name',
     'zip_code',
     'ticket_issued_date',
     'hearing_date',
     'violation_code',
     'violation_description',
     'disposition',
     'fine_amount',
     'judgment_amount',
     'compliance'] 
    df = df[sFeatures].dropna(axis=0, how='any')
    xdf = df.join(address.set_index('ticket_id'))
    testdf = test_df.join(address.set_index('ticket_id'))   
    # Select X_train features for train and extract target vaule to y
    xFeatures = ['agency_name', 'disposition','judgment_amount','lat','lon','compliance']
    y = xdf['compliance']
    X = pd.get_dummies(xdf[xFeatures].drop(['compliance'],axis=1))
    xFeatures_dummies = np.array(X.columns)
    # adjust test.csv x features
    # make the features order consistenance with X_test
    tFeatures = xFeatures.copy()
    tFeatures.remove('compliance')
    Xt = testdf[tFeatures].copy()
    Xt = pd.get_dummies(Xt)
    Xt = Xt.drop(['disposition_Responsible (Fine Waived) by Admis','disposition_Responsible - Compl/Adj by Default','disposition_Responsible - Compl/Adj by Determi','disposition_Responsible by Dismissal'], axis=1)
    Xt['agency_name_Health Department']=0
    Xt['agency_name_Neighborhood City Halls']=0
    Xt = Xt[xFeatures_dummies] 
    # Fill NA Lat Lon Values
    X.lat.fillna(method='pad', inplace=True)
    X.lon.fillna(method='pad', inplace=True)
    Xt.lat.fillna(method='pad', inplace=True)
    Xt.lon.fillna(method='pad', inplace=True)
    # X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    y_train = y
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.transform(Xt)    
    # KNN 
    k = 10
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    score = knn.score(X_train, y_train)
    yp_X = knn.predict(X_train)
    roc_auc = roc_auc_score(y_train, yp_X)
    fpr, tpr, _ = roc_curve(y_train,yp_X)
    aucscore = auc(fpr,tpr)
    yp_X = knn.predict_proba(X_train)
    yp_Xt = knn.predict_proba(X_test)
#     print('k = ',k)
#     print('X_train score = ',score)
#     print('roc_auc = ',roc_auc)
#     print('auc = ',aucscore)
#     print('yp_X', yp_X[0:10])
#     print('yp_Xt', yp_Xt[0:10])

    answer = pd.Series(list(zip(Xt.index,yp_Xt)))

    return answer

In [8]:
start = time.time()
bm = blight_model()
end = time.time()
print('time spend: %.4f sec' %(end - start))

k =  10
X_train score =  0.9342968294837339
roc_auc =  0.5652354534274778
auc =  0.5652354534274778
yp_X [[1.  0. ]
 [0.9 0.1]
 [1.  0. ]
 [1.  0. ]
 [0.6 0.4]
 [0.6 0.4]
 [0.9 0.1]
 [1.  0. ]
 [1.  0. ]
 [1.  0. ]]
yp_Xt [[0.8 0.2]
 [1.  0. ]
 [1.  0. ]
 [1.  0. ]
 [1.  0. ]
 [1.  0. ]
 [1.  0. ]
 [0.4 0.6]
 [0.9 0.1]
 [1.  0. ]]
time spend: 134.2207 sec


In [6]:
bm[0:10]

0                                 (284932, [0.9, 0.1])
1    (285362, [0.9666666666666667, 0.03333333333333...
2    (285361, [0.9666666666666667, 0.03333333333333...
3    (285338, [0.9666666666666667, 0.03333333333333...
4    (285346, [0.9666666666666667, 0.03333333333333...
5    (285345, [0.9666666666666667, 0.03333333333333...
6    (285347, [0.9333333333333333, 0.06666666666666...
7    (285342, [0.36666666666666664, 0.6333333333333...
8    (285530, [0.9333333333333333, 0.06666666666666...
9                                 (284989, [1.0, 0.0])
dtype: object

In [34]:
res = 'Data type Test: '
res += ['Failed: type(bm) should Series\n','Passed\n'][type(bm)==pd.Series]
res += 'Data shape Test: '
res += ['Failed: len(bm) should be 61001\n','Passed\n'][len(bm)==61001]
res += 'Data Values Test: '
res += ['Failed: all values should be in [0.,1.]\n','Passed\n'][all((bm<=1.) & (bm>=0.))]
res += 'Data Values type Test: '
res += ['Failed: bm.dtype should be float\n','Passed\n'][str(bm.dtype).count('float')>0]
res += 'Index type Test: '
res += ['Failed: type(bm.index) should be Int64Index\n','Passed\n'][type(bm.index)==pd.Int64Index]
res += 'Index values type Test: '
res += ['Failed: type(bm.index[0]) should be int64\n','Passed\n'][str(type(bm.index[0])).count("int64")>0]

res += 'Output index shape test:'
res += ['Failed, bm.index.shape should be (61001,)\n','Passed\n'][bm.index.shape==(61001,)]

res += 'Output index test: '
if bm.index.shape==(61001,):
    res +=['Failed\n','Passed\n'][all(pd.read_csv('test.csv',usecols=[0],index_col=0).sort_index().index.values==bm.sort_index().index.values)]
else:
    res+='Failed'
print(res)

TypeError: unorderable types: tuple() <= float()

In [27]:
res

'Data type Test: Passed\nData shape Test: Passed\nData Values Test: '