In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import KFold, cross_val_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet.
plt.style.use('ggplot')

In [80]:
# Fixed random seed, passed to the classifier and the cross-validation splitter below.
seed = 42

# Read the training and test sets from CSV files in the working directory.
df, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [40]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [81]:
# Label-encode the categorical columns. The encoder is always fitted on the
# training frame and then applied to the test frame, so both frames share the
# same integer codes for a given label.
le = LabelEncoder()
for col in ['DayOfWeek', 'PdDistrict']:
    df[col] = le.fit_transform(df[col])
    test[col] = le.transform(test[col])

# Test addresses that never occur in the training set cannot be encoded by an
# encoder fitted on train, so they are replaced with the most frequent training
# address. The value is computed from the data instead of being hard-coded.
# TODO: replace with the nearest known address by coordinates instead?
most_common_address = df['Address'].value_counts().idxmax()
unseen_address = ~test['Address'].isin(df['Address'])
test.loc[unseen_address, 'Address'] = most_common_address
df['Address'] = le.fit_transform(df['Address'])
test['Address'] = le.transform(test['Address'])

# Derive calendar features for the training frame from the timestamp column.
df['Dates'] = pd.to_datetime(df['Dates'], yearfirst=True)
df['Year'] = df['Dates'].dt.year
df['Month'] = df['Dates'].dt.month
df['Hour'] = df['Dates'].dt.hour


In [15]:
# Columns fed to the model: encoded categoricals, coordinates and date parts.
features = [
    'DayOfWeek', 'PdDistrict', 'X', 'Y',
    'Address', 'Year', 'Month', 'Hour',
]

# Gradient boosting with 20 trees; the seed keeps the fit reproducible.
gb_params = dict(n_estimators=20, random_state=seed)
clf = GradientBoostingClassifier(**gb_params)
clf.fit(df[features], df['Category'])


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=20,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)

In [17]:
# 5-fold shuffled cross-validation of the classifier on the training frame.
# NOTE(review): `KFold(n, n_folds=...)` and scoring='log_loss' belong to the
# legacy `sklearn.cross_validation` API imported at the top of this notebook.
# Migrating to `sklearn.model_selection` would need `KFold(n_splits=...)` and
# scoring='neg_log_loss' -- confirm against the installed scikit-learn version.
seed = 42
n_rows = len(df)
kf = KFold(n_rows, n_folds=5, shuffle=True, random_state=seed)
scores = cross_val_score(
    clf, df[features], df.Category,
    scoring='log_loss', cv=kf,
)

In [19]:
# Show the per-fold cross-validation scores (one value per fold).
scores

array([-2.52554   , -2.52789242, -2.52947081, -2.5291952 , -2.52713521])

In [22]:
import pickle

# Persist the fitted classifier to disk so it can be reloaded without refitting.
model_path = 'GB_trees20_rands42_.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(clf, model_file)


In [23]:
# Reload the pickled classifier into `clf1`.
# NOTE(review): unpickling executes arbitrary code; only load files you created yourself.
# NOTE(review): the cells visible below keep using `clf`, not `clf1`.
with open('GB_trees20_rands42_.pkl', 'rb') as model_file:
    clf1 = pickle.load(model_file)

In [82]:
# Give the test frame the same calendar features that were derived for training.
test_dates = pd.to_datetime(test['Dates'], yearfirst=True)
test['Dates'] = test_dates
test['Year'] = test_dates.dt.year
test['Month'] = test_dates.dt.month
test['Hour'] = test_dates.dt.hour
test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,Year,Month,Hour
0,0,2015-05-10 23:59:00,3,0,6420,-122.399588,37.735051,2015,5,23
1,1,2015-05-10 23:51:00,3,0,9760,-122.391523,37.732432,2015,5,23
2,2,2015-05-10 23:50:00,3,4,6349,-122.426002,37.792212,2015,5,23
3,3,2015-05-10 23:45:00,3,2,10656,-122.437394,37.721412,2015,5,23
4,4,2015-05-10 23:45:00,3,2,10656,-122.437394,37.721412,2015,5,23


In [83]:
# Predict class probabilities for every test row using the fitted classifier.
X_test = test[features]
pred = clf.predict_proba(X_test)

In [93]:
# Keep 7 decimal places of each predicted probability.
pred = pred.round(7)

In [94]:
# Wrap the probability matrix in a frame with one column per class label of the classifier.
submit = pd.DataFrame(pred, columns=clf.classes_)

In [95]:
# Inspect the rounded probabilities for the first two test rows.
pred[:2]

array([[ 0.0056594,  0.1191554,  0.003503 ,  0.0036812,  0.0413547,
         0.0051999,  0.0057724,  0.038444 ,  0.0065447,  0.0040444,
         0.0034383,  0.0037241,  0.007784 ,  0.0122786,  0.0034378,
         0.0063051,  0.1170335,  0.0044452,  0.0037741,  0.0441669,
         0.083758 ,  0.1131853,  0.0032677,  0.0060256,  0.0054406,
         0.038992 ,  0.0051812,  0.0183581,  0.0075385,  0.0034524,
         0.0088453,  0.0037077,  0.0382556,  0.0032491,  0.009476 ,
         0.069397 ,  0.087383 ,  0.0376781,  0.0170619],
       [ 0.0058602,  0.1157538,  0.003403 ,  0.0035962,  0.0360084,
         0.0050514,  0.0056076,  0.0482611,  0.0064602,  0.0039289,
         0.0033402,  0.0036711,  0.0075618,  0.0117283,  0.0035376,
         0.0061251,  0.1136924,  0.0045083,  0.0036663,  0.0429061,
         0.0762606,  0.1242278,  0.0031745,  0.0058536,  0.0050205,
         0.0413729,  0.0050333,  0.0189747,  0.0073232,  0.0033539,
         0.0085928,  0.0036019,  0.0378681,  0.0031573,  0.

In [107]:
# Attach the test-set Id column first, then write the submission file.
# In the original cell order the CSV was written before Id was added, so a
# top-to-bottom run produced a file without the Id column.
submit['Id'] = test.Id
submit.to_csv('gb20.csv', index=False)
