In [1]:
import numpy as np
import pandas as pd
train_set = pd.read_csv('train.csv', parse_dates=['Dates'])
test_set = pd.read_csv('test.csv', parse_dates=['Dates'])

In [2]:
from sklearn import preprocessing
le_category = preprocessing.LabelEncoder()
category = le_category.fit_transform(train_set.Category)

In [3]:
total_set = pd.concat([train_set, test_set])
total_set['DayOfWeekValue'] = 0
total_set.loc[total_set.DayOfWeek == 'Wednesday', 'DayOfWeekValue'] = 1
total_set.loc[total_set.DayOfWeek == 'Thursday', 'DayOfWeekValue'] = 2
total_set.loc[total_set.DayOfWeek == 'Friday', 'DayOfWeekValue'] = 3
total_set.loc[total_set.DayOfWeek == 'Saturday', 'DayOfWeekValue'] = 4
total_set.loc[total_set.DayOfWeek == 'Sunday', 'DayOfWeekValue'] = 5
total_set.loc[total_set.DayOfWeek == 'Monday', 'DayOfWeekValue'] = 6
total_set.loc[total_set.DayOfWeek == 'Tuesday', 'DayOfWeekValue'] = 7
print(total_set.DayOfWeek.value_counts())
total_set.DayOfWeekValue.value_counts()

Friday       268437
Wednesday    259610
Saturday     253848
Tuesday      251905
Thursday     251579
Monday       243810
Sunday       233122
Name: DayOfWeek, dtype: int64


3    268437
1    259610
4    253848
7    251905
2    251579
6    243810
5    233122
Name: DayOfWeekValue, dtype: int64

In [4]:
total_set['HourValue'] = total_set.Dates.dt.hour
total_set['MonthValue'] = total_set.Dates.dt.month
total_set['DayValue'] = total_set.Dates.dt.day
total_set['YearValue'] = total_set.Dates.dt.year

In [5]:
le_address = preprocessing.LabelEncoder()
total_set['AddressValue'] = le_address.fit_transform(total_set.Address)

In [6]:
district = pd.get_dummies(total_set.PdDistrict)
total_set = pd.concat([total_set, district], axis=1)

In [7]:
from sklearn import cross_validation
test_data = total_set[~np.isnan(total_set.Id)]
train_data, validation_data = cross_validation.train_test_split(
        total_set[np.isnan(total_set.Id)],
        train_size = 0.7,
        random_state = 3)

# Using Gradient Boosting

In [8]:
features = [
    'DayOfWeekValue', 
    'HourValue', 
    'MonthValue', 
    'DayValue', 
    'YearValue',
    'AddressValue',
    'X',
    'Y']
features+=district.columns.tolist()

In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.externals import joblib
clf = GradientBoostingClassifier(max_depth=4)
clf.fit(train_data[features], train_data['Category'])
predicted = np.array(clf.predict_proba(validation_data[features]))
log_loss(validation_data['Category'], predicted)
joblib.dump(clf, 'gradient-boosting-max-depth-4.pkl')

['gradient-boosting-max-depth-4.pkl',
 'gradient-boosting-max-depth-4.pkl_01.npy',
 'gradient-boosting-max-depth-4.pkl_02.npy',
 'gradient-boosting-max-depth-4.pkl_03.npy',
 'gradient-boosting-max-depth-4.pkl_04.npy']

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.externals import joblib
clf = GradientBoostingClassifier(max_depth=2)
clf.fit(train_data[features], train_data['Category'])
predicted = np.array(clf.predict_proba(validation_data[features]))
log_loss(validation_data['Category'], predicted)
joblib.dump(clf, 'gradient-boosting-max-depth-2.pkl')

['gradient-boosting-max-depth-2.pkl',
 'gradient-boosting-max-depth-2.pkl_01.npy',
 'gradient-boosting-max-depth-2.pkl_02.npy',
 'gradient-boosting-max-depth-2.pkl_03.npy',
 'gradient-boosting-max-depth-2.pkl_04.npy']

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.externals import joblib
clf = GradientBoostingClassifier(max_depth=1)
clf.fit(train_data[features], train_data['Category'])
predicted = np.array(clf.predict_proba(validation_data[features]))
log_loss(validation_data['Category'], predicted)
joblib.dump(clf, 'gradient-boosting-max-depth-1.pkl')

['gradient-boosting-max-depth-1.pkl',
 'gradient-boosting-max-depth-1.pkl_01.npy',
 'gradient-boosting-max-depth-1.pkl_02.npy',
 'gradient-boosting-max-depth-1.pkl_03.npy',
 'gradient-boosting-max-depth-1.pkl_04.npy']

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.externals import joblib
clf = GradientBoostingClassifier(max_depth=5)
clf.fit(train_data[features], train_data['Category'])
predicted = np.array(clf.predict_proba(validation_data[features]))
log_loss(validation_data['Category'], predicted)
joblib.dump(clf, 'gradient-boosting-max-depth-5.pkl')

['gradient-boosting-max-depth-5.pkl',
 'gradient-boosting-max-depth-5.pkl_01.npy',
 'gradient-boosting-max-depth-5.pkl_02.npy',
 'gradient-boosting-max-depth-5.pkl_03.npy',
 'gradient-boosting-max-depth-5.pkl_04.npy']

In [14]:
clf = joblib.load('gradient-boosting-max-depth-1.pkl')
predicted = np.array(clf.predict_proba(validation_data[features]))
print(log_loss(validation_data['Category'], predicted))
clf = joblib.load('gradient-boosting-max-depth-2.pkl')
predicted = np.array(clf.predict_proba(validation_data[features]))
print(log_loss(validation_data['Category'], predicted))
clf = joblib.load('gradient-boosting-max-depth-4.pkl')
predicted = np.array(clf.predict_proba(validation_data[features]))
print(log_loss(validation_data['Category'], predicted))
clf = joblib.load('gradient-boosting-max-depth-5.pkl')
predicted = np.array(clf.predict_proba(validation_data[features]))
print(log_loss(validation_data['Category'], predicted))

2.51761806714
2.46019434585
2.4261260487
2.44142320188


In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.externals import joblib
clf = GradientBoostingClassifier(max_depth=6)
clf.fit(train_data[features], train_data['Category'])
predicted = np.array(clf.predict_proba(validation_data[features]))
print(log_loss(validation_data['Category'], predicted))
joblib.dump(clf, 'gradient-boosting-max-depth-6.pkl')

2.47766810308


['gradient-boosting-max-depth-6.pkl',
 'gradient-boosting-max-depth-6.pkl_01.npy',
 'gradient-boosting-max-depth-6.pkl_02.npy',
 'gradient-boosting-max-depth-6.pkl_03.npy',
 'gradient-boosting-max-depth-6.pkl_04.npy']

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.externals import joblib
clf = GradientBoostingClassifier(max_depth=7)
clf.fit(train_data[features], train_data['Category'])
predicted = np.array(clf.predict_proba(validation_data[features]))
print(log_loss(validation_data['Category'], predicted))
joblib.dump(clf, 'gradient-boosting-max-depth-7.pkl')

2.48136943297


['gradient-boosting-max-depth-7.pkl',
 'gradient-boosting-max-depth-7.pkl_01.npy',
 'gradient-boosting-max-depth-7.pkl_02.npy',
 'gradient-boosting-max-depth-7.pkl_03.npy',
 'gradient-boosting-max-depth-7.pkl_04.npy']

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.externals import joblib
clf = GradientBoostingClassifier(max_depth=8)
clf.fit(train_data[features], train_data['Category'])
predicted = np.array(clf.predict_proba(validation_data[features]))
print(log_loss(validation_data['Category'], predicted))
joblib.dump(clf, 'gradient-boosting-max-depth-8.pkl')

2.53703437513


['gradient-boosting-max-depth-8.pkl',
 'gradient-boosting-max-depth-8.pkl_01.npy',
 'gradient-boosting-max-depth-8.pkl_02.npy',
 'gradient-boosting-max-depth-8.pkl_03.npy',
 'gradient-boosting-max-depth-8.pkl_04.npy']

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.externals import joblib
clf = GradientBoostingClassifier(max_depth=9)
clf.fit(train_data[features], train_data['Category'])
predicted = np.array(clf.predict_proba(validation_data[features]))
print(log_loss(validation_data['Category'], predicted))
joblib.dump(clf, 'gradient-boosting-max-depth-9.pkl')