In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()
import hyperopt
import xgboost as xgb

In [2]:
train = pd.read_csv('data/train.csv', index_col=0)

In [3]:
import datetime as dt
import time

hdays = [dt.datetime(2014, 1, day) for day in range(1,9)]
hdays = hdays + [dt.datetime(2014, 2, 23), 
              dt.datetime(2014, 3, 8),
              dt.datetime(2014, 3, 9),
              dt.datetime(2014, 3, 10),
              dt.datetime(2014, 5, 1), 
              dt.datetime(2014, 5, 2),
              dt.datetime(2014, 5, 3),
              dt.datetime(2014, 5, 4),
              dt.datetime(2014, 5, 9),
              dt.datetime(2014, 5, 10),
              dt.datetime(2014, 7, 12),
              dt.datetime(2014, 7, 13),
              dt.datetime(2014, 7, 14),
              dt.datetime(2014, 7, 15),
              dt.datetime(2014, 11, 3),
              dt.datetime(2014, 11, 4)]

hdays = [hd.timetuple() for hd in hdays]

def check_hday(day, hday):
    if day.tm_mon == hday.tm_mon and day.tm_mday == hday.tm_mday:
        return True
    else:
        return False

def map_holidays(date):
    tdate = time.strptime(date[:-4], '%Y-%m-%d %H:%M:%S')
    hits = [check_hday(tdate, hday) for hday in hdays]
    if True in hits:
        return 1
    else:
        return 0 
        
zz = map(map_holidays, train['due'])

In [25]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer

vectorizer = DictVectorizer(sparse=False, dtype=np.bool)

x_col = [ 'dist','due', 'lat','lon','f_class','s_class','t_class',]
X_raw = train[x_col]
y = train['burned'].values

X_raw.loc[:, 'weekday'] = X_raw['due'].astype('datetime64').map(lambda x: x.weekday())

#data_dict = [ {'f_class':f,'s_class':s,'t_class':t}
#           for f,s,t in X_raw[['f_class','s_class','t_class']].values ]

Xwek = pd.get_dummies(X_raw.weekday).values

#Xcat = vectorizer.fit_transform(data_dict)

def has_class(train, class_name):
    return (train.f_class == class_name) | (train.s_class == class_name) | (train.t_class == class_name)
econom = has_class(X_raw, 'econom')
business = has_class(X_raw, 'business')
vip = has_class(X_raw, 'vip')

kmn = KMeans(n_clusters=100)
Xkf = kmn.fit_predict(train[['lat', 'lon']].values)

conc = econom.map(lambda x: str(x)) + business.map(lambda x: str(x)) + vip.map(lambda x: str(x))

Xcon = pd.get_dummies(conc).values

Xhol = np.array(zz).reshape(-1, 1)

Xhour = pd.get_dummies(X_raw['due'].astype('datetime64').map(lambda x: x.hour)).values

real_features = ["lat", "lon"]
Xreal = X_raw[real_features].values
Xdist = np.log(X_raw.dist+2).values
print(Xreal.shape, Xdist.shape, Xcon.shape)
Xfull = np.hstack((Xreal, Xdist.reshape(-1, 1), Xcon, Xwek, Xhol, Xhour, Xkf))

wft = ['w'+str(i) for i in range(7)]
conc_f = ['c'+str(i) for i in range(Xcon.shape[1])]
hor_f = ['h'+str(i) for i in range(Xhour.shape[1])]

X_new = pd.DataFrame(Xfull, columns=real_features+['dist']+conc_f+wft+['holiday']+hor_f)

((1793300, 2), (1793300,), (1793300, 8))


In [26]:
from sklearn.cross_validation import StratifiedKFold

In [27]:
mask = y.astype('int').astype('str') + X_raw.weekday.astype('str')

In [28]:
skf = StratifiedKFold(mask, n_folds=5, random_state=42)

In [29]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score,accuracy_score,precision_score,recall_score
from sklearn.metrics import roc_curve

In [30]:
for train_index, test_index in skf:
    X_train, X_test = X_new.iloc[train_index], X_new.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = RandomForestClassifier(n_estimators=250,n_jobs=-1)
    model.fit(X_train, y_train)
    
    preds = model.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test, preds))

0.585875083734


KeyboardInterrupt: 

In [31]:
import prettyprint as pp

In [39]:
test = pd.read_csv('data/test.csv', index_col=0)
t_col = [ 'dist','due', 'lat','lon','f_class','s_class','t_class',]
t_raw = test[t_col]
#y = train['burned'].values

t_raw.loc[:, 'weekday'] = t_raw['due'].astype('datetime64').map(lambda x: x.weekday())

#data_dict = [ {'f_class':f,'s_class':s,'t_class':t}
#           for f,s,t in t_raw[['f_class','s_class','t_class']].values ]

twek = pd.get_dummies(t_raw.weekday).values

#tcat = vectorizer.transform(data_dict)

def has_class(train, class_name):
    return (train.f_class == class_name) | (train.s_class == class_name) | (train.t_class == class_name)
econom = has_class(t_raw, 'econom')
business = has_class(t_raw, 'business')
vip = has_class(t_raw, 'vip')

conc = econom.map(lambda x: str(x)) + business.map(lambda x: str(x)) + vip.map(lambda x: str(x))

tcon = pd.get_dummies(conc).values

thol = np.array(map(map_holidays, t_raw['due'])).reshape(-1, 1)

thour = pd.get_dummies(t_raw['due'].astype('datetime64').map(lambda x: x.hour)).values

real_features = ["lat", "lon"]
treal = t_raw[real_features].values
tdist = np.log(t_raw.dist+2).values
tfull = np.hstack((treal, tdist.reshape(-1, 1), tcon, twek, thol, thour))

wft = ['w'+str(i) for i in range(7)]
conc_f = ['c'+str(i) for i in range(tcon.shape[1])]
hor_f = ['h'+str(i) for i in range(thour.shape[1])]

t_new = pd.DataFrame(tfull, columns=real_features+['dist']+conc_f+wft+['holiday']+hor_f)

dtest = xgb.DMatrix(t_new)

In [42]:
preds_all = np.zeros((test.shape[0], 5))
o = 0
for train_index, test_index in skf:
    X_train, X_test = X_new.iloc[train_index], X_new.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    dtrain = xgb.DMatrix(X_train, y_train)
    dval = xgb.DMatrix(X_test, y_test)

    watchlist = ((dtrain, 'train'), (dval, 'validation'))

    params = {
        'nthread': 12,
        'eval_metric': 'auc',
        'eta': .2,
        'max_depth': 5,
        'min_child_weight': 2,
        'subsample': .8,
        'colsample_bytree': .8,
        'objective': 'binary:logistic',
        'seed': 42, 
        'silent': 1
    }

    #pp.pp(params)

    gbm = xgb.train(params, dtrain, num_boost_round=5000, maximize=True,
          evals=watchlist, early_stopping_rounds=100, verbose_eval=True)

    preds = gbm.predict(dtest, ntree_limit=gbm.best_iteration)
    preds_all[:, o] = preds
    o += 1

KeyboardInterrupt: 

In [47]:
pres = preds_all[:, :3].mean(axis=1)

In [48]:
pres.shape

(743463,)

In [32]:
X_train, X_test = X_new.iloc[train_index], X_new.iloc[test_index]
y_train, y_test = y[train_index], y[test_index]

dtrain = xgb.DMatrix(X_train, y_train)
dval = xgb.DMatrix(X_test, y_test)

watchlist = ((dtrain, 'train'), (dval, 'validation'))

params = {
    'nthread': 12,
    'eval_metric': 'auc',
    'eta': .2,
    'max_depth': 5,
    'min_child_weight': 2,
    'subsample': .8,
    'colsample_bytree': .8,
    'objective': 'binary:logistic',
    'seed': 42, 
    'silent': 1
}

pp.pp(params)

gbm = xgb.train(params, dtrain, num_boost_round=5000, maximize=True,
      evals=watchlist, early_stopping_rounds=300, verbose_eval=True)

preds = gbm.predict(dval)

{
    "colsample_bytree": 0.8, 
    "eta": 0.2, 
    "eval_metric": "auc", 
    "max_depth": 5, 
    "min_child_weight": 2, 
    "nthread": 12, 
    "objective": "binary:logistic", 
    "seed": 42, 
    "silent": 1, 
    "subsample": 0.8
}


In [30]:
preds = gbm.predict(dval, ntree_limit=gbm.best_iteration)

In [31]:
roc_auc_score(y_test, preds)

0.66759808515858143

In [37]:
test = pd.read_csv('data/test.csv', index_col=0)
t_col = [ 'dist','due', 'lat','lon','f_class','s_class','t_class',]
t_raw = test[t_col]
#y = train['burned'].values

t_raw.loc[:, 'weekday'] = t_raw['due'].astype('datetime64').map(lambda x: x.weekday())

data_dict = [ {'f_class':f,'s_class':s,'t_class':t}
           for f,s,t in t_raw[['f_class','s_class','t_class']].values ]

twek = pd.get_dummies(t_raw.weekday).values

tcat = vectorizer.transform(data_dict)

real_features = ["lat", "lon"]
treal = t_raw[real_features].values
tdist = np.log(t_raw.dist+2).values
print(treal.shape, tdist.shape, tcat.shape)
tfull = np.hstack((treal, tdist.reshape(-1, 1), tcat, twek))

wft = ['w'+str(i) for i in range(7)]

t_new = pd.DataFrame(tfull, columns=real_features+['dist']+vectorizer.feature_names_+wft)

((743463, 2), (743463,), (743463, 12))


In [38]:
dtest = xgb.DMatrix(t_new)

In [39]:
preds = gbm.predict(dtest, ntree_limit=gbm.best_iteration)

In [42]:
res = pd.DataFrame(data=preds, index=test.index, columns=['Y_prob'])

In [43]:
res.to_csv('res0.csv')

In [18]:
def objective(space):
    res = []
    
    params = {
        'nthread': 12,
        'eval_metric': 'auc',
        'eta': .2,
        'max_depth': int(space['max_depth']),
        'min_child_weight': space['min_child_weight'],
        'subsample': space['subsample'],
        'colsample_bytree': space['colsample_bytree'],
        'objective': 'binary:logistic',
        'seed': 42, 
        'silent': 1
    }

    pp.pp(params)
        
    for train_index, test_index in skf:
        X_train, X_test = X_new.iloc[train_index], X_new.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        dtrain = xgb.DMatrix(X_train, y_train)
        dval = xgb.DMatrix(X_test, y_test)

        watchlist = ((dtrain, 'train'), (dval, 'validation'))

        

        gbm = xgb.train(params, dtrain, num_boost_round=5000, maximize=True,
              evals=watchlist, early_stopping_rounds=100, verbose_eval=True)

        preds = gbm.predict(dval, ntree_limit=gbm.best_iteration)
        
        res.append(roc_auc_score(y_test, preds))
        
    print(np.mean(res))

In [19]:
from hyperopt import hp, fmin, tpe

In [20]:
space = {'max_depth': hp.quniform('max_depth', 3, 10, 1),
         'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1),
         'subsample': hp.uniform('subsample', .5, 1.),
         'colsample_bytree': hp.uniform('colsample_bytree', .5, 1.)}

In [None]:
best = fmin(objective, space, algo=tpe.suggest, max_evals=50)

{
    "colsample_bytree": 0.7756573845414456, 
    "eta": 0.2, 
    "eval_metric": "auc", 
    "max_depth": 5, 
    "min_child_weight": 2.0, 
    "nthread": 12, 
    "objective": "binary:logistic", 
    "seed": 42, 
    "silent": 1, 
    "subsample": 0.8482345927989308
}


In [49]:
res = pd.DataFrame(data=pres, index=test.index, columns=['Y_prob'])

In [51]:
res.to_csv('res1.csv')