In [None]:
%matplotlib inline
from __future__ import (
    print_function,
    division,
    unicode_literals
)

import itertools as it

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn as skl
import uncertainties as u
import uncertainties.unumpy as unp

# Apply the 'ggplot' style sheet to every figure drawn after this point.
matplotlib.style.use('ggplot')

In [None]:
# Read the '|'-separated training sample into `df`, then print all of its
# column names on one line as a quick overview of what is available.
df = pd.read_csv('data/train.txt', sep='|')
print(' '.join(name for name in df.columns))

In [None]:
# Prices
# For each of the three coupon slots add the natural log of the price and of
# the base price as new columns (logPrice<N>, logBasePrice<N>).
for num in (1, 2, 3):
    for source, target in (('price{}', 'logPrice{}'),
                           ('basePrice{}', 'logBasePrice{}')):
        df[target.format(num)] = np.log(df[source.format(num)])

In [None]:
# Datetime
# Parse both timestamp columns so that datetime arithmetic and the `.dt`
# accessor are available on them.
df['orderTime'] = pd.to_datetime(df['orderTime'])
df['couponsReceived'] = pd.to_datetime(df['couponsReceived'])

# Time between receiving the coupons and placing the order, stored as the raw
# int64 value of the timedelta (nanoseconds at the default [ns] resolution).
df['deltaT'] = (df['orderTime'] - df['couponsReceived']).astype('int64')
# NOTE(review): the log is only finite where deltaT > 0 -- this assumes every
# order was placed strictly after the coupons were received; confirm on data.
df['logDeltaT'] = np.log(df['deltaT'])

# Day of week (Monday=0 ... Sunday=6) of each timestamp.
df['orderTime_weekday'] = df['orderTime'].dt.dayofweek
df['couponsReceived_weekday'] = df['couponsReceived'].dt.dayofweek

# Minutes elapsed since midnight for each timestamp.
df['orderTime_minutes'] = df['orderTime'].dt.hour * 60 + df['orderTime'].dt.minute
df['couponsReceived_minutes'] = (df['couponsReceived'].dt.hour * 60
                                 + df['couponsReceived'].dt.minute)

# True when the order happened on the same calendar date the coupons were
# received.  Full dates are compared (time of day stripped by normalize());
# comparing only the day-of-year would also match equal days of different years.
df['sameDay'] = df['orderTime'].dt.normalize() == df['couponsReceived'].dt.normalize()

In [None]:
# Correlation plot include delta_t
# Select the numeric and boolean columns explicitly before correlating.
# Older pandas dropped every other column silently inside DataFrame.corr();
# newer releases raise on text/datetime columns instead.
corr = df.select_dtypes(include=[np.number, 'bool']).corr()

# Diverging colour map pinned to [-1, 1] so that zero correlation is white.
plt.imshow(corr, cmap='seismic', vmin=-1, vmax=1, interpolation='none')
plt.colorbar()

# One tick per column, labelled with the column name on both axes.
ticks = np.arange(len(corr.columns))
plt.xticks(ticks, corr.columns, rotation='vertical')
plt.yticks(ticks, corr.columns)
#plt.tight_layout()
#plt.savefig('corr.png', bbox_inches='tight')

# Render the figure explicitly; this also keeps the cell from echoing the
# return value of the last call, without overwriting IPython's `_` variable.
plt.show()

In [None]:
# Brands
# One-hot encode the brand of each coupon slot: for every slot N and every
# brand B seen anywhere in the data, column 'brandN_B' is 1 on the rows whose
# 'brandN' equals B and 0 elsewhere.
brand_columns = ['brand{}'.format(num) for num in (1, 2, 3)]

# Every distinct brand value appearing in any of the three slots.  Built with
# set.union instead of reduce(), which is not a builtin on Python 3.
brands = set().union(*(df[column] for column in brand_columns))

# Column-name suffixes exactly as str.format renders each brand value, sorted
# so the order of the generated columns is identical from run to run (plain
# set iteration order is not guaranteed to be).
brand_labels = sorted({'{}'.format(brand) for brand in brands})

for num in (1, 2, 3):
    # Render the slot's brands the same way as the labels, then fill each
    # indicator column with one vectorised comparison rather than writing
    # single cells row by row.
    observed = df['brand{}'.format(num)].map('{}'.format)
    for label in brand_labels:
        df['brand{}_{}'.format(num, label)] = (observed == label).astype(int)

In [None]:
# Variables to compare between used ("Sig") and unused ("Bkg") coupons.
# Names containing '{}' exist once per coupon slot and get the slot number
# substituted; the others are shared and are simply repeated in every column.
variables = ('deltaT', 'logDeltaT', 'orderTime_weekday', 'orderTime_minutes',
             'sameDay', 'premiumProduct{}', 'price{}', 'basePrice{}', 'reward{}',
             'logPrice{}', 'logBasePrice{}')

# Grid layout: one row per variable, one column per coupon slot.
ncols = 3
nrows = len(variables)

fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * 5, nrows * 4),
                         squeeze=False)
for num in (1, 2, 3):
    # Split the sample by whether coupon <num> was redeemed.
    crit = 'coupon{}Used'.format(num)
    sig = df[df[crit] == 1]
    bkg = df[df[crit] == 0]

    for i, var in enumerate(variables):
        # str.format leaves names without a '{}' placeholder unchanged.
        column = var.format(num)
        ax = axes[i, num - 1]
        # Reuse the signal binning for the background so both histograms are
        # directly comparable.  `density=True` normalises each to unit area
        # (it replaces the `normed` keyword removed from current matplotlib).
        bins = ax.hist(sig[column].values, alpha=0.5, density=True, label='Sig')[1]
        ax.hist(bkg[column].values, bins=bins, alpha=0.5, density=True, label='Bkg')
        ax.set_xlabel('{}'.format(column), ha='right', x=1)
        ax.set_ylabel('relative frequency', ha='right', y=1)
        # Show the 'Sig'/'Bkg' labels; without a legend they are never drawn.
        ax.legend()

# Lay out and save once, after every panel has been drawn.  The savefig keyword
# for trimming the surrounding whitespace is `bbox_inches`.
fig.tight_layout()
fig.savefig('sep.png', bbox_inches='tight')

In [None]:
# Attempt some simple MVA
# Feature columns handed to the classifiers; their order fixes the column
# order of X.
columns = [
    'deltaT',
    'logDeltaT',
    'price1',
    'price2',
    'price3',
    'basePrice1',
    'basePrice2',
    'basePrice3',
    'reward1',
    'reward2',
    'reward3',
    'premiumProduct1',
    'premiumProduct2',
    'premiumProduct3',
    'orderTime_minutes',
    'orderTime_weekday',
    'couponsReceived_minutes',
    'couponsReceived_weekday',
    'sameDay'
]
labels = ['coupon1Used']

# 'sameDay' is boolean while the other features are numeric; pandas resolves
# that mix to an object-dtype array, so cast to float explicitly to hand the
# estimators a plain numeric matrix.
X = df[columns].values.astype(float)
# 2-D (n_rows, 1) array holding the target of the first coupon only.
Y = df[labels].values

In [None]:
from sklearn.ensemble import (
    ExtraTreesClassifier,
    RandomForestClassifier
)
# `cross_val_score` lives in sklearn.model_selection; the old
# sklearn.cross_validation module only exists in legacy releases, so it is
# kept purely as a fallback.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Classifiers to evaluate, keyed by the name printed in the report.  The fixed
# random_state makes the reported scores reproducible between runs.
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=0)
}

def score2ufloat(score):
    """Collapse a set of scores into one value with an uncertainty.

    Parameters
    ----------
    score : object providing ``mean()`` and ``std()``
        Presumably the numpy array of per-fold scores returned by
        ``cross_val_score`` -- that is how it is called below.

    Returns
    -------
    uncertainties ufloat
        ``score.mean()`` as nominal value, ``score.std()`` as uncertainty.
    """
    return u.ufloat(score.mean(), score.std())

# Cross-validate every classifier once per coupon slot, on the shared feature
# matrix X, and print the score as a percentage with its uncertainty.
# dict.items() works on both Python 2 and 3 (iteritems() is Python 2 only).
for name, clf in classifiers.items():
    print(name)
    for num in (1, 2, 3):
        y = df['coupon{}Used'.format(num)].values
        score = score2ufloat(cross_val_score(clf, X, y, n_jobs=-1))
        # ':P' is the pretty-print format of the uncertainties package.
        print("\t{}: ({:P})%".format(num, score * 100))