In [3]:
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion, make_union
from numpy import ravel
import numpy as np
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import gc

In [4]:
# Usage: Use the TransformerMixin classes below to create features and then use FeatureUnions
# to concatenate them together.

# Example transformer to take a data frame and then return some features via the transform()
# method.
class Regular(TransformerMixin):
    """Pass-through transformer around a DataFrame that is already in memory.

    ``transform`` hands back the wrapped frame and ignores its argument, so an
    existing DataFrame can sit next to other transformers in a feature union.
    """

    def __init__(self, df):
        # Frame returned verbatim by transform().
        self.df = df

    def fit(self, x, y=None):
        """Nothing is learned; return self so calls can be chained."""
        return self

    def transform(self, X):
        """Return the stored DataFrame; ``X`` is not used."""
        return self.df

# Loads a CSV and then returns a set of features.
class LoadCSV(TransformerMixin):
    """Transformer that reads an external CSV file and returns it as a DataFrame.

    Any extra keyword arguments given to the constructor are stored and
    forwarded to ``pd.read_csv`` when ``transform`` is called.
    """

    def __init__(self, filename, **kwargs):
        self.filename = filename
        self.kwargs = kwargs

    def fit(self, x, y=None):
        """Nothing is learned; return self so calls can be chained."""
        return self

    def transform(self, X):
        """Read ``self.filename`` and return the resulting DataFrame.

        ``X`` is not used.
        """
        # We assume the first CSV column is always the ID, so it becomes the
        # index by default. The default is merged into a copy of the stored
        # kwargs so that a caller-supplied ``index_col`` overrides it instead
        # of raising a duplicate-keyword TypeError, and self.kwargs is left
        # untouched between calls.
        read_kwargs = dict(self.kwargs)
        read_kwargs.setdefault('index_col', 0)
        return pd.read_csv(self.filename, **read_kwargs)

# Loads from a directory in chunks and then returns a dataframe.
def load_data(directory, files, cols, chunksize=50000):
    """Read selected columns from several CSV files and join them on "Id".

    Each file is read in chunks, the chunks are stacked into one frame per
    file, and the per-file frames are inner-merged on the "Id" column in the
    order the files are listed.

    Parameters
    ----------
    directory : str
        Prefix prepended to every file name by plain string concatenation,
        so it has to end with a path separator (e.g. ``'data/'``).
    files : sequence of str
        File names to load, in merge order.
    cols : sequence
        ``cols[k]`` is passed as ``usecols`` when reading ``files[k]``.
        Entries beyond ``len(files)`` are never read. When more than one
        file is loaded, every used entry must include "Id".
    chunksize : int, optional
        Number of rows per chunk handed to ``pd.read_csv`` (default 50000).

    Returns
    -------
    pandas.DataFrame or None
        The merged frame, or None when ``files`` is empty.

    Raises
    ------
    ValueError
        If a file yields no chunks at all.
    """
    # Huey's older loading method, should deprecate this I think
    df = None
    for file_idx, f in enumerate(files):
        print(f)

        # Distinct index names for files and chunks: the column selection
        # must always follow the file position, never the chunk counter.
        reader = pd.read_csv(directory + f,
                             usecols=cols[file_idx],
                             chunksize=chunksize,
                             low_memory=False)

        # Collect the chunks and concatenate once; concatenating inside the
        # loop re-copies the accumulated rows for every chunk.
        chunks = []
        for chunk_idx, chunk in enumerate(reader):
            if chunk_idx % 5 == 0:
                print('Processing chunk %d' % chunk_idx)
            chunks.append(chunk)

        if not chunks:
            raise ValueError('no rows read from %s' % (directory + f))

        subset = pd.concat(chunks)
        del chunks

        if df is None:
            df = subset
        else:
            # Default merge is an inner join, so only Ids present in every
            # file survive.
            df = pd.merge(df, subset, on="Id")
        del subset
        gc.collect()

    return df

In [5]:
# Load the training labels: only the Id and Response columns are read here.
labels_path = "data/train_numeric.csv"
base = pd.read_csv(labels_path, usecols=['Id', 'Response'])
# The labels as a one-column DataFrame.
target = base.loc[:, ['Response']]

In [8]:
# Files to load, in merge order. The column lists further down are matched
# to these files by position.
train_files = ['train_date.csv', 'train_numeric.csv']
test_files = ['test_date.csv', 'test_numeric.csv']

# Column selections shared by the train and test files.
_date_cols = ['Id',
              'L3_S30_D3496', 'L3_S30_D3506',
              'L3_S30_D3501', 'L3_S30_D3516',
              'L3_S30_D3511']
_numeric_cols = ['Id',
                 'L1_S24_F1846', 'L3_S32_F3850',
                 'L1_S24_F1695', 'L1_S24_F1632',
                 'L3_S33_F3855', 'L1_S24_F1604',
                 'L3_S29_F3407', 'L3_S33_F3865',
                 'L3_S38_F3952', 'L1_S24_F1723']
# Third entry has no matching file in the two-file lists above, so
# load_data never reads it.
_fail_cols = ['Id', 'Fail']

# Train and test differ only in the label column, which exists in the
# training numeric file alone. Each list is a fresh copy so the two
# structures stay independent of each other.
train_cols = [list(_date_cols), _numeric_cols + ['Response'], list(_fail_cols)]
test_cols = [list(_date_cols), list(_numeric_cols), list(_fail_cols)]

In [9]:
# Read the selected training columns and join the files on Id.
train_raw_features = load_data(directory='data/',
                               files=train_files,
                               cols=train_cols)
print(train_raw_features.shape)

train_date.csv
Processing chunk 0
Processing chunk 5
Processing chunk 10
Processing chunk 15
Processing chunk 20
train_numeric.csv
Processing chunk 0
Processing chunk 5
Processing chunk 10
Processing chunk 15
Processing chunk 20
(1183747, 17)


In [10]:
# Read the selected test columns and join the files on Id.
test_raw_features = load_data(directory='data/',
                              files=test_files,
                              cols=test_cols)
print(test_raw_features.shape)

test_date.csv
Processing chunk 0
Processing chunk 5
Processing chunk 10
Processing chunk 15
Processing chunk 20
test_numeric.csv
Processing chunk 0
Processing chunk 5
Processing chunk 10
Processing chunk 15
Processing chunk 20
(1183748, 16)


In [11]:
# Disabled alternative; train_mindate / test_mindate are not defined in the
# cells shown here.
#train_rawfeatures = train_mindate
#test_rawfeatures = test_mindate
# Display the column labels of the merged test frame.
test_raw_features.columns

Index([u'Id', u'L3_S30_D3496', u'L3_S30_D3501', u'L3_S30_D3506',
       u'L3_S30_D3511', u'L3_S30_D3516', u'L1_S24_F1604', u'L1_S24_F1632',
       u'L1_S24_F1695', u'L1_S24_F1723', u'L1_S24_F1846', u'L3_S29_F3407',
       u'L3_S32_F3850', u'L3_S33_F3855', u'L3_S33_F3865', u'L3_S38_F3952'],
      dtype='object')

In [12]:
# Remove Id and Response so that only feature columns remain.
# Index.difference is used for the selection; the test frame has no
# Response column, and a missing label is simply skipped.
non_feature_cols = ['Id', 'Response']

train_feature_names = train_raw_features.columns.difference(non_feature_cols)
train_raw_features = train_raw_features[train_feature_names]

test_feature_names = test_raw_features.columns.difference(non_feature_cols)
test_raw_features = test_raw_features[test_feature_names]

In [14]:
# Make FeatureUnions
# Load in each piece of data from 'data/' that you want; the union
# concatenates the pieces into one training matrix.
train_feature_sources = [
    # Counts the number of failures that happened between the first and
    # last datetime.
    LoadCSV('data/train_fail_date_score.csv'),
    # This is the so-called "magic" feature.
    LoadCSV('data/train_min_date.csv'),
    # Indicates whether part passed through S32, S33, or S34.
    LoadCSV('data/train_s32_s33_s34.csv'),
    # Raw columns loaded earlier in this notebook.
    Regular(train_raw_features),
    # Currently disabled:
    # LoadCSV('data/train_id_rates_max.csv'),
    # LoadCSV('data/train_id_rates_total.csv'),
]
# mindate's best columns, Huey's (is S32, S33, S34),
# nathan's cyclic thing, fail_date_score, useful date columns
features = make_union(*train_feature_sources)

# The transformers ignore the data they are given, so the argument is only
# a placeholder.
X = features.fit_transform(1)
y = base['Response']

In [18]:
# Test-set counterpart of the training feature union: same sources, read
# from the test_* files.
test_feature_sources = [
    LoadCSV('data/test_fail_date_score.csv'),
    LoadCSV('data/test_min_date.csv'),
    LoadCSV('data/test_s32_s33_s34.csv'),
    Regular(test_raw_features),
    # Currently disabled:
    # LoadCSV('data/test_id_rates_max.csv'),
    # LoadCSV('data/test_id_rates_total.csv'),
]
test = make_union(*test_feature_sources)

# The transformers ignore the data they are given, so the argument is only
# a placeholder.
X_test = test.fit_transform(1)
df_test = pd.DataFrame(X_test)

In [19]:
# Sanity check: train and test matrices must have the same number of
# columns. A mismatch is only reported, execution continues.
train_width = X.shape[1]
test_width = X_test.shape[1]
if not train_width == test_width:
    print('loaded in CSVs wrong')

# Earlier experiment, kept for reference: fit on all of X and threshold
# the probabilities at 0.05.
#clf = XGBClassifier(base_score=0.005, seed=24)
#clf.fit(X,y)
##original_preds = np.ones(y.shape[0])
#original_raw_preds = clf.predict_proba(X)
#original_preds = (clf.predict_proba(X)[:,1] > 0.05).astype(np.int8)
#newpreds = (clf.predict_proba(test)[:,1] > 0.05).astype(np.int8)

In [20]:
# 3-fold stratified cross-validation. `preds` collects the out-of-fold
# positive-class probability for every training row.
clf = XGBClassifier(max_depth=5, base_score=0.005, seed=37)
cv = StratifiedKFold(y, n_folds=3, random_state=37)
preds = np.ones(y.shape[0])
dfX = pd.DataFrame(X)

for fold_no, (train_idx, valid_idx) in enumerate(cv):
    # Fit on the in-fold rows, then score the held-out rows.
    clf.fit(dfX.loc[train_idx], y[train_idx])
    fold_proba = clf.predict_proba(dfX.loc[valid_idx])
    preds[valid_idx] = fold_proba[:, 1]
    fold_auc = roc_auc_score(y[valid_idx], preds[valid_idx])
    print("fold {}, ROC AUC: {:.3f}".format(fold_no, fold_auc))

# AUC over all out-of-fold predictions together.
print(roc_auc_score(y, preds))


fold 0, ROC AUC: 0.910
fold 1, ROC AUC: 0.902
fold 2, ROC AUC: 0.894
0.901577240223


In [21]:
# Pick the best threshold out-of-fold
# At this point `preds` holds the out-of-fold probabilities from the
# cross-validation cell. It is overwritten with the test predictions at the
# end of this cell, so re-run the cross-validation cell before re-running
# this one.
thresholds = np.linspace(0.01, 0.99, 50)
mcc = np.array([matthews_corrcoef(y, preds>thr) for thr in thresholds])
plt.plot(thresholds, mcc)
plt.xlabel('probability threshold')
plt.ylabel('out-of-fold MCC')
best_threshold = thresholds[mcc.argmax()]
print(mcc.max())

# After the CV loop `clf` is the model from the last fold, trained on only
# two thirds of the training rows. Refit on the full training set so the
# submission uses all the labelled data.
clf.fit(dfX, y)
preds = (clf.predict_proba(df_test)[:,1] > best_threshold).astype(np.int8)

0.387231789121


In [22]:
# Submit: put the thresholded test predictions into the sample submission
# frame and write it out gzip-compressed.
sample_path = "data/sample_submission.csv"
submission_path = "pipesubmission.csv.gz"

sub = pd.read_csv(sample_path, index_col=0)
sub["Response"] = preds
sub.to_csv(submission_path, compression="gzip")