In [50]:
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion, make_union
from numpy import ravel
import numpy as np
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [51]:
# contains all methods and classes for FeatureUnion
## generic read_csv

# example: pass-through transformer (prints the shape of its input and returns it unchanged)
class Regular(TransformerMixin):
    """Identity transformer: reports the shape of its input and passes it through."""

    def fit(self, x, y=None):
        """Nothing to learn; hand back this instance."""
        return self

    def transform(self, X):
        """Print ``X.shape`` and return ``X`` itself, unmodified."""
        shape = X.shape
        print(shape)
        return X

# transformer that ignores its input and loads a CSV file instead
class LoadCSV(TransformerMixin):
    """Transformer that ignores its input and returns the contents of a CSV file.

    Each instance reads one file, so several instances can be combined with
    ``make_union`` / ``FeatureUnion`` to assemble features from separate CSVs.

    Parameters
    ----------
    filename : str
        Path of the CSV file, handed to ``pd.read_csv``.
    **kwargs
        Extra keyword arguments forwarded to ``pd.read_csv`` (e.g. ``usecols``).
        ``index_col`` defaults to 0 and can be overridden here.
    """

    def __init__(self, filename, **kwargs):
        self.filename = filename
        self.kwargs = kwargs

    def fit(self, x, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Read ``self.filename`` and return it as a DataFrame; ``X`` is ignored."""
        # Copy so the stored kwargs are never mutated between calls.
        read_kwargs = dict(self.kwargs)
        # By default we assume the first CSV column is the ID and use it as the
        # index. Using setdefault (rather than passing index_col=0 next to
        # **kwargs) lets a caller override it without a
        # "multiple values for keyword argument" TypeError.
        read_kwargs.setdefault('index_col', 0)
        return pd.read_csv(self.filename, **read_kwargs)


In [52]:
# Load the training targets: only the 'Id' and 'Response' columns are read.
base = pd.read_csv("data/train_numeric.csv", usecols=['Id', 'Response'])
# Keep the labels as a single-column DataFrame (not a Series).
target = base[['Response']]

In [53]:
# Build the training feature union.
# Columns read from the numeric CSVs: 'Id' plus ten selected numeric features.
numeric_usecols = [
    'Id',
    'L1_S24_F1846',
    'L3_S32_F3850',
    'L1_S24_F1695',
    'L1_S24_F1632',
    'L3_S33_F3855',
    'L1_S24_F1604',
    'L3_S29_F3407',
    'L3_S33_F3865',
    'L3_S38_F3952',
    'L1_S24_F1723',
]

# One LoadCSV per source file; the union combines their outputs.
features = make_union(
    LoadCSV('data/train_fail_date_score.csv'),
    LoadCSV('data/train_min_date.csv'),
    LoadCSV('data/train_numeric.csv', usecols=numeric_usecols),
)
# Feature ideas: mindate's best columns, Huey's (is S32, S33, S34),
# nathan's cyclic thing, fail_date_score, useful date columns

# Labels come from the frame loaded earlier.
y = base.loc[:, 'Response']
# The argument is only a placeholder: LoadCSV.transform ignores its input.
X = features.fit_transform(1)

In [54]:
# Build the test matrix from the test-set counterparts of the training CSVs.
# The argument to fit_transform is only a placeholder: LoadCSV.transform
# ignores its input. `test` is bound directly to the resulting matrix so the
# name never refers to the union object.
test = make_union(LoadCSV('data/test_fail_date_score.csv'),
                  LoadCSV('data/test_min_date.csv'),
                  LoadCSV('data/test_numeric.csv', usecols=numeric_usecols)).fit_transform(1)

# Fail before training if train and test do not have the same number of
# columns; a printed warning would otherwise be followed by a full fit on
# data the test matrix cannot be scored against.
if X.shape[1] != test.shape[1]:
    raise ValueError('loaded in CSVs wrong: train has %d columns, test has %d'
                     % (X.shape[1], test.shape[1]))

# Predicted probability of class 1 above which a row is labelled 1.
PRED_THRESHOLD = 0.05

clf = XGBClassifier(base_score=0.005, seed=24)
clf.fit(X, y)
#original_preds = np.ones(y.shape[0])
original_raw_preds = clf.predict_proba(X)
# Reuse the probabilities computed above instead of scoring the whole
# training set a second time.
original_preds = (original_raw_preds[:, 1] > PRED_THRESHOLD).astype(np.int8)
newpreds = (clf.predict_proba(test)[:, 1] > PRED_THRESHOLD).astype(np.int8)

In [55]:
# Totals, in order: thresholded train predictions, true train labels,
# thresholded test predictions.
print(*(values.sum() for values in (original_preds, y, newpreds)))


36099 6879 36108


In [56]:
# Submit: take the sample submission (first column as index), replace its
# "Response" column with the test predictions, and write a gzipped CSV.
sub = pd.read_csv("data/sample_submission.csv", index_col=0).assign(Response=newpreds)
sub.to_csv("davesubmission.csv.gz", compression="gzip")

In [57]:
# individual import statements for every feature, comment these out if needed
#LoadCSV('data/train_fail_date_score.csv').fit_transform()
#union = FeatureUnion([('first', LoadCSV('data/train_fail_date_score.csv'))])
#                      #('second', LoadCSV('data/train_numeric.csv'))])



#pipe = Pipeline([('union', features),
#                 ('linear', XGBClassifier(base_score=0.005, seed=24))])
#pipe.fit(np.linspace(1,2,num=nrows),base['Response'])
#y = pipe.predict(target)


In [58]:
#features.transform(1)

In [59]:
# one featureunion for everything if possible
## always new code cell for testing pipelines