In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import zipfile
from tqdm import tqdm_notebook
from glob import glob

from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score



# Read data

In [4]:
# Labelled rows; the `train` flag records which split each row belongs to.
train = pd.read_csv('../data/ytrain.csv').assign(train=1)

# Rows read from the sample submission are flagged train = 0.
test = pd.read_csv('../data/SampleSubmission.csv').assign(train=0)

# Stack both splits into a single label table.
labels = pd.concat([train, test], axis=0)

In [8]:
# Summary statistics of the labelled training rows.
train.describe()

Unnamed: 0,Id,Attack,train
count,250.0,250.0,250.0
mean,245.732,0.368,1.0
std,146.991804,0.483229,0.0
min,0.0,0.0,1.0
25%,116.75,0.0,1.0
50%,242.5,0.0,1.0
75%,376.5,1.0,1.0
max,499.0,1.0,1.0


In [9]:
# Summary statistics of the rows read from SampleSubmission.csv (flagged train = 0).
test.describe()

Unnamed: 0,Id,Attack,train
count,250.0,250.0,250.0
mean,253.268,0.5,0.0
std,142.122254,0.0,0.0
min,1.0,0.5,0.0
25%,131.75,0.5,0.0
50%,253.0,0.5,0.0
75%,369.75,0.5,0.0
max,495.0,0.5,0.0


# Extract features

In [12]:
def calc_features(df):
    """Collapse one input table into a single feature vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.

    Returns
    -------
    pandas.Series
        The maximum of every column of ``df``, indexed by the column labels.
    """
    column_maxima = df.max()
    return column_maxima

# Feature vectors keyed by the integer Id parsed from each CSV file name.
# Collected under its own name so `features` is only ever a DataFrame.
feature_rows = {}

for archive in '../data/xtrain.zip', '../data/xtest.zip':
    with zipfile.ZipFile(archive) as zf:
        for name in tqdm_notebook(zf.namelist()):
            # Skip archive members that are not CSV files.
            if not name.endswith('.csv'):
                continue

            # '<path>/<Id>.csv' -> Id
            fname = name.split('/')[-1]
            Id = int(fname.split('.')[0])

            # Close each archive member as soon as it has been parsed.
            with zf.open(name) as member:
                df = pd.read_csv(member, header=None)
            feature_rows[Id] = calc_features(df)

# The dict of Series has one column per Id; transpose to one row per Id.
features = pd.DataFrame(feature_rows).T
features.index.name = 'Id'
features = features.reset_index()

# Left join: every row of `labels` is kept, with NaN features where no file matched its Id.
data = labels.merge(features, how='left', on='Id')

features[:1]





Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,...,46,47,48,49,50,51,52,53,54,55
0,0,116.14,0.27,3724.37,4512.69,9.41,32.84,48.08,2803.7,66.9,...,1.0,30.69,37.31,46.47,1.0,36.33,17.7,100.0,12.73,129.55


In [16]:
# First rows of the merged table: the label columns followed by the extracted features.
data.head()

Unnamed: 0,Id,Attack,train,0,1,2,3,4,5,6,...,46,47,48,49,50,51,52,53,54,55
0,0,0.0,1,116.14,0.27,3724.37,4512.69,9.41,32.84,48.08,...,1.0,30.69,37.31,46.47,1.0,36.33,17.7,100.0,12.73,129.55
1,2,1.0,1,116.14,0.21,5252.07,820.61,8.0,20.39,32.92,...,77.62,13.34,29.19,39.47,0.0,37.64,94.27,62.11,18.04,53.51
2,3,0.0,1,116.14,0.26,3740.21,4572.5,9.52,27.63,43.04,...,22.21,44.37,38.17,46.59,47.45,41.5,23.6,50.0,12.78,184.66
3,4,1.0,1,116.14,0.21,5250.12,793.52,10.03,21.62,32.83,...,77.62,35.09,29.22,39.57,0.0,38.97,94.02,62.11,18.04,102.94
4,5,0.0,1,116.14,0.24,5870.69,873.02,8.96,21.15,34.79,...,77.62,13.86,32.76,44.31,0.0,43.26,99.23,61.47,20.2,57.01


# Cross-validation

In [19]:
target = 'Attack'

# Every column of the feature table except the bookkeeping ones is a predictor.
non_feature_cols = ('Id', 'Attack', 'train')
xcols = [col for col in features if col not in non_feature_cols]

# Training rows only; missing feature values are replaced by 0.
is_train = data.train == 1
xtrain = data.loc[is_train, xcols].fillna(0)
ytrain = data.loc[is_train, target]

clf = RandomForestClassifier(random_state=0)

# ROC AUC per fold, using cross_val_score's default splitting.
scores = cross_val_score(clf, xtrain, ytrain, scoring='roc_auc')
scores.mean(), scores.std()

(0.74853112921433063, 0.02287918344824422)

# Predict on the test set

In [22]:
# Fit on all labelled rows, then score the rows that came from the sample submission.
is_train = data.train == 1
is_test = data.train == 0

xtrain, ytrain = data.loc[is_train, xcols].fillna(0), data.loc[is_train, target]
# ytest holds whatever Attack values SampleSubmission.csv shipped with; it is not used below.
xtest,  ytest  = data.loc[is_test, xcols].fillna(0), data.loc[is_test, target]

clf.fit(xtrain, ytrain)
# Column 1 of predict_proba is the probability of the second entry of clf.classes_
# (presumably Attack == 1, given 0/1 labels -- confirm against clf.classes_).
prediction = clf.predict_proba(xtest)[:, 1]

sample_submission = pd.read_csv('../data/SampleSubmission.csv')

# Predictions are written back by position, so the test rows of `data` must be in
# exactly the same Id order as the submission file; fail loudly instead of
# silently attaching scores to the wrong Ids.
assert sample_submission['Id'].tolist() == data.loc[is_test, 'Id'].tolist(), \
    'test rows of `data` are not aligned with SampleSubmission.csv'

sample_submission['Attack'] = prediction
sample_submission.to_csv('../result/baseline.csv', index=False)