In [5]:
import datetime
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from scipy.io import loadmat
from operator import itemgetter
import random
import os
import time
import glob
from tqdm import tqdm

In [6]:
# Seed both the stdlib and the NumPy global generators with the same fixed
# value so that code drawing from either one is repeatable between runs.
random.seed(777)
np.random.seed(777)

In [7]:
def create_feature_map(features):
    """Write a feature-map file named ``xgb.fmap`` in the working directory.

    One tab-separated line is written per feature: its position in
    ``features``, its name, and the literal type marker ``q``.  Any existing
    ``xgb.fmap`` is overwritten; an empty ``features`` yields an empty file.

    :param features: iterable of feature names, in model column order.
    """
    # The context manager guarantees the handle is closed even when a write
    # or the formatting of a feature name raises.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def get_importance(gbm, features):
    """Return ``gbm``'s feature scores as ``(name, score)`` pairs, highest first.

    The feature map is (re)written to ``xgb.fmap`` from ``features`` and then
    handed to ``gbm.get_fscore``; the resulting mapping is turned into a list
    of pairs sorted by descending score.
    """
    create_feature_map(features)
    scores = gbm.get_fscore(fmap='xgb.fmap')
    ranked = list(scores.items())
    ranked.sort(key=itemgetter(1), reverse=True)
    return ranked


def intersect(a, b):
    """Return the elements present in both ``a`` and ``b`` as a list.

    Duplicates are dropped and the order of the result is unspecified.
    """
    first = set(a)
    second = set(b)
    return list(first.intersection(second))


def print_features_importance(imp):
    """Print every importance entry as two lines.

    For each ``(name, score)`` entry in ``imp`` the score is printed as a
    ``# <score>`` comment line, followed by an ``output.remove('<name>')``
    line that can be pasted into the feature-selection code.
    """
    for entry in imp:
        name, score = entry[0], entry[1]
        print("# " + str(score))
        print("output.remove('" + name + "')")


def mat_to_pandas(path):
    """Load the ``dataStruct`` record of a .mat file into a DataFrame.

    Every field of ``dataStruct`` is unwrapped from its ``[0, 0]`` cell.  The
    ``data`` field supplies the frame's values and the first row of
    ``channelIndices`` supplies its column labels.
    """
    struct = loadmat(path)['dataStruct']
    fields = {}
    for name in struct.dtype.names:
        fields[name] = struct[name][0, 0]
    channel_labels = fields['channelIndices'][0]
    return pd.DataFrame(fields['data'], columns=channel_labels)


def _write_simple_csv(pattern, out_path, with_result):
    """Write one CSV row of per-channel means for each .mat file matching ``pattern``.

    :param pattern: glob pattern selecting the .mat files to summarise.
    :param out_path: path of the CSV file to (over)write.
    :param with_result: when true, the third underscore-separated part of the
        file name is parsed as an integer and written as a trailing
        ``result`` column.
    """
    files = sorted(glob.glob(pattern))
    # The header always declares 16 avg_* columns; rows carry one mean per
    # column of the loaded table, so files are expected to hold 16 channels.
    header = ["Id", "patient_id"]
    header.extend("avg_" + str(i) for i in range(16))
    header.append("file_size")
    if with_result:
        header.append("result")

    # ``with`` closes the output even if a file name fails to parse mid-run.
    with open(out_path, "w") as out:
        out.write(",".join(header) + "\n")
        for fl in tqdm(files):
            # File stem is <patient>_<index>[_<result>]; strip the ".mat".
            parts = os.path.basename(fl)[:-4].split("_")
            patient = int(parts[0])
            file_index = int(parts[1])
            result = int(parts[2]) if with_result else None
            new_id = patient * 100000 + file_index
            try:
                tables = mat_to_pandas(fl)
            except Exception as exc:
                # Best effort: an unreadable file is skipped, but it is
                # reported, and Ctrl-C / SystemExit are no longer swallowed.
                print('Skipping {}: {}'.format(fl, exc))
                continue
            row = [str(new_id), str(patient)]
            row.extend(str(tables[col].mean())
                       for col in sorted(tables.columns.values))
            row.append(str(os.path.getsize(fl)))
            if with_result:
                row.append(str(result))
            out.write(",".join(row) + "\n")


def create_simple_csv():
    """Build ``simple_train.csv`` and ``simple_test.csv`` from the raw .mat files.

    Train files are read from ``/datasets/kaggle/mls/train_*/*.mat`` and test
    files from ``/datasets/kaggle/mls/test_*/*.mat``.  Each output row holds
    a combined id (``patient * 100000 + index``), the patient id, the mean of
    every channel column, the file size in bytes and, for the train set only,
    the label parsed from the file name.  Files that cannot be loaded are
    skipped with a printed message.
    """
    # TRAIN
    print('Create train.csv...')
    _write_simple_csv("/datasets/kaggle/mls/train_*/*.mat",
                      "simple_train.csv", True)

    # TEST
    print('Create test.csv...')
    _write_simple_csv("/datasets/kaggle/mls/test_*/*.mat",
                      "simple_test.csv", False)


def run_single(train, test, features, target, random_state=1):
    """Train one XGBoost model on a single hold-out split and predict ``test``.

    The rows of ``train`` are split with a shuffled 5-fold ``KFold`` (derived
    from ``test_size = 0.2``); only the first fold is used, giving one
    train/validation split.  The booster is trained with early stopping on
    the validation AUC and then applied to ``test``.

    :param train: DataFrame holding the ``features`` columns and ``target``.
    :param test: DataFrame holding the ``features`` columns.
    :param features: list of column names used as model inputs.
    :param target: name of the label column in ``train``.
    :param random_state: seed used for both the fold shuffle and XGBoost.
    :return: ``(test_prediction, score)`` - the test predictions as a list
        and the ROC AUC measured on the validation split.
    """
    eta = 0.1
    max_depth = 5
    subsample = 0.92
    colsample_bytree = 0.9
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "tree_method": 'exact',
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 1000
    early_stopping_rounds = 50
    test_size = 0.2

    kf = KFold(len(train.index), n_folds=int(round(1/test_size, 0)), shuffle=True, random_state=random_state)
    # Only the first of the folds is used: a single hold-out split.
    train_index, test_index = list(kf)[0]
    print('Length of train: {}'.format(len(train_index)))
    print('Length of valid: {}'.format(len(test_index)))

    # ``.values`` replaces ``.as_matrix()``, which newer pandas releases no
    # longer provide; each frame is converted once instead of twice.
    feature_matrix = train[features].values
    target_vector = train[target].values
    X_train, X_valid = feature_matrix[train_index], feature_matrix[test_index]
    y_train, y_valid = target_vector[train_index], target_vector[test_index]

    # Formatted explicitly so the line reads the same under Python 2, where a
    # multi-argument print(...) would show a tuple.
    print('Length train: {}'.format(len(X_train)))
    print('Length valid: {}'.format(len(X_valid)))

    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)

    # The last watchlist entry ('eval') is the one early stopping follows.
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist,
                    early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Validating...")
    # Predict with the trees up to and including the best iteration only.
    check = gbm.predict(xgb.DMatrix(X_valid), ntree_limit=gbm.best_iteration+1)
    score = roc_auc_score(y_valid, check)
    # NOTE: despite the label, the printed value is the validation ROC AUC.
    print('Check error value: {:.6f}'.format(score))

    imp = get_importance(gbm, features)
    print('Importance array: {}'.format(imp))

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features].values), ntree_limit=gbm.best_iteration+1)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score


def create_submission(score, test, prediction):
    """Write the predictions to a timestamped submission CSV.

    The file is created in the working directory as
    ``submission_<score>_<YYYY-mm-dd-HH-MM>.csv`` with the header
    ``File,Class``.  Each value of ``test['Id']`` is split back into
    ``<patient>_<index>.mat`` (the id is ``patient * 100000 + index``) and
    paired with the prediction at the same position.

    :param score: validation score, embedded in the file name.
    :param test: DataFrame with an ``Id`` column.
    :param prediction: sequence of predictions, indexable by row position;
        an ``IndexError`` is raised if it is shorter than ``test['Id']``.
    """
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    # Formatted explicitly so the message is not shown as a tuple on Python 2.
    print('Writing submission: {}'.format(sub_file))
    # ``with`` closes the file even if a row fails part-way through.
    with open(sub_file, 'w') as f:
        f.write('File,Class\n')
        for row, file_id in enumerate(test['Id']):
            patient = file_id // 100000
            fid = file_id % 100000
            f.write('{}_{}.mat,{}\n'.format(patient, fid, prediction[row]))


def get_features(train, test):
    """Return the sorted column names shared by ``train`` and ``test``, minus ``Id``.

    Raises ``ValueError`` if ``Id`` is not a column of both frames.
    """
    train_columns = list(train.columns.values)
    test_columns = list(test.columns.values)
    shared = intersect(train_columns, test_columns)
    shared.remove('Id')
    # 'file_size' stays in the feature set; dropping it is left disabled.
    shared.sort()
    return shared


def read_test_train():
    """Load the two summary CSVs and work out the usable feature columns.

    Reads ``simple_train.csv`` and ``simple_test.csv`` from the working
    directory and returns ``(train, test, features)``, where ``features`` is
    the result of ``get_features`` on the two frames.
    """
    print("Load train.csv...")
    train_frame = pd.read_csv("simple_train.csv")
    print("Load test.csv...")
    test_frame = pd.read_csv("simple_test.csv")
    print("Process tables...")
    return train_frame, test_frame, get_features(train_frame, test_frame)

In [8]:
# Driver cell: load the summary CSVs, train a single model, write a submission.
print('XGBoost: {}'.format(xgb.__version__))
# Uncomment to (re)build simple_train.csv / simple_test.csv from the .mat files.
# create_simple_csv()
train, test, features = read_test_train()
# Formatted explicitly: a two-argument print(...) shows up as a tuple when the
# notebook runs under Python 2.
print('Length of train: {}'.format(len(train)))
print('Length of test: {}'.format(len(test)))
print('Features [{}]: {}'.format(len(features), sorted(features)))
test_prediction, score = run_single(train, test, features, 'result')
create_submission(score, test, test_prediction)

  0%|          | 2/6042 [00:00<09:04, 11.10it/s]

XGBoost: 0.6
Create train.csv...


100%|██████████| 6042/6042 [07:51<00:00, 12.83it/s]
  0%|          | 2/6126 [00:00<08:17, 12.31it/s]

Create test.csv...


100%|██████████| 6126/6126 [08:05<00:00, 12.61it/s]

Load train.csv...
Load test.csv...
Process tables...
('Length of train: ', 6042)
('Length of test: ', 6126)
Features [18]: ['avg_0', 'avg_1', 'avg_10', 'avg_11', 'avg_12', 'avg_13', 'avg_14', 'avg_15', 'avg_2', 'avg_3', 'avg_4', 'avg_5', 'avg_6', 'avg_7', 'avg_8', 'avg_9', 'file_size', 'patient_id']
XGBoost params. ETA: 0.1, MAX_DEPTH: 5, SUBSAMPLE: 0.92, COLSAMPLE_BY_TREE: 0.9
Length of train: 4833
Length of valid: 1209
('Length train:', 4833)
('Length valid:', 1209)
[0]	train-auc:0.699582	eval-auc:0.648061
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
[1]	train-auc:0.710219	eval-auc:0.661985
[2]	train-auc:0.720333	eval-auc:0.664607
[3]	train-auc:0.760721	eval-auc:0.689907
[4]	train-auc:0.764542	eval-auc:0.686532
[5]	train-auc:0.771819	eval-auc:0.692639
[6]	train-auc:0.790394	eval-auc:0.691865
[7]	train-auc:0.794618	eval-auc:0.692567
[8]	train-auc:0.80774	eval-auc:0.695791
[9]	train-auc:0.81




[16]	train-auc:0.857488	eval-auc:0.724337
[17]	train-auc:0.860313	eval-auc:0.724833
[18]	train-auc:0.864425	eval-auc:0.724499
[19]	train-auc:0.869881	eval-auc:0.726903
[20]	train-auc:0.874083	eval-auc:0.730995
[21]	train-auc:0.879538	eval-auc:0.733305
[22]	train-auc:0.884549	eval-auc:0.738607
[23]	train-auc:0.885576	eval-auc:0.740472
[24]	train-auc:0.888196	eval-auc:0.742508
[25]	train-auc:0.893557	eval-auc:0.743615
[26]	train-auc:0.898833	eval-auc:0.744445
[27]	train-auc:0.900213	eval-auc:0.744193
[28]	train-auc:0.900373	eval-auc:0.743363
[29]	train-auc:0.904425	eval-auc:0.742841
[30]	train-auc:0.908011	eval-auc:0.742148
[31]	train-auc:0.910449	eval-auc:0.742328
[32]	train-auc:0.913576	eval-auc:0.745446
[33]	train-auc:0.91465	eval-auc:0.745026
[34]	train-auc:0.917721	eval-auc:0.74616
[35]	train-auc:0.919991	eval-auc:0.745116
[36]	train-auc:0.920405	eval-auc:0.744364
[37]	train-auc:0.921932	eval-auc:0.74273
[38]	train-auc:0.923446	eval-auc:0.741858
[39]	train-auc:0.925498	eval-auc:0.74