# Data exploration

In [1]:
import os

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', 100)

import matplotlib
%matplotlib inline
from matplotlib import pylab as plt
import seaborn as sns

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, Imputer, OneHotEncoder, LabelEncoder

In [3]:
# Directory (relative to the notebook's working directory) that the CSV inputs
# are read from in the loading cells below.
DATA_DIR = 'data/'

In [18]:
# Read the training features, training targets and test features (in that
# order) and keep only the underlying NumPy arrays; the DataFrame headers are
# dropped by `.values`.
x_train, y_train, x_test = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name)).values
    for csv_name in ('X.train.csv', 'y.train.csv', 'X.test.csv')
)

In [19]:
# Per-column metadata table; its 'Column Type' field is used below to group
# the feature columns.
meta_data = pd.read_csv(os.path.join(DATA_DIR, 'MetaData.csv'))

In [20]:
# Row labels of the metadata table are used directly as column positions in
# the X matrices (assumes row i of MetaData.csv describes column i of X --
# TODO confirm).
numeric_indexes = meta_data.loc[meta_data['Column Type'] == 'Numeric'].index.values
ordered_category_indexes = meta_data.loc[meta_data['Column Type'] == 'Ordered Category'].index.values

# Columns typed 'Category' are split at a fixed offset: the first
# `genes_start_index` of them stay in the category group, the remainder form
# the genes group. NOTE(review): 330 looks dataset-specific -- verify against
# MetaData.csv.
genes_start_index = 330
category_indexes = meta_data.loc[meta_data['Column Type'] == 'Category'].index.values
category_indexes, genes_indexes = (
    category_indexes[:genes_start_index],
    category_indexes[genes_start_index:],
)

# Nested lookup: data[group]['train' | 'test'] holds that group's columns of
# the train / test matrix; the training targets sit alongside under 'y_train'.
data = {'y_train': y_train}
data.update(
    (group_name, {'train': x_train[:, group_columns],
                  'test': x_test[:, group_columns]})
    for group_name, group_columns in (
        ('numeric', numeric_indexes),
        ('category', category_indexes),
        ('genes', genes_indexes),
        ('ordered_category', ordered_category_indexes),
    )
)

In [21]:
# Numeric features: fit the imputer on the training split only, then apply the
# fitted imputer to the test split.
# NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn
# 0.22 (replaced by `sklearn.impute.SimpleImputer`); this cell needs an older
# scikit-learn to run as written.
imputer_strategy = 'most_frequent'
imputer = Imputer(strategy=imputer_strategy)
numeric = data['numeric']
numeric['train'] = imputer.fit_transform(numeric['train'])
numeric['test'] = imputer.transform(numeric['test'])

# Categorical-like groups: overwrite NaN entries in place with the sentinel
# value -999, for both splits.
for kind in ('category', 'genes', 'ordered_category'):
    for split_values in (data[kind]['train'], data[kind]['test']):
        split_values[np.isnan(split_values)] = -999

In [38]:
# Feature group to label-encode. No earlier cell assigns `feature_type`, so on
# a fresh kernel ("Restart & Run All") this cell used to stop with a NameError.
# 'category' is chosen because the cells that follow operate on
# data['category'] -- change it here if another group was intended.
feature_type = 'category'

# Stack train on top of test so that each column is encoded with a single
# label mapping shared by both splits.
train_len = data[feature_type]['train'].shape[0]
data_all = np.vstack([data[feature_type]['train'], data[feature_type]['test']])
for i in range(data[feature_type]['train'].shape[1]):
    # A fresh encoder per column: labels are only meaningful within a column.
    label_encoder = LabelEncoder()
    # The integer labels are written back into the stacked array, so they are
    # stored using that array's dtype.
    data_all[:, i] = label_encoder.fit_transform(data_all[:, i])

# Cut the stacked array back into its train and test parts.
data[feature_type]['train'] = data_all[:train_len, :]
data[feature_type]['test'] = data_all[train_len:, :]

In [41]:
# Inspect the dtype of the training array for the group selected by
# `feature_type`.
data[feature_type]['train'].dtype

dtype('float64')

In [None]:
# String-typed version of the categorical training block, used below to build
# text tokens.
d = data['category']['train'].astype('str')

In [None]:
# Wrap the string array in a DataFrame so it can be processed column by column
# and row by row in the next cell.
df = pd.DataFrame(d)

In [None]:
# Turn every value into a "<column position>=<value>" token so that equal
# values in different columns produce different tokens.
# Note: this rewrites `df` in place, so running the cell a second time adds
# the prefix again.
for col_i in range(df.shape[1]):
    column_prefix = '{}='.format(col_i)
    df.iloc[:, col_i] = column_prefix + df.iloc[:, col_i]

# One space-separated string of tokens per row.
texts = df.apply(' '.join, axis=1).values

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

In [None]:
# Hashing vectorizer configured for 1000 output features with binary
# (present / absent) values instead of counts.
hv = HashingVectorizer(n_features=1000, binary=True)

In [None]:
# Hash the per-row token strings built above into a feature matrix.
hashed = hv.fit_transform(texts)

In [None]:
from disease_ml.data import _hashing_trick

In [None]:
# Run the project's own hashing helper on the categorical train and test
# blocks with the same feature count as the vectorizer above.
# NOTE(review): presumably `a` is the hashed train block and `b` the hashed
# test block -- confirm against disease_ml.data._hashing_trick.
a, b = _hashing_trick(data['category']['train'], data['category']['test'], n_features=1000)

In [None]:
# Count how many entries of `a` compare equal to zero.
(a == 0).sum()

In [None]:
# Display `a`, the first value returned by `_hashing_trick` above.
a

In [12]:
# Load the `Prediction` column of two earlier submissions: the first path is
# the 'Logreg' run, the second the 'RF' run.
pred_1, pred_2 = (
    pd.read_csv(submission_path).Prediction.values
    for submission_path in (
        './submissions/Logreg[score=-0.22939][24-03-2017 18:30:44]/submission.csv',
        './submissions/RF[score=-0.23364][24-03-2017 19:23:44]/submission.csv',
    )
)

In [16]:
# Blend the two prediction vectors element-wise with fixed weights:
# 0.7 for pred_1 and 0.3 for pred_2.
predictions = 0.7*pred_1 + 0.3*pred_2

In [17]:
# Submission table: a running 0-based `Id` next to each blended prediction.
submission = pd.DataFrame({
    'Id': np.arange(predictions.shape[0]),
    'Prediction': predictions,
})

In [19]:
# Write the blended submission to the current directory, without the
# DataFrame index column.
submission.to_csv(os.path.join('.', 'blending_submission.csv'), index=False)