In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Put the local `datasci` checkout at the front of the import path so the
# `sci.*` imports below resolve. The location can be overridden through the
# DATASCI_HOME environment variable; the default is the original hard-coded
# path, so existing setups keep working unchanged.
sys.path.insert(0, os.environ.get("DATASCI_HOME", "/Users/user/git/datasci"))
from sci.metrics import rmse_score
from plots import *
import sci.features as scif
import sci.learn as scil
import sci.plots as scip


import pandas as pd

# Render floats with exactly four decimal places in displayed frames.
pd.options.display.float_format = '{:.4f}'.format
# Show up to 999 rows before pandas truncates the display.
pd.set_option('display.max_rows', 999)

import numpy as np

import seaborn as sns
# Snapshot of seaborn's current colour palette (not referenced by the cells
# visible in this part of the notebook).
color = sns.color_palette()
# Switch seaborn figures to the 'darkgrid' style.
sns.set_style('darkgrid')


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


## Load Data

In [4]:
# Raw Kaggle-style CSVs, indexed by passenger id.
df_train = pd.read_csv('input/train.csv').set_index("PassengerId")
df_test = pd.read_csv('input/test.csv').set_index("PassengerId")

# Target column and the remaining raw feature columns of the training frame.
y = df_train["Survived"]
X = df_train.drop('Survived', axis=1)

# One shape per line, in the order: y, df_train, X, df_test.
print(y.shape, df_train.shape, X.shape, df_test.shape, sep="\n")

(891,)
(891, 11)
(891, 10)
(418, 10)


In [23]:
# Per-column overview of the raw training frame; the rendered table lists
# non-null count, missing count and percent, dtype, number of unique values,
# sample values and the most frequent values.
scif.df_summary(df_train)

Unnamed: 0,count,missing,percent,dtype,types,uniq,values,freq
Cabin,204,687,77.1044,object,str,147,"A10, A14, A16, A19, A20, A23, A24, A26, ...","'B96 B98':4, 'G6':4, 'C23 C25 C27':4, 'C22..."
Age,714,177,19.8653,float64,float,88,"0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, ...","'24.0':30, '22.0':27, '18.0':26, '19.0':25..."
Embarked,889,2,0.2245,object,str,3,"C, Q, S","'S':644, 'C':168, 'Q':77..."
Survived,891,0,0.0,int64,int,2,"0, 1","'0':549, '1':342..."
Pclass,891,0,0.0,int64,int,3,"1, 2, 3","'3':491, '1':216, '2':184..."
Name,891,0,0.0,object,str,891,"Abbing, Mr. Anthony, ...","'Widener, Mr. Harry Elkins':1, 'Foreman, ..."
Sex,891,0,0.0,object,str,2,"female, male","'male':577, 'female':314..."
SibSp,891,0,0.0,int64,int,7,"0, 1, 2, 3, 4, 5, 8","'0':608, '1':209, '2':28, '4':18, '3':16, '8..."
Parch,891,0,0.0,int64,int,7,"0, 1, 2, 3, 4, 5, 6","'0':678, '1':118, '2':80, '5':5, '3':5, '4':..."
Ticket,891,0,0.0,object,str,681,"110152, 110413, 110465, 110564, 110813, ...","'1601':7, '347082':7, 'CA. 2343':7, 'CA 21..."


## Process

In [31]:
def get_deck(cabin):
    """Return the deck letter of a cabin entry, or NaN when there is none.

    The deck is the first character of the last cabin listed, e.g.
    ``"C85" -> "C"`` and ``"F G73" -> "G"``.

    Parameters
    ----------
    cabin : str or missing value
        Raw ``Cabin`` value; may hold several whitespace-separated cabins.

    Returns
    -------
    str or float
        One-character deck code, or ``np.nan`` when ``cabin`` is missing,
        empty, or contains only whitespace.
    """
    if pd.isnull(cabin):
        return np.nan
    # split() without an argument discards empty tokens, so an empty string
    # or trailing/doubled spaces can no longer leave an empty last token
    # that would make the [0] lookup raise IndexError.
    tokens = cabin.split()
    if not tokens:
        return np.nan
    return tokens[-1][0]

def MungeData(data):
    """Turn a raw passenger frame into an all-float feature frame.

    Steps, in order:

    1. Drop the ``Ticket`` and ``Name`` columns.
    2. Encode ``Sex`` as 1 for ``'male'`` and 0 for anything else
       (missing values included).
    3. Reduce ``Cabin`` to its deck letter with ``get_deck`` and hand it to
       ``scif.from_cat_to_numeric`` with the category list
       ``"NA,A,B,C,D,E,F,G,T"``.
    4. Hand ``Embarked`` to ``scif.from_cat_to_numeric`` with ``"NA,C,Q,S"``.
    5. Fill every remaining missing value with -1 and cast to float.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least ``Ticket``, ``Name``, ``Sex``,
        ``Cabin`` and ``Embarked``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and every column cast to float.

    Raises
    ------
    KeyError
        If ``Ticket`` or ``Name`` is absent (raised by ``drop``).
    """
    # Work on a private copy so the caller's frame is never touched.
    data = data.copy()
    data = data.drop(columns=['Ticket', 'Name'])

    # Sex: a single vectorised comparison. NaN compares unequal to 'male'
    # and therefore becomes 0, matching the previous fillna + two-step .loc
    # encoding without chained in-place writes or mixed-type assignment
    # into a string column.
    data["Sex"] = (data["Sex"] == 'male').astype(int)

    # Cabin: keep only the deck letter, then encode it.
    data["Cabin"] = data["Cabin"].apply(get_deck)
    # NOTE(review): from_cat_to_numeric is called for its in-place effect on
    # `data` (its return value is unused); presumably it maps each category
    # to its position in the comma-separated list -- confirm in sci.features.
    scif.from_cat_to_numeric(data, "Cabin", "NA,A,B,C,D,E,F,G,T", fillna="NA")

    # Embarked
    scif.from_cat_to_numeric(data, "Embarked", "NA,C,Q,S", fillna="NA")

    # Everything still missing (e.g. Age) is flagged with the sentinel -1.
    data = data.fillna(-1)

    return data.astype(float)

# Run the identical preprocessing over the training and the test frame.
df_train_proc = df_train.pipe(MungeData)
df_test_proc = df_test.pipe(MungeData)

In [32]:
# Same per-column overview for the processed training frame; the rendered
# table shows every column as float64 with no missing values.
scif.df_summary(df_train_proc)

Unnamed: 0,count,missing,percent,dtype,types,uniq,values,freq
Survived,891,0,0.0,float64,float,2,"0.0, 1.0","'0.0':549, '1.0':342..."
Pclass,891,0,0.0,float64,float,3,"1.0, 2.0, 3.0","'3.0':491, '1.0':216, '2.0':184..."
Sex,891,0,0.0,float64,float,2,"0.0, 1.0","'1.0':577, '0.0':314..."
Age,891,0,0.0,float64,float,89,"-1.0, 0.42, 0.67, 0.75, 0.83, 0.92, 1.0, ...","'-1.0':177, '24.0':30, '22.0':27, '18.0':2..."
SibSp,891,0,0.0,float64,float,7,"0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0","'0.0':608, '1.0':209, '2.0':28, '4.0':18, '..."
Parch,891,0,0.0,float64,float,7,"0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0","'0.0':678, '1.0':118, '2.0':80, '3.0':5, '5..."
Fare,891,0,0.0,float64,float,248,"0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, ...","'8.05':43, '13.0':42, '7.8958':38, '7.75':..."
Cabin,891,0,0.0,float64,float,9,"0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, ...","'0.0':687, '3.0':59, '2.0':47, '4.0':33, '5..."
Embarked,891,0,0.0,float64,float,4,"0.0, 1.0, 2.0, 3.0","'3.0':644, '1.0':168, '2.0':77, '0.0':2..."


## Save

In [26]:
# Separate the processed training frame into features and target; the
# processed test frame is used as the scoring set as-is.
X_train = df_train_proc.drop(columns="Survived")
y = df_train_proc["Survived"]
X_score = df_test_proc

In [27]:
import os

path = 'processed_min'
# Pure-Python equivalent of `mkdir -p`: creates the directory (and parents)
# and is a no-op when it already exists, without relying on a POSIX shell.
os.makedirs(path, exist_ok=True)

# NOTE(review): DataFrame/Series.to_msgpack was deprecated in pandas 0.25 and
# removed in pandas 1.0, so these calls need an old pandas. The format is kept
# because the files are presumably read back by scil.load_data(path), whose
# reader is not visible here -- migrate writer and reader together.
X_train.to_msgpack(f'{path}/X.msgpack')
y.to_msgpack(f'{path}/y.msgpack')
X_score.to_msgpack(f'{path}/X_score.msgpack')


## Quick Test

In [28]:
import sklearn.utils
from sklearn.linear_model import *


In [29]:
import xgboost
from sklearn.svm import SVC, LinearSVC, NuSVC

# Estimator under test. The commented-out lines are alternative models left
# in place for quick swapping.
# model = ElasticNetCV(l1_ratio=0.5)
model = xgboost.XGBClassifier()
# model = SVC(kernel="linear", C=0.025, probability=True)

# Set the module-level scoring attribute on sci.learn; the same value is also
# passed explicitly to score_cv below.
# sci.metric_global = roc_auc_score
scil.scoring_global = 'accuracy'

# Directory holding the processed data set (same literal as in the Save section).
path = 'processed_min'

# NOTE(review): load_data presumably reads the X / y files stored under
# `path`; the printed shapes are (891, 8) and (891,) -- confirm in sci.learn.
data = scil.load_data(path)
# Cross-validated score of `model`; cv=10 and verbose=2 are passed straight
# through to score_cv (presumably 10 folds -- the log reports 10 tasks).
cv_scores = scil.score_cv(data, model, scoring=scil.scoring_global, verbose=2, cv=10)

(891, 8) (891,)
CV for XGBClassifier ..


[Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  10 out of  10 | elapsed:    0.4s finished


Unnamed: 0,mean,std
XGBClassifier,0.816,0.0125


Can't plot with only a single or no samples
