In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to the local `sci` / `plots`
# code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
import sys

# Directory that is put at the front of sys.path so the `sci` and `plots`
# imports below resolve. The original machine-specific location is kept as the
# default; set the DATASCI_HOME environment variable to run the notebook from
# another checkout without editing this cell.
DATASCI_HOME = os.environ.get("DATASCI_HOME", "/Users/user/git/datasci")
sys.path.insert(0, DATASCI_HOME)
from sci.metrics import rmse_score
from plots import *
import sci.features as scif
import sci.learn as scil
import sci.plots as scip


import pandas as pd
# Render floats with four decimal places and allow up to 999 rows to be
# displayed before pandas truncates the output.
pd.set_option('display.float_format', '{:.4f}'.format)
pd.set_option('display.max_rows', 999)

import numpy as np

import seaborn as sns
# Keep a handle on the current seaborn colour palette, then switch the global
# plot style to 'darkgrid'.
# NOTE(review): `color` is not referenced anywhere else in the visible cells;
# confirm it is still needed.
color = sns.color_palette()
sns.set_style('darkgrid')


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


## Load Data

In [31]:
# Raw data: read the train and test CSVs with pandas' default parsing options.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('input/train.csv', 'input/test.csv')
)

In [32]:
# Index both frames by PassengerId. The guards make this cell safe to re-run:
# once the column has been moved into the index, a second
# set_index("PassengerId") would raise KeyError.
if "PassengerId" in df_train.columns:
    df_train = df_train.set_index("PassengerId")
if "PassengerId" in df_test.columns:
    df_test = df_test.set_index("PassengerId")

# Split the training frame into the target series and the feature columns.
y = df_train["Survived"]
X = df_train.drop('Survived', axis=1)

# Shape check: X should have one column fewer than df_train, and the same
# number of columns as df_test (which has no "Survived" column).
print(y.shape)
print(df_train.shape)
print(X.shape)
print(df_test.shape)

(891,)
(891, 11)
(891, 10)
(418, 10)


In [33]:
# Display the per-column overview of the raw training frame; the rendered
# table reports count, missing, percent, dtype, unique values and most
# frequent values for each column.
scif.df_summary(df_train)

Unnamed: 0,count,missing,percent,dtype,types,uniq,values,freq
Cabin,204,687,77.1044,object,str,147,"A10, A14, A16, A19, A20, A23, A24, A26, ...","'C23 C25 C27':4, 'B96 B98':4, 'G6':4, 'C22..."
Age,714,177,19.8653,float64,float,88,"0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, ...","'24.0':30, '22.0':27, '18.0':26, '19.0':25..."
Embarked,889,2,0.2245,object,str,3,"C, Q, S","'S':644, 'C':168, 'Q':77..."
Survived,891,0,0.0,int64,int,2,"0, 1","'0':549, '1':342..."
Pclass,891,0,0.0,int64,int,3,"1, 2, 3","'3':491, '1':216, '2':184..."
Name,891,0,0.0,object,str,891,"Abbing, Mr. Anthony, ...","'Peter, Mrs. Catherine (Catherine Rizk)..."
Sex,891,0,0.0,object,str,2,"female, male","'male':577, 'female':314..."
SibSp,891,0,0.0,int64,int,7,"0, 1, 2, 3, 4, 5, 8","'0':608, '1':209, '2':28, '4':18, '3':16, '8..."
Parch,891,0,0.0,int64,int,7,"0, 1, 2, 3, 4, 5, 6","'0':678, '1':118, '2':80, '5':5, '3':5, '4':..."
Ticket,891,0,0.0,object,str,681,"110152, 110413, 110465, 110564, 110813, ...","'CA. 2343':7, '347082':7, '1601':7, '31012..."


## Process

In [61]:
# Run the project's automatic preprocessing on the train and test frames and
# keep both processed results. "Survived" is passed as the third argument --
# presumably the name of the target column; confirm against scif.auto_process.
# The recorded output shows the helper dropping Name, Ticket and Cabin as
# high-cardinality string columns.
# NOTE(review): the reported unique counts (e.g. 1307 for Name) exceed the 891
# training rows, which suggests train and test are processed together --
# confirm, and check for target/test leakage in any fitted transforms.
df_train_proc, df_test_proc = scif.auto_process(df_train, df_test, "Survived")

Dropping Name with 1307 unique str values
Dropping Ticket with 929 unique str values
Dropping Cabin with 186 unique str values


In [62]:
# Display the same per-column overview for the processed training frame, to
# check that no missing values remain and that every column is numeric.
scif.df_summary(df_train_proc)

Unnamed: 0,count,missing,percent,dtype,types,uniq,values,freq
Pclass,891,0,0.0,float64,float,3,"1.0, 2.0, 3.0","'3.0':491, '1.0':216, '2.0':184..."
Age,891,0,0.0,float64,float,186,"-7.636280494105499, -4.643542120291784, ...","'24.0':30, '22.0':27, '18.0':26, '28.0':25..."
SibSp,891,0,0.0,float64,float,7,"0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0","'0.0':608, '1.0':209, '2.0':28, '4.0':18, '..."
Parch,891,0,0.0,float64,float,7,"0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0","'0.0':678, '1.0':118, '2.0':80, '3.0':5, '5..."
Fare,891,0,0.0,float64,float,248,"0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, ...","'8.05':43, '13.0':42, '7.8958':38, '7.75':..."
Sex_male,891,0,0.0,float64,float,2,"0.0, 1.0","'1.0':577, '0.0':314..."
Sex_nan,891,0,0.0,float64,float,1,0.0,'0.0':891...
Embarked_Q,891,0,0.0,float64,float,2,"0.0, 1.0","'0.0':814, '1.0':77..."
Embarked_S,891,0,0.0,float64,float,2,"0.0, 1.0","'1.0':644, '0.0':247..."
Embarked_nan,891,0,0.0,float64,float,2,"0.0, 1.0","'0.0':889, '1.0':2..."


## Save

In [63]:
import os

# Output directory for the processed data; the Quick Test section passes this
# same `path` to scil.load_data.
path = 'processed_auto'

# Pure-Python equivalent of `mkdir -p`: creates the directory (and parents) if
# needed and is a no-op when it already exists. Unlike the `!mkdir` shell
# escape, it does not depend on a POSIX shell being available.
os.makedirs(path, exist_ok=True)

# Persist the processed features, the target and the processed scoring set.
# NOTE(review): DataFrame/Series.to_msgpack was deprecated in pandas 0.25 and
# removed in 1.0. The format is kept here because scil.load_data reads from
# this directory; migrate writer and reader together (e.g. to parquet).
df_train_proc.to_msgpack(f'{path}/X.msgpack')
y.to_msgpack(f'{path}/y.msgpack')
df_test_proc.to_msgpack(f'{path}/X_score.msgpack')

# from sklearn.externals import joblib
# joblib.dump(y_tr, f'{path}/y_tr.joblib')


## Quick Test

In [64]:
import sklearn.utils
from sklearn.linear_model import *
from sklearn.metrics import roc_auc_score, make_scorer

In [65]:
# Bind the top-level `sci` package explicitly. The earlier
# `import sci.features as scif` style imports bind only their aliases, so
# without this line the `sci.<attr> = ...` assignments below work only if
# `sci` happens to leak in from a star import or from earlier kernel state.
import sci
import xgboost
from sklearn.svm import SVC, LinearSVC, NuSVC

# Candidate estimators; exactly one stays active.
# model = ElasticNetCV(l1_ratio=0.5)
model = xgboost.XGBClassifier()
# model = SVC(kernel="linear", C=0.025, probability=True)

# Package-level defaults set as attributes on `sci`.
# NOTE(review): the metric is ROC AUC while the CV scoring string is
# 'accuracy' -- confirm that this mismatch is intentional.
sci.metric_global = roc_auc_score
sci.scoring_global = 'accuracy'

# Load what the Save section wrote under `path`, then run 10-fold
# cross-validation of the chosen model with the scoring configured above.
data = scil.load_data(path)
cv_scores = scil.score_cv(data, model, scoring=sci.scoring_global, verbose=2, cv=10)

(891, 10) (891,)
CV for XGBClassifier ..


[Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  10 out of  10 | elapsed:    2.5s finished


Unnamed: 0,mean,std
XGBClassifier,0.8273,0.0125


Can't plot with only a single or no samples
