In [267]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Location of the local `datasci` checkout that provides `plots` and `sci`.
# Set the DATASCI_PATH environment variable to point somewhere else; the
# default is the original hardcoded location so existing setups keep working.
sys.path.insert(0, os.environ.get("DATASCI_PATH", "/Users/user/git/datasci"))
from plots import *
import sci.features as scif
import sci.learn as scil
import sci.plots as scip


import pandas as pd
# Display settings: show floats with four decimal places and let frames of up
# to 999 rows print without truncation.
pd.set_option('display.float_format', '{:.4f}'.format)
pd.set_option('display.max_rows', 999)

import numpy as np

import seaborn as sns
# Default seaborn palette; `color` is not referenced by any cell shown in this notebook.
color = sns.color_palette()
# Use the dark-grid theme for all subsequent plots.
sns.set_style('darkgrid')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Data

In [268]:
# Raw data, indexed by passenger id straight from the CSV files.
df_train = pd.read_csv('input/train.csv', index_col="PassengerId")
df_test = pd.read_csv('input/test.csv', index_col="PassengerId")

# Split the training frame into target and features.
y = df_train["Survived"]
X = df_train.drop(columns='Survived')

# Report the shapes in the order: target, full train, train features, test.
for _shape in (y.shape, df_train.shape, X.shape, df_test.shape):
    print(_shape)

(891,)
(891, 11)
(891, 10)
(418, 10)


## Starts

In [269]:
# Normalize the column names of both raw frames. The return value is discarded,
# so this presumably renames the columns in place — TODO confirm in sci.features.
scif.normalize_feature_names(df_train, df_test)

## Drop outliers from train data

### Manual outlier handling

### Automatic outlier handling

## Clean Data Types

In [270]:
# Columns holding more than one data type (excluding NaN values), train then test.
# Both tables come back empty in the recorded output, i.e. no mixed-type columns.
display(scif.types_of_df(df_train, more_than_one=True))
display(scif.types_of_df(df_test, more_than_one=True))

Unnamed: 0,types,n_types


Unnamed: 0,types,n_types


## Missing Values
- drop columns with mostly missing values: these probably add very little value for generalization
- columns with few missing values: we can drop the affected samples or impute them. Dropping is only possible for the train set; we can't drop samples from the live scoring set
- we can drop samples with missing values, but that isn't a good option when we don't have many samples; they may also be special cases which we do want to model
- we can impute (fill) missing values; this often makes sense, but not always:
  - a missing value can represent a special "state", e.g. the value is missing because it is irrelevant for this sample. Filling in such a value would be incorrect.

In [271]:
# Per-column summary of the training columns that contain missing values.
scif.df_summary(df_train, missing_only=True)

Unnamed: 0,count,missing,percent,dtype,types,uniq,miss-idx,values,freq
Cabin,204,687,77.1044,object,str,147,"1, 3, 5","A10, A14, A16, A19, A20, A23, A24, A26, ...","'B96 B98':4, 'C23 C25 C27':4, 'G6':4, 'D':..."
Age,714,177,19.8653,float64,float,88,"6, 18, 20","0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, ...","'24.0':30, '22.0':27, '18.0':26, '19.0':25..."
Embarked,889,2,0.2245,object,str,3,"62, 830","C, Q, S","'S':644, 'C':168, 'Q':77..."


In [272]:
# Same summary for the scoring (test) set; note that Fare is missing here but not in train.
scif.df_summary(df_test, missing_only=True)

Unnamed: 0,count,missing,percent,dtype,types,uniq,miss-idx,values,freq
Cabin,91,327,78.2297,object,str,76,"892, 893, 894","A11, A18, A21, A29, A34, A9, B10, B11, ...","'B57 B59 B63 B66':3, 'C55 C57':2, 'C101':..."
Age,332,86,20.5742,float64,float,79,"902, 914, 921","0.17, 0.33, 0.75, 0.83, 0.92, 1.0, 2.0, ...","'24.0':17, '21.0':17, '22.0':16, '30.0':15..."
Fare,417,1,0.2392,float64,float,169,1044,"0.0, 3.1708, 6.4375, 6.4958, 6.95, 7.0, ...","'7.75':21, '26.0':19, '8.05':17, '13.0':17..."


In [273]:
# df = df_train
# sizes = df.groupby("Ticket").size()
# df_merge = df.merge(sizes.rename("group_size"), left_on="Ticket", right_index=True, how="outer", suffixes=('', ''))
# df_merge = df_merge.sort_index()
# scif.df_comp(df, df_merge.drop("group_size", axis=1), "orig", "merge")


In [274]:
def get_deck(cabin):
    """Return the deck letter of a cabin entry, or the string "None" if absent.

    The deck is the first character of the last whitespace-separated cabin
    token, e.g. "C85" -> "C" and "F G73" -> "G".

    Args:
        cabin: raw ``Cabin`` value; a string, or a missing value (NaN/None).

    Returns:
        A one-character deck code, or "None" when the value is missing or
        holds no cabin token at all (empty or whitespace-only string).
    """
    if pd.isnull(cabin):
        return "None"
    # split() without an explicit separator ignores leading, trailing and
    # repeated blanks, so an entry such as "B57 B59 " no longer produces an
    # empty last token (which raised IndexError on the [0] lookup).
    tokens = cabin.split()
    if not tokens:
        return "None"
    return tokens[-1][0]


In [286]:
def process(df):
    """Derive the first-stage feature set from a raw passenger frame.

    The input is not modified; a processed copy is returned with the same
    index and row order as ``df``.

    Added columns:
        group_size      number of rows sharing the row's ``Ticket``
        group_size2     number of rows sharing the row's ``Cabin`` (0 if no cabin)
        family_size     ``SibSp + Parch + 1``
        has_cabin       whether ``Cabin`` is present
        deck            deck letter from ``get_deck`` ("None" if no cabin)
        has_age         whether ``Age`` is present
        Title           'Mr.', 'Miss.', 'Mrs.', 'Master.' or 'Other.'
        Name_has_paren  whether ``Name`` contains "("
        Name_has_quote  whether ``Name`` contains '"'
        has_fare        whether ``Fare`` is present after zero fares are blanked

    Removed columns: ``Ticket``, ``Cabin``, ``Name``.

    Args:
        df: frame with at least the columns Ticket, Cabin, SibSp, Parch, Age,
            Name and Fare.

    Returns:
        A new DataFrame with the columns described above.
    """
    df = df.copy()

    # Group sizes: map each value's count back onto its rows. Unlike the
    # outer merge + sort_index this replaces, it cannot reorder or duplicate
    # rows, so the result stays aligned with the caller's frame.
    df["group_size"] = df["Ticket"].map(df["Ticket"].value_counts())
    # fill with 0, as people with no Cabin are logically in a group of size 0 for Cabins
    df["group_size2"] = df["Cabin"].map(df["Cabin"].value_counts()).fillna(0)
    df = df.drop(columns="Ticket")

    df["family_size"] = df["SibSp"] + df["Parch"] + 1

    df["has_cabin"] = df["Cabin"].notnull()
    df["deck"] = df["Cabin"].apply(get_deck)
    # TBD: CABIN WORK
    df = df.drop(columns="Cabin")

    df["has_age"] = df["Age"].notnull()

    # The title is the first word after "<family name>, "; anything outside the
    # four common titles is pooled into "Other.".
    common_titles = ['Mr.', 'Miss.', 'Mrs.', 'Master.']
    df["Title"] = df["Name"].apply(lambda x: x.split(", ")[1].split(" ")[0])
    df.loc[~df["Title"].isin(common_titles), "Title"] = "Other."
#     df["FamilyName"] = df.Name.apply(lambda x: x.split(", ")[0])
#     df["FirstName"] = df.Name.apply(lambda x: " ".join(x.split(", ")[1].split(" ")[1:]))
    df["Name_has_paren"] = df["Name"].str.contains('(', regex=False)
    df["Name_has_quote"] = df["Name"].str.contains('"', regex=False)
    df = df.drop(columns="Name")

    # Fare == 0 looks like a mistake in the data
    df.loc[df["Fare"] == 0, "Fare"] = np.nan
    # we add has_fare and then fill the blanked fares with MICE later on
    df["has_fare"] = df["Fare"].notnull()

    return df

# Build the engineered frames. process() works on a copy, so df_train and
# df_test keep their raw columns and this cell can be re-run safely.
df_train_proc = process(df_train)
df_test_proc = process(df_test)


In [293]:
# Sanity check of the extracted titles: number of passengers per Title, split by Sex.
# Needs the "Title" column, i.e. must run before the dummy-encoding cell below.
df_train_proc.pivot_table(index="Title", columns="Sex", aggfunc="size")

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master.,,40.0
Miss.,182.0,
Mr.,,517.0
Mrs.,125.0,
Other.,7.0,20.0


In [280]:
# Dummy-encode both frames together with the project's get_dummies helper
# (presumably so that train and test end up with the same dummy columns — TODO confirm).
# NOTE(review): the cell rebinds df_train_proc / df_test_proc to its own output,
# so running it twice feeds already-encoded frames back in; re-run process() first.
df_train_proc, df_test_proc = scif.get_dummies(df_train_proc, df_test_proc, target="Survived", dummy_na=False, drop_first=False, reintroduce_na=True)


In [281]:
def _impute_with_age_floor(frame):
    """Impute missing values in `frame` and replace negative imputed ages.

    Runs the project's iterative imputer with "Embarked" and "Sex" declared as
    one-hot columns, then sets every Age below zero to 0.1.
    """
    imputed = scif.impute_iterative(frame, one_hot_columns=["Embarked", "Sex"])
    negative_age = imputed["Age"] < 0
    # we assume baby if imputed age was below zero
    imputed.loc[negative_age, "Age"] = 0.1
    return imputed


# Train and test are imputed independently, each from its own rows only.
df_train_proc_imp = _impute_with_age_floor(df_train_proc)
df_test_proc_imp = _impute_with_age_floor(df_test_proc)


ValueError: attempt to get argmax of an empty sequence

In [278]:
def process2(df, age_bins=(0, 2, 6, 12, 18, 25, 35, 45, 120)):
    """Add the second-stage features that need imputed values.

    The input is not modified; a copy with the extra columns is returned
    (the same contract as ``process``).

    Added columns:
        age_group  string label of the right-closed age interval the row's
                   ``Age`` falls into, e.g. "(25, 35]"; ages outside the bins
                   become the string "nan"
        fare_norm  ``Fare`` divided by ``group_size``

    Args:
        df: frame with the columns Age, Fare and group_size.
        age_bins: increasing bin edges passed to ``pd.cut``; the default
            reproduces the original hard-coded age bands.

    Returns:
        A new DataFrame with ``age_group`` and ``fare_norm`` appended.
    """
    # Work on a copy so the caller's frame is not silently changed and the
    # calling cell can be re-run without stacking side effects.
    df = df.copy()

    df["age_group"] = pd.cut(df["Age"], list(age_bins)).astype(str)

    # div by group_size as all Fare is for all people with the same ticket
    df["fare_norm"] = df["Fare"] / df["group_size"]
#     df["fare_q"] = pd.qcut(df["fare_norm"], [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])    
#     df["fare_q"] = df["fare_q"].astype(str)

    return df

# Add the second-stage features to both imputed frames, then dummy-encode them
# (presumably to expand the new string column age_group — TODO confirm).
# NOTE(review): every line rebinds the frames it reads, so the cell is not
# idempotent; re-run the imputation cell before running it again.
df_train_proc_imp = process2(df_train_proc_imp)
df_test_proc_imp = process2(df_test_proc_imp)
df_train_proc_imp, df_test_proc_imp = scif.get_dummies(df_train_proc_imp, df_test_proc_imp, target="Survived", dummy_na=False, drop_first=False, reintroduce_na=True)


KeyError: "None of [Index(['Title'], dtype='object')] are in the [columns]"

### encoding string features:
- opt-1: pd.factorize() and sklearn.LabelEncoder(): convert a single string feature to a single int feature by mapping each label value to a number (and remember the mapping for later use)
- opt-2: pd.get_dummies() and sklearn.OneHotEncoder(): convert a single string feature to many bool features, one per label value
- note: get_dummies also supports drop_first, which helps reduce dimensionality and collinearity,
but to be 100% sure the dummy columns the model expects (train) match the ones it receives (test), we should use OneHotEncoder


## Main Processing

## Switch all to floats

In [None]:
# Guard: both frames should be free of missing values after imputation
# (going by the helper's name — TODO confirm what exactly it checks).
scif.assert_no_missing_values(df_train_proc_imp)
scif.assert_no_missing_values(df_test_proc_imp)

### check and reduce skewness

In [None]:
# scif.skewness_check(df_train_proc_imp).head(10)

In [None]:
# scif.skewness_fix(df_train_proc)

## Categorical Features
- text features with few values are classic categorical features
- text features with rich content are NOT
- numeric (integer) features with few values that represent categories can be treated as categorical, but can also be processed as numeric and let the model figure it out

What do we do with them?
- we can convert them to integers using LabelEncoder and let the model figure it out
- we can convert them to one-hot features using get_dummies or OneHotEncoder

TODO:  
Is the dummy-variable trap solved by using `drop_first=True`?  
https://www.algosome.com/articles/dummy-variable-trap-regression.html

## Drop Outliers

In [None]:
# from sklearn.linear_model import *

# X = df_train_proc_imp
# # X = df_train_proc_imp.drop('Survived', axis=1)
# y = df_train['Survived']
# model = RidgeCV()
# model.fit(X, y)
# # from sklearn.model_selection import cross_val_score
# # cross_val_score(model, X, y, scoring=rmse_score).mean()

# y_pred = pd.Series(model.predict(X), index=y.index)
# y_resid = y - y_pred
# resid_mean = y_resid.mean()
# resid_std  = y_resid.std()
# resid_z = (y_resid - resid_mean) / resid_std
# outliers_idx = y.index[np.abs(resid_z) >= 3]

In [None]:
# plt.scatter(y, y_pred)
# plt.scatter(y.loc[outliers_idx], y_pred.loc[outliers_idx])

In [None]:
# df_train_proc = df_train_proc.drop(outliers_idx)

In [None]:
# Normalize the column names again, this time on the processed frames, which now
# include the generated feature columns (presumably in place, since the result
# is discarded — TODO confirm).
scif.normalize_feature_names(df_train_proc_imp, df_test_proc_imp)

## Finished

In [None]:
# Compare the final train and test frames; "train"/"test" are the labels used in the report.
scif.df_comp(df_train_proc_imp, df_test_proc_imp, "train", "test")
# scif.df_comp(df_train_proc.drop(["SalePrice"], axis=1), df_test_proc, "train", "test")

## Split data

In [283]:
# X_train = df_train_proc.set_index("Id")
# The target is read from the raw training frame; the features are the imputed frames.
# NOTE(review): X_train and y only line up if the processing steps kept the row
# order of df_train — confirm. Also confirm that "Survived" is no longer among
# the X_train columns, otherwise the target leaks into the features.
y = df_train['Survived']
X_train = df_train_proc_imp
X_score = df_test_proc_imp

# Shapes of train features, target and scoring features.
print(X_train.shape, y.shape, X_score.shape)

(891, 5) (891,) (418, 5)


## Save

In [284]:
# Output directory for the processed data; also read by the quick-test cell below.
path = 'processed_my_'
# IPython shell escape: create the directory if it does not exist yet (POSIX shell only).
!mkdir -p $path
# NOTE(review): DataFrame/Series.to_msgpack was deprecated in pandas 0.25 and
# removed in pandas 1.0, so this cell only runs on older pandas. Changing the
# file format also means changing the reader behind scil.load_data — confirm first.
X_train.to_msgpack(f'{path}/X.msgpack')
y.to_msgpack(f'{path}/y.msgpack')
X_score.to_msgpack(f'{path}/X_score.msgpack')

# from sklearn.externals import joblib
# joblib.dump(y_tr, f'{path}/y_tr.joblib')


## Quick Test

In [None]:
import sklearn.utils
from sklearn.linear_model import *


In [285]:
import xgboost
from sklearn.svm import SVC, LinearSVC, NuSVC

# Candidate models; only the uncommented one is evaluated.
# model = ElasticNetCV(l1_ratio=0.5)
model = xgboost.XGBClassifier()
# model = SVC(kernel="linear", C=0.025, probability=True)

# Scoring metric, stored on the sci.learn module and passed to score_cv below.
# sci.metric_global = roc_auc_score
scil.scoring_global = 'accuracy'

# `path` still holds the value set in the Save cell, since the override here is commented out.
# path = 'processed_my'

# Reload the saved data from disk and run a 10-fold cross-validation on it.
data = scil.load_data(path)
cv_scores = scil.score_cv(data, model, scoring=scil.scoring_global, verbose=2, cv=10)

(891, 5) (891,)
CV for XGBClassifier ..


[Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers.


CV score: 0.785699 +/- 0.0160 SEM


[Parallel(n_jobs=11)]: Done  10 out of  10 | elapsed:    2.4s finished
