In [59]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold as SK
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost
import catboost
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier, VotingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score

In [109]:
# Read the train and test CSVs, then stack them into a single frame (fresh
# 0..n-1 index, original column order kept) so that later imputation and
# feature engineering are applied to every row at once.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('files/titanic.csv', 'files/titanic_test.csv')
)
all_df = pd.concat((data_train, data_test), sort=False, ignore_index=True)

In [110]:
# Print, for every column of all_df, the share of rows whose value is missing.
n_rows = len(all_df)
for column, n_missing in all_df.isnull().sum().items():
    print('{:10s} {:.5f}'.format(column, n_missing / n_rows))

PassengerId 0.00000
Survived   0.31933
Pclass     0.00000
Name       0.00000
Sex        0.00000
Age        0.20092
SibSp      0.00000
Parch      0.00000
Ticket     0.00000
Fare       0.00076
Cabin      0.77464
Embarked   0.00153


### Fill NA
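We fill the missing `Fare` with a group mean, then predict missing `Age` values from `Sex`, `Pclass` and `Fare`. We also sum `SibSp` and `Parch` into a single `Rel` column and fill missing `Embarked` with the most common value, `S`.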

In [111]:
# Impute missing values of all_df in place: Fare from a group mean, Age from a
# small regression model, Embarked from a constant. Also derives the Rel column.

# Preprocessing for the age model: one-hot encode Pclass and Sex, standardise
# Fare. Columns not listed here are dropped by the ColumnTransformer, so the
# full frame (including Age itself) can be passed as X below.
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown="ignore"), ['Pclass', 'Sex']),
    ('scaling', StandardScaler(), ['Fare'])
])

pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_transformer),
    ('regression', Lasso(alpha=0.1))
])

# Fare feeds the age model, so it has to be complete first. Missing fares get
# the mean fare of third-class male passengers; Series.mean() already skips
# NaN, so no explicit not-null filter is needed.
third_class_male = (all_df.Pclass == 3) & (all_df.Sex == 'male')
all_df['Fare'] = all_df['Fare'].fillna(all_df.loc[third_class_male, 'Fare'].mean())

age_known = all_df.Age.notnull()
X_withAge = all_df[age_known]
y_withAge = all_df.loc[age_known, 'Age']
X_withoutAge = all_df[~age_known]

predictAge = pipeline.fit(X_withAge, y_withAge)

# Once the ages have been imputed (e.g. when this cell is run a second time)
# there is nothing left to predict, and scikit-learn raises on a zero-row
# input, so only predict when some ages are actually missing.
if len(X_withoutAge):
    age_predict = predictAge.predict(X_withoutAge)
    all_df.loc[~age_known, 'Age'] = age_predict
else:
    age_predict = np.empty(0)

# Rel is the sum of the SibSp and Parch columns.
all_df['Rel'] = all_df.SibSp + all_df.Parch
all_df.loc[all_df.Embarked.isnull(), 'Embarked'] = 'S'
all_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Rel
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [112]:
# Re-check the per-column share of missing values now that imputation has run.
total_rows = all_df.shape[0]
for name in all_df.columns:
    missing_share = all_df[name].isnull().sum() / total_rows
    print('{:10s} {:.5f}'.format(name, missing_share))

PassengerId 0.00000
Survived   0.31933
Pclass     0.00000
Name       0.00000
Sex        0.00000
Age        0.00000
SibSp      0.00000
Parch      0.00000
Ticket     0.00000
Fare       0.00000
Cabin      0.77464
Embarked   0.00000
Rel        0.00000


In [58]:
# Train one XGBoost pipeline per stratified CV fold, keep every fitted
# pipeline in `models`, and display the mean validation accuracy.
categorical = ['Sex', 'Embarked']
numeric_features = ['Pclass', 'Age', 'Fare', 'Rel']

# NOTE(review): X and y were not defined anywhere in the visible notebook, so
# this cell failed on a fresh kernel. They are built here from the rows of
# all_df that carry a Survived label -- confirm this matches the data that
# produced the recorded score.
labelled_rows = all_df[all_df.Survived.notnull()]
X = labelled_rows[categorical + numeric_features]
y = labelled_rows['Survived'].astype(int)

cv = SK(n_splits=10, shuffle=True, random_state=42)
acc = []
models = []
for train_idx, valid_idx in cv.split(X, y):
    # Build a fresh preprocessor for every fold. Pipeline.fit fits its steps
    # in place, so one shared ColumnTransformer would be refitted on each fold
    # and every stored model would end up using the last fold's statistics.
    column_transformer = ColumnTransformer([
        # The step is named 'ohe' but applies an ordinal (integer) encoding.
        ('ohe', OrdinalEncoder(), categorical),
        # Tree ensembles should not need scaled inputs; 'passthrough' in place
        # of StandardScaler() would leave these columns untouched.
        ('scaling', StandardScaler(), numeric_features)
    ])
    pipeline2 = Pipeline(steps=[
        ('preprocessor', column_transformer),
        ('regression', xgboost.XGBClassifier(max_depth=3, eval_metric='logloss', n_estimators=100, learning_rate=0.5e-1, use_label_encoder=False))
    ])
    model = pipeline2.fit(X.iloc[train_idx], y.iloc[train_idx])
    models.append(model)
    y_pred = model.predict(X.iloc[valid_idx])
    # accuracy_score expects (y_true, y_pred).
    acc.append(accuracy_score(y.iloc[valid_idx], y_pred))
np.mean(acc)

0.8237578027465668

In [52]:
# Majority-vote the per-fold models on the unlabelled rows and write the
# PassengerId / Survived submission file.

# NOTE(review): titanic_test was not defined anywhere in the visible notebook.
# The rows of all_df without a Survived label are used here because they carry
# the imputed Age/Fare/Embarked values and the Rel column that the models
# expect -- confirm this is the intended test set.
titanic_test = all_df[all_df.Survived.isnull()]

# One row of 0/1 predictions per model; summing over axis 0 counts the votes.
votes = np.sum(np.vstack([fold_model.predict(titanic_test) for fold_model in models]), axis=0)
# A passenger is predicted to survive only on a strict majority of the models
# (ties count as 0), which is the original "> 5" rule for ten models.
y_pred = (votes > len(models) / 2) * 1
answer = pd.DataFrame({'PassengerId': titanic_test.PassengerId, 'Survived': y_pred})
answer.to_csv('files/titanic_answ.csv', index=False)