In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    KBinsDiscretizer,
    MinMaxScaler,
    OneHotEncoder,
)

In [None]:
# Load the titanic table from disk and preview its first two rows.
df = pd.read_csv(filepath_or_buffer='input_data/train.csv')
df.head(n=2)

In [None]:
# Divide the data into a training part and a hold-out part.
# random_state pins the shuffle, so the split (and every score derived from
# it) is the same after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df[['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']],
    df[['Survived']],
    test_size=0.4685,
    random_state=42,
)

In [None]:
# Ordinal code for the deck letter that starts a cabin label.
_DECK_CODES = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}


def _deck_code(cabin):
    """Return the ordinal of a cabin's leading deck letter.

    Non-string values (e.g. NaN for a missing cabin) are passed through
    unchanged; empty strings and unknown letters become NaN so the column
    stays numeric and can be imputed later.
    """
    if not isinstance(cabin, str):
        return cabin
    return _DECK_CODES.get(cabin[:1], np.nan)


def _name_length(inpdf):
    """Return the string length of the first column of ``inpdf`` as an (n, 1) array."""
    return inpdf[inpdf.columns[0]].str.len().values.reshape(-1, 1)


def _with_derived_columns(frame):
    """Return a copy of ``frame`` with the row-wise ``family`` and ``deck`` columns added."""
    out = frame.copy()
    out['family'] = out['SibSp'] + out['Parch']
    out['deck'] = out['Cabin'].apply(_deck_code).astype(float)
    return out


def process_titanic_data(X_train, fit_on=None, random_state=0):
    """Turn a raw titanic frame into a numeric feature matrix.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame to transform. Must contain the columns ``Name``, ``Sex``,
        ``Age``, ``SibSp``, ``Parch``, ``Fare`` and ``Cabin``. It is not
        modified; all work happens on a copy.
    fit_on : pandas.DataFrame, optional
        Frame on which every imputer / scaler / encoder is fitted. With the
        default ``None`` the steps are fitted on ``X_train`` itself. Pass the
        training frame here when transforming a hold-out frame so that both
        are processed with the same fitted statistics.
    random_state : int or None, default 0
        Seed handed to both ``IterativeImputer`` instances (they use
        ``imputation_order='random'``). ``None`` leaves them unseeded.

    Returns
    -------
    The output of ``ColumnTransformer.transform`` with, in order: name
    length, min-max scaled fare, one-hot encoded family-size bins (3 bins
    requested), ``Age``, ``is_male`` and ``discrete_deck``.
    """
    target = _with_derived_columns(X_train)
    reference = target if fit_on is None else _with_derived_columns(fit_on)
    # Every step is fitted on ``reference`` and then applied to each frame, so
    # later steps see the same intermediate columns on both of them.
    frames = [target] if reference is target else [reference, target]

    # Fare - mean imputation
    fare_imputer = SimpleImputer(strategy='mean')
    fare_imputer.fit(reference[['Fare']])
    for frame in frames:
        frame['Fare'] = fare_imputer.transform(frame[['Fare']]).ravel()

    # Sex - One Hot
    try:
        one_hot = OneHotEncoder(sparse_output=False, drop='first')
    except TypeError:
        # scikit-learn < 1.2 only knows the argument under its old name.
        one_hot = OneHotEncoder(sparse=False, drop='first')
    one_hot.fit(reference[['Sex']])
    for frame in frames:
        frame['is_male'] = one_hot.transform(frame[['Sex']])[:, 0]

    # Age - impute from is_male
    age_imputer = IterativeImputer(
        initial_strategy='median',
        imputation_order='random',
        random_state=random_state,
    )
    age_imputer.fit(reference[['Age', 'is_male']])
    for frame in frames:
        # transform() returns one column per input feature; only the first
        # one (Age) belongs in the Age column.
        frame['Age'] = age_imputer.transform(frame[['Age', 'is_male']])[:, 0]

    # Deck - scale to [0, 1], then impute the missing decks from Fare
    deck_scaler = MinMaxScaler()
    deck_scaler.fit(reference[['deck']])
    for frame in frames:
        frame['discrete_deck'] = deck_scaler.transform(frame[['deck']]).ravel()
    deck_imputer = IterativeImputer(
        initial_strategy='median',
        imputation_order='random',
        random_state=random_state,
    )
    deck_imputer.fit(reference[['discrete_deck', 'Fare']])
    for frame in frames:
        # Same as for Age: keep only the imputed discrete_deck column.
        frame['discrete_deck'] = deck_imputer.transform(
            frame[['discrete_deck', 'Fare']]
        )[:, 0]

    # Transform
    transform_titanic = ColumnTransformer([
        # Name - custom function transformer
        ('name_length', FunctionTransformer(_name_length), ['Name']),
        # A fresh scaler for Fare instead of reusing the one fitted on deck.
        ('price', MinMaxScaler(), ['Fare']),
        # SibSp + Parch - sum, bins
        ('family_size', KBinsDiscretizer(n_bins=3, strategy='uniform'), ['family']),
        ('do_nothing', 'passthrough', ['Age', 'is_male', 'discrete_deck']),
    ])
    transform_titanic.fit(reference)
    return transform_titanic.transform(target)

### Cross validation

In [None]:
# Feature matrix built from the training split.
X_train_preprocessed_np = process_titanic_data(X_train)

# Per-fold accuracy of a logistic regression under 5-fold cross validation.
model = LogisticRegression(max_iter=100000)
accuracy = cross_val_score(
    estimator=model,
    X=X_train_preprocessed_np,
    y=y_train.to_numpy().ravel(),
    scoring='accuracy',
    cv=5,
)
accuracy

In [None]:
# Per-fold precision of a logistic regression under 5-fold cross validation,
# computed on the feature matrix prepared in the accuracy cell.
model = LogisticRegression(max_iter=100000)
precision = cross_val_score(
    estimator=model,
    X=X_train_preprocessed_np,
    y=y_train.to_numpy().ravel(),
    scoring='precision',
    cv=5,
)
precision

In [None]:
# NOTE: each process_titanic_data call fits its preprocessing steps on the
# frame it receives, so the hold-out features below are imputed and scaled
# from hold-out statistics rather than from the training ones.
X_train_preprocessed_np = process_titanic_data(X_train)
X_test_preprocessed_np = process_titanic_data(X_test)

# Fit on the training split, predict the hold-out split.
model = LogisticRegression(C=1e5, max_iter=100000)
model.fit(X_train_preprocessed_np, y_train.values.ravel())
pred = model.predict(X_test_preprocessed_np)

# Share of hold-out passengers whose survival was predicted correctly.
holdout_accuracy = np.mean(pred == y_test.values.ravel())
holdout_accuracy


### Kaggle submission

In [None]:
kaggle_X_train = pd.read_csv('input_data/train_kaggle.csv')
kaggle_X_test = pd.read_csv('input_data/test_kaggle.csv')

# Labels come from the frame already in memory; no second read of the CSV.
kaggle_y = kaggle_X_train[['Survived']]

# NOTE: each process_titanic_data call fits its preprocessing steps on the
# frame it receives, so the test features are imputed and scaled from test
# statistics rather than from the training ones.
kaggle_X_train_preprocessed_np = process_titanic_data(kaggle_X_train)
kaggle_X_test_preprocessed_np = process_titanic_data(kaggle_X_test)

model = LogisticRegression(C=1e5, max_iter=100000)
model.fit(kaggle_X_train_preprocessed_np, kaggle_y.values.ravel())
pred = model.predict(kaggle_X_test_preprocessed_np)

# Build the PassengerId -> Survived table without modifying kaggle_X_test.
result = (
    kaggle_X_test[['PassengerId']]
    .assign(Survived=pred)
    .set_index('PassengerId')
)

# Create the output directory first so to_csv cannot fail on a fresh checkout.
output_path = Path('output_data/kaggle_prediction.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
result.to_csv(output_path)
