In [155]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold as SK
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost
import catboost
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier, VotingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score

In [156]:
data_train = pd.read_csv('files/titanic.csv')
data_test = pd.read_csv('files/titanic_test.csv')
all_df=pd.concat([data_train, data_test], ignore_index=True,sort=False)

In [157]:
for i in all_df.columns:
    print('{:10s} {:.5f}'.format(i, np.sum(all_df[i].isnull())/all_df.shape[0] ))

PassengerId 0.00000
Survived   0.31933
Pclass     0.00000
Name       0.00000
Sex        0.00000
Age        0.20092
SibSp      0.00000
Parch      0.00000
Ticket     0.00000
Fare       0.00076
Cabin      0.77464
Embarked   0.00153


### Fill NA
We will fill Age based on sex, Pclass and Fare. Sum SibSp and Parch and fill Embarked with most popular - S

In [158]:
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown="ignore"), ['Pclass', 'Sex']),
    ('scaling', StandardScaler(), ['Fare'])
])

pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_transformer),
    ('regression', LassoCV())
])

all_df['Fare'] = all_df['Fare'].fillna(all_df.loc[(all_df.Pclass==3) * (all_df.Sex=='male') * all_df.Fare.notnull(), 'Fare'].mean())

X_withAge = all_df[all_df.Age.notnull()]
y_withAge = all_df[all_df.Age.notnull()]['Age']
X_withoutAge = all_df[all_df.Age.isnull()]

predictAge = pipeline.fit(X_withAge, y_withAge)
age_predict = predictAge.predict(X_withoutAge)

all_df.loc[all_df.Age.isnull(), 'Age'] = age_predict
all_df['Rel'] = all_df.SibSp + all_df.Parch
all_df.loc[all_df.Embarked.isnull(), 'Embarked'] = 'S'
all_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Rel
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [159]:
np.min(age_predict), np.max(age_predict)

(21.15362800777276, 42.87505678006943)

# Ticket
Have a look at Ticket. We can took the second part A/5 21171 $-->$ 21171

In [160]:
for i in all_df.columns:
    print('{:10s} {:.5f}'.format(i, np.sum(all_df[i].isnull())/all_df.shape[0] ))

PassengerId 0.00000
Survived   0.31933
Pclass     0.00000
Name       0.00000
Sex        0.00000
Age        0.00000
SibSp      0.00000
Parch      0.00000
Ticket     0.00000
Fare       0.00000
Cabin      0.77464
Embarked   0.00000
Rel        0.00000


In [161]:
all_df['Ticket_Pre'] = all_df.Ticket.apply(lambda x: x.split(' ')[0] if x.isdigit()==False else 'NoPre')
reps = {'.' : '','/':''}
all_df.Ticket_Pre=all_df.Ticket_Pre.str.translate(str.maketrans(reps))
all_df['Ticket_Pre'].unique()

array(['A5', 'PC', 'STONO2', 'NoPre', 'PP', 'CA', 'SCParis', 'SCA4', 'A4',
       'SP', 'SOC', 'WC', 'SOTONOQ', 'WEP', 'STONO', 'C', 'SCPARIS',
       'SOP', 'Fa', 'LINE', 'FCC', 'SWPP', 'SCOW', 'PPP', 'SC', 'SCAH',
       'AS', 'SOPP', 'FC', 'SOTONO2', 'CASOTON', 'SCA3', 'STONOQ', 'AQ4',
       'A', 'LP', 'AQ3'], dtype=object)

In [162]:
all_df['Ticket'] = all_df.Ticket.apply(lambda x: x.split()[-1])

In [163]:
all_df[all_df['Ticket'].apply(lambda x: not x.isdigit())]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Rel,Ticket_Pre
179,180,0.0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0,,S,0,LINE
271,272,1.0,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0,,S,0,LINE
302,303,0.0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0,,S,0,LINE
597,598,0.0,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0,,S,0,LINE


In [164]:
all_df.loc[(all_df.Pclass == 3) * (all_df.Sex == 'male') * (all_df.Embarked == 'S'), 'Ticket'].describe()

count      366
unique     318
top       1601
freq         8
Name: Ticket, dtype: object

In [165]:
all_df.loc[all_df['Ticket'].apply(lambda x: not x.isdigit()), 'Ticket'] = 1601

In [166]:
all_df.loc[:, 'Ticket'] = all_df.loc[:, 'Ticket'].apply(lambda x: int(x))

In [167]:
all_df['Name_Prefix']=all_df['Name'].apply(lambda x: x[x.find(', ')+len(', '):x.rfind('.')])
all_df['Name_Prefix']=all_df['Name_Prefix'].replace("Mlle","Miss") #French  to En
all_df['Name_Prefix']=all_df['Name_Prefix'].replace("Mme","Mrs")   #French  to En
all_df['Name_Prefix']=all_df['Name_Prefix'].replace("Don","Sir")   #Spanish to En
all_df['Name_Prefix']=all_df['Name_Prefix'].replace("Dona","Mrs")  #Spanish to En

all_df['Name_Prefix']=all_df['Name_Prefix'].replace("Mrs. Martin (Elizabeth L","Mrs")  #Spanish to En
all_df['Name_Prefix']=all_df['Name_Prefix'].replace("Ms","Miss")  #Spanish to En
all_df['Name_Prefix']=all_df['Name_Prefix'].replace("Major","Col")  #Spanish to En
all_df['Name_Prefix']=all_df['Name_Prefix'].replace("Capt","Col")  #Spanish to En
all_df['Name_Prefix']=all_df['Name_Prefix'].replace('the Countess', "Mrs")  #Spanish to En
all_df['Name_Prefix']=all_df['Name_Prefix'].replace('Lady', "Miss")  #Spanish to En
all_df['Name_Prefix']=all_df['Name_Prefix'].replace('Jonkheer', "Mr")  #Spanish to En
all_df['Name_Prefix']=all_df['Name_Prefix'].replace('Sir', "Mr")  #Spanish to En
all_df['Name_Prefix'].value_counts()

Mr        760
Miss      265
Mrs       200
Master     61
Rev         8
Dr          8
Col         7
Name: Name_Prefix, dtype: int64

In [168]:
all_df.loc[all_df['Name_Prefix'] == 'Dr']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Rel,Ticket_Pre,Name_Prefix
245,246,0.0,1,"Minahan, Dr. William Edward",male,44.0,2,0,19928,90.0,C78,Q,2,NoPre,Dr
317,318,0.0,2,"Moraweck, Dr. Ernest",male,54.0,0,0,29011,14.0,,S,0,NoPre,Dr
398,399,0.0,2,"Pain, Dr. Alfred",male,23.0,0,0,244278,10.5,,S,0,NoPre,Dr
632,633,1.0,1,"Stahelin-Maeglin, Dr. Max",male,32.0,0,0,13214,30.5,B50,C,0,NoPre,Dr
660,661,1.0,1,"Frauenthal, Dr. Henry William",male,50.0,2,0,17611,133.65,,S,2,PC,Dr
766,767,0.0,1,"Brewe, Dr. Arthur Jackson",male,41.942594,0,0,112379,39.6,,C,0,NoPre,Dr
796,797,1.0,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S,0,NoPre,Dr
1184,1185,,1,"Dodge, Dr. Washington",male,53.0,1,1,33638,81.8583,A34,S,2,NoPre,Dr


In [171]:
categorical = ['Sex', 'Embarked', 'Name_Prefix']
numeric_features = ['Pclass', 'Age', 'Fare', 'Rel', 'Ticket']

column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first'), categorical),
    ('scaling', StandardScaler(), numeric_features) # how to not scaling?
])
all_df_trans = column_transformer.fit_transform(all_df)
X_train = all_df_trans[:891]
y_train = all_df.iloc[:891]['Survived']
X_test = all_df_trans[891:]
y_test = all_df.iloc[891:]['Survived']

cv = SK(n_splits=11, shuffle=True, random_state=42)
acc = []
models = []

for i in cv.split(X_train, y_train):
    model = xgboost.XGBClassifier(max_depth=3, eval_metric='logloss', n_estimators=500, learning_rate=1e-5, use_label_encoder=False)
    # model = SVC(C=1e-3)
    # model = LogisticRegression(C=1e-5)
    # model = LassoCV()
    model = pipeline2.fit(X_train[i[0]], y_train[i[0]])
    models.append(model)
    y_pred = model.predict(X_train[i[1]])
    acc.append(accuracy_score(y_pred, y_train.iloc[i[1]]))
round(np.mean(acc), 4)

0.8272

In [127]:
y_pred = np.vstack([models[i].predict(X_test) for i in range(11)])
y_pred = (np.sum(y_pred, axis=0) > 5) * 1
answer = pd.DataFrame({'PassengerId': all_df.iloc[891:].PassengerId, 'Survived': y_pred})
answer.to_csv('files/titanic_answ.csv', index=False)