In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

In [69]:
# Load the labelled training data and separate the target from the features.
titanic_df = pd.read_csv('titanic/train.csv')

X = titanic_df.drop(columns=['Survived'])
y = titanic_df['Survived']

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Rows to predict on (no Survived column); shown as this cell's output.
test_titanic_df = pd.read_csv('titanic/test.csv')
test_titanic_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [33]:
# Summary statistics of the feature frame; for a mixed-dtype frame describe()
# reports only the numeric columns by default. A `count` below the row total
# indicates missing values in that column.
X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [34]:
# Object-typed columns with fewer than 10 distinct values: few enough levels
# to one-hot encode without blowing up the feature count.
categorical_cols = [
    name
    for name, series in X_train.items()
    if series.dtype == 'object' and series.nunique() < 10
]
categorical_cols


['Sex', 'Embarked']

In [35]:
# Columns stored as 64-bit integers or floats are treated as numeric features.
numerical_cols = [
    name
    for name, series in X_train.items()
    if series.dtype in ('int64', 'float64')
]
numerical_cols

['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [65]:
# Columns kept for modelling: the numeric ones followed by the categorical ones.
selected_cols = [*numerical_cols, *categorical_cols]

# Restrict the train, validation and test frames to those columns, taking
# copies so later changes cannot touch the frames they were sliced from.
X_train_new, X_valid_new, X_test = (
    frame[selected_cols].copy()
    for frame in (X_train, X_valid, test_titanic_df)
)

In [70]:
# PassengerId is a row identifier, not a passenger attribute, so it is kept out
# of the model's inputs. It stays in X_test for building the submission below:
# ColumnTransformer drops any column that is not listed in a transformer.
model_numerical_cols = [col for col in numerical_cols if col != 'PassengerId']

# Numeric gaps are filled with a constant; SimpleImputer's default fill_value
# for numeric data is 0.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical gaps take the most frequent level, then each level becomes its
# own indicator column. Levels unseen during fit are encoded as all zeros
# instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, model_numerical_cols),
        ('categorical', categorical_transformer, categorical_cols)
    ]
)

# Preprocessing and model live in one pipeline so the imputers and encoder are
# fitted on the training rows only and re-applied unchanged at predict time.
titanic_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', XGBClassifier(n_estimators=400, learning_rate=0.048, random_state=0))
    ]
)

titanic_pipeline.fit(X_train_new, y_train)

# Score on the held-out validation rows. Test-set predictions cannot be scored
# because the test file carries no labels.
valid_prediction = titanic_pipeline.predict(X_valid_new)
valid_accuracy = accuracy_score(y_valid, valid_prediction)
print(f'Validation accuracy: {valid_accuracy:.4f}')

# Predict the unlabelled test rows and write the submission file.
y_prediction = titanic_pipeline.predict(X_test)
output = pd.DataFrame({'PassengerId': X_test['PassengerId'],
                       'Survived': y_prediction})
output.to_csv('submission.csv', index=False)