<a href="https://www.kaggle.com/code/jannikca/titanic-competition-with-xgboost?scriptVersionId=123745225" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb

#Own imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


# **Intention of the Notebook**

This is my first competition notebook that is not part of a course exercise.
I recap many concepts on my own in order to make reasonable predictions.
Afterwards, I look at how those predictions can be improved.

In [2]:
# Load the competition's training and test tables
train_raw = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
X_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')


# Discard rows without a target value, then split the target off from the predictors
train_labelled = train_raw.dropna(axis=0, subset=['Transported'])
y = train_labelled['Transported']
X = train_labelled.drop(columns=['Transported'])


# Categorical features: object-typed columns with fewer than 20 distinct values.
# The cardinality limit filters out columns like ids or names, whose values are
# (nearly) unique, so nothing can be learned from them.
categorical_cols = [
    cname for cname, dtype in X.dtypes.items()
    if dtype == "object" and X[cname].nunique() < 20
]

# Numerical features: every int64 / float64 column
numerical_cols = [
    cname for cname, dtype in X.dtypes.items()
    if dtype in ['int64', 'float64']
]


# Restrict the training frame to the selected columns (categorical first, then numerical)
my_cols = [*categorical_cols, *numerical_cols]

X_train = X.loc[:, my_cols].copy()



Next, we set up a pipeline to deal with missing values and to encode categorical variables. Afterward, we fit an XGBoost model.

In [3]:
# Numerical features: replace missing values with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical features: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps prediction from failing on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer; columns that are not listed here
# are dropped (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# learning_rate and n_estimators are deliberately not set here: every grid
# candidate below overrides them, so constructor values would never be used.
# n_jobs=1 because the grid search already runs the fits in parallel; giving
# each model its own thread pool on top of that oversubscribes the CPU.
# random_state is pinned so reruns stay comparable.
model = xgb.XGBClassifier(n_jobs=1, random_state=0)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Search space: 3 depths x 10 tree counts x 3 learning rates = 90 candidates.
# The 'model__' prefix addresses parameters of the pipeline's 'model' step.
parameters = {
    'model__max_depth': list(range(2, 5)),
    'model__n_estimators': list(range(60, 250, 20)),
    'model__learning_rate': [0.1, 0.01, 0.05]
}


# Exhaustive search scored by accuracy with 6-fold cross-validation.
# Parallelism lives here (4 worker processes) rather than inside the model.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    n_jobs=4,
    cv=6,
    verbose=1
)


grid_search.fit(X_train, y)





Fitting 6 folds for each of 90 candidates, totalling 540 fits


GridSearchCV(cv=6,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(),
                                                                         ['Age',
                                                                          'RoomService',
                                                                          'FoodCourt',
                                                                          'ShoppingMall',
                                                                          'Spa',
                                                                          'VRDeck']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                            

In [4]:
# Report the best mean cross-validated score found by the grid search
print(grid_search.best_score_)

# Predict the test set with the best pipeline and cast the labels to booleans,
# which is the type the submission's 'Transported' column is written with
best_pipeline = grid_search.best_estimator_
predictions = best_pipeline.predict(X_test).astype(bool)

0.7961590326471423


In [5]:
# Save the predictions in the format used for competition scoring:
# one row per test passenger with its id and the predicted target
submission_columns = {
    'PassengerId': X_test['PassengerId'],
    'Transported': predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)