# 1. Read, Skim and Pre-process data

In [1]:
# 1.0 Initial Codes given from Kaggle

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.inspection import permutation_importance

# 1.1 Read and skim the training data

TRAIN_CSV = '/kaggle/input/titanic/train.csv'
df = pd.read_csv(TRAIN_CSV)

print(df.head())    # first rows, plain-text rendering
df.info()           # per-column dtype and non-null count
df.describe()       # summary statistics; as the last expression it becomes the cell output

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [3]:
# 1.2 Find what needs pre-processing

embarked = df["Embarked"]
print(embarked.unique())                                             # ['S' 'C' 'Q' nan]
print(embarked.value_counts())                                       # mode : 'S' (644/891)

# Remove  : 1 PassengerId, 3 Name, 8 Ticket (useless) / 10 Cabin (too many NaN)
# Replace : 4 Sex (categorical), 5 Age (fill NaN), 11 Embarked (some NaN, categorical)

['S' 'C' 'Q' nan]
S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [4]:
# 1.3 Pre-processing : drop unused columns, fill NaN, one-hot encode categoricals

# Remove  : 1 PassengerId, 3 Name, 8 Ticket (useless) / 10 Cabin (too many NaN)
# Replace : 4 Sex (categorical), 5 Age (fill NaN), 11 Embarked (some NaN, categorical)

df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Fill on the frame and reassign rather than calling
# `df["col"].fillna(..., inplace=True)`: that form is a chained in-place call on
# a column selection, which recent pandas warns about and which does not write
# back to `df` when Copy-on-Write is active.
df = df.fillna({
    "Age": df["Age"].mean(),      # mean imputation
    "Embarked": "S",              # "S" : mode
})

# Both categoricals are one-hot encoded. dtype is pinned so the indicator
# columns are 0/1 integers on every pandas version (the default became bool in
# pandas 2.0), which keeps them in describe() and numeric in to_numpy().
df = pd.get_dummies(df, columns=["Embarked", "Sex"], dtype="uint8")

print(df.head())
df.info()
df.describe()


   Survived  Pclass   Age  SibSp  Parch     Fare  Embarked_C  Embarked_Q  \
0         0       3  22.0      1      0   7.2500           0           0   
1         1       1  38.0      1      0  71.2833           1           0   
2         1       3  26.0      0      0   7.9250           0           0   
3         1       1  35.0      1      0  53.1000           0           0   
4         0       3  35.0      0      0   8.0500           0           0   

   Embarked_S  Sex_female  Sex_male  
0           1           0         1  
1           0           1         0  
2           1           1         0  
3           1           1         0  
4           1           0         1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Age         891 non-null    float64
 3   

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.188552,0.08642,0.725028,0.352413,0.647587
std,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429,0.391372,0.281141,0.446751,0.47799,0.47799
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104,0.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,29.699118,0.0,0.0,14.4542,0.0,0.0,1.0,0.0,1.0
75%,1.0,3.0,35.0,1.0,0.0,31.0,0.0,0.0,1.0,1.0,1.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0,1.0


# 2. HGB

In [5]:
# 2.1 Split input and target data

# Column 0 is Survived (the target); every remaining column is a model input.
target = df.iloc[:, 0].to_numpy()
data = df.iloc[:, 1:].to_numpy()

# Both arrays must have one entry per passenger.
for arr in (data, target):
    print(len(arr))                               # 891

print(data[:5])
print(target[:5])

891
891
[[ 3.     22.      1.      0.      7.25    0.      0.      1.      0.
   1.    ]
 [ 1.     38.      1.      0.     71.2833  1.      0.      0.      1.
   0.    ]
 [ 3.     26.      0.      0.      7.925   0.      0.      1.      1.
   0.    ]
 [ 1.     35.      1.      0.     53.1     0.      0.      1.      1.
   0.    ]
 [ 3.     35.      0.      0.      8.05    0.      0.      1.      0.
   1.    ]]
[0 1 1 1 0]


In [6]:
# 2.2 HGB

# Hold out 20% of the rows for validation.
train_input, valid_input, train_target, valid_target = train_test_split(
    data, target, test_size=0.2, random_state=604
)

# Cross-validate on the training split and report the mean train / test score.
hgb = HistGradientBoostingClassifier(random_state=604)
scores = cross_validate(hgb, train_input, train_target, return_train_score=True, n_jobs=-1)
print(scores['train_score'].mean(), scores['test_score'].mean())

# Fit the model that is used for everything below.
hgb.fit(train_input, train_target)

# Permutation importance, first on the training split (seed 604) and then on
# the validation split (seed 42). `result` is rebound on each pass, so after
# the loop it holds the validation-split run.
for inputs, targets, seed in (
    (train_input, train_target, 604),
    (valid_input, valid_target, 42),
):
    result = permutation_importance(hgb, inputs, targets, n_repeats=100, random_state=seed, n_jobs=-1)
    print(result.importances_mean)

# Accuracy on the held-out validation split.
print(hgb.score(valid_input, valid_target))

0.9459309962075663 0.8217275682064414
[0.10011236 0.14502809 0.02331461 0.02769663 0.1272191  0.00710674
 0.00230337 0.00734551 0.21139045 0.00049157]
[ 0.05893855  0.09547486  0.00910615  0.00083799  0.04061453 -0.01653631
  0.00039106 -0.00072626  0.16463687 -0.00346369]
0.8324022346368715


# 3. Submit

In [7]:
# 3.1 Read and pre-process the test data

test = pd.read_csv('/kaggle/input/titanic/test.csv')

# "PassengerId" is kept here (the training frame drops it) because the
# submission file needs it; it is skipped below when building test_input.
test = test.drop(columns=["Name", "Ticket", "Cabin"])

# Fill on the frame and reassign rather than calling
# `test["col"].fillna(..., inplace=True)`: that form is a chained in-place call
# on a column selection, which recent pandas warns about and which does not
# write back to `test` when Copy-on-Write is active.
test = test.fillna({
    "Age": test["Age"].mean(),
    "Fare": test["Fare"].mean(),
    "Embarked": "S",
})

# dtype is pinned so the indicator columns are 0/1 integers on every pandas
# version (the default became bool in pandas 2.0).
test = pd.get_dummies(test, columns=["Embarked", "Sex"], dtype="uint8")

print(test.head())
test.info()

# Column 0 is PassengerId; the remaining columns are the model inputs.
test_input = test.iloc[:, 1:].to_numpy()

   PassengerId  Pclass   Age  SibSp  Parch     Fare  Embarked_C  Embarked_Q  \
0          892       3  34.5      0      0   7.8292           0           1   
1          893       3  47.0      1      0   7.0000           0           0   
2          894       2  62.0      0      0   9.6875           0           1   
3          895       3  27.0      0      0   8.6625           0           0   
4          896       3  22.0      1      1  12.2875           0           0   

   Embarked_S  Sex_female  Sex_male  
0           0           0         1  
1           1           1         0  
2           0           0         1  
3           1           0         1  
4           1           1         0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Age          418 no

In [8]:
# 3.2 Generate the submission file

# Pair each test PassengerId with the model's predicted Survived label.
test_id = test.loc[:, "PassengerId"]
test_output = hgb.predict(test_input)

submission = pd.DataFrame({"PassengerId": test_id}).assign(Survived=test_output)
submission.to_csv("./submission_hgb.csv", index=False)

submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
