Initialization

In [328]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

train_data = pd.read_csv("./train.csv")
val_data = pd.read_csv("./test.csv")

print(train_data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [329]:
from sklearn import preprocessing

In [330]:
y = train_data["Survived"]
print(y)

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


In [331]:
train_data["Sex"] = train_data["Sex"].apply(lambda sex: 0 if sex == "male" else 1)

In [332]:
X = train_data.drop(["Survived", "PassengerId", "Name", "Ticket", "Cabin", "Embarked"], axis=1)
X = X.interpolate(method="linear", axis=0)

In [333]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,0,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,3,1,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,3,0,35.0,0,0,8.05


In [334]:
def norm(x: float | int, df: pd.DataFrame, label: str) -> float:
    return (x - df[label].min())/(df[label].max() - df[label].min()) * 100

In [335]:
X["Age"] = X["Age"].apply(lambda x: norm(x, X, "Age"))
X["Fare"] = X["Fare"].apply(lambda x: norm(x, X, "Fare"))

In [336]:
print(y.size)

891


In [337]:
X.sample(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
297,1,1,1.985423,1,2,29.58059
261,3,0,3.242021,4,2,6.126432
767,3,1,37.798442,0,0,1.512699
376,3,1,27.117366,0,0,1.415106
18,3,1,38.42674,1,0,3.513366
711,1,0,44.709726,0,0,5.182215
262,1,0,64.81528,1,1,15.546645
480,3,0,10.781603,5,2,9.15427
869,3,0,4.498618,1,1,2.173075
233,3,1,5.755215,4,2,6.126432


In [338]:
from sklearn.ensemble import RandomForestClassifier

In [339]:
?RandomForestClassifier

[1;31mInit signature:[0m
[0mRandomForestClassifier[0m[1;33m([0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'gini'[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_features[0m[1;33m=[0m[1;34m'sqrt'[0m[1;33m,[0m[1;33m
[0m    [0mmax_leaf_nodes[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mbootstrap[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0moob_score[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1;32mN

In [340]:
model = RandomForestClassifier(n_estimators=200, random_state=0)

In [341]:
model.fit(X, y)

In [342]:
test_X = val_data.drop(["PassengerId", "Name", "Ticket", "Cabin", "Embarked"], axis=1)
test_X["Sex"] = test_X["Sex"].apply(lambda sex: 0 if sex == "male" else 1)
test_X = test_X.interpolate(method="linear", axis=0)
test_X["Age"] = test_X["Age"].apply(lambda x: norm(x, test_X, "Age"))
test_X["Fare"] = test_X["Fare"].apply(lambda x: norm(x, test_X, "Fare"))

In [343]:
X.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,0.352413,36.825912,0.523008,0.381594,6.285843
std,0.836071,0.47799,17.469657,1.102743,0.806057,9.699511
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,25.860769,0.0,0.0,1.544007
50%,3.0,0.0,35.285248,0.0,0.0,2.821272
75%,3.0,1.0,47.22292,1.0,0.0,6.050797
max,3.0,1.0,100.0,8.0,6.0,100.0


In [344]:
test_X.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,0.363636,39.397667,0.447368,0.392344,6.942016
std,0.841838,0.481622,17.806552,0.89676,0.981429,10.902077
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,28.128709,0.0,0.0,1.541158
50%,3.0,0.0,36.304893,0.0,0.0,2.821272
75%,3.0,1.0,51.041804,1.0,0.0,6.142901
max,3.0,1.0,100.0,8.0,9.0,100.0


In [345]:
pred_y = model.predict(test_X)

In [346]:
submission = pd.DataFrame({
    "PassengerId": val_data["PassengerId"],
    "Survived": pred_y
})
submission.to_csv("./submission.csv", index=False)