In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df_train = pd.read_csv("data/train.csv")

y_train = df_train["Survived"]
X_train = df_train.drop(["Survived", "Cabin", "Embarked"], axis = 1)
y_test = pd.read_csv("data/gender_submission.csv")["Survived"]
X_test = pd.read_csv("data/test.csv").drop(["Cabin", "Embarked"], axis = 1)

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test], ignore_index = True)

df = X.copy()
df["Survived"] = y

# print(X_train.shape[0], y_train.size, X_test.shape[0], y_test.size)
print("Train size: {}\nTest size: {}\n".format(X_train.shape[0], X_test.shape[0]) ) 

print(X_train.columns)

Train size: 891
Test size: 418

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare'],
      dtype='object')


In [23]:
survived = df[ df["Survived"] == 1]
n_survived = survived.shape[0]
not_survived = df[ df["Survived"] == 0]
n_not_survived = not_survived.shape[0]

survived_stats = survived.describe()
not_survived_stats = not_survived.describe()

comparation_stats = pd.DataFrame()
comparation_stats["Survived_Mean"] = survived_stats.loc["mean"]
comparation_stats["Not_Survived_Mean"] = not_survived_stats.loc["mean"]
comparation_stats["Survived_Std"] = survived_stats.loc["std"]
comparation_stats["Not_Survived_Std"] = not_survived_stats.loc["std"]
comparation_stats.drop("Survived", inplace = True, axis = 0)
print(comparation_stats)


             Survived_Mean  Not_Survived_Mean  Survived_Std  Not_Survived_Std
PassengerId     659.144554         652.396766    380.892025        376.419364
Pclass            2.067327           2.437811      0.863399          0.789005
Age              28.930622          30.513806     14.484727         14.342518
SibSp             0.445545           0.532338      0.713743          1.201990
Parch             0.390099           0.381841      0.715684          0.948076
Fare             43.928036          26.608753     63.737427         41.193629


# Analysing Age feature

In [27]:
age_series = X["Age"]
age_series.describe()

ascending_ages = age_series.sort_values(ascending = True)
unique_ages = pd.unique(age_series)
estimated_ages = age_series[age_series%1 == 0.5]
n_estimated = estimated_ages.size
n_unique = unique_ages.size
print("Quantity of estimated ages: {}\nQuantity of unique ages: {}".format(n_estimated, n_unique) )

age_and_survived = df[["Age", "Survived"]]
age_survived = age_and_survived[ age_and_survived["Survived"]==1 ]["Age"]
age_not_survived = age_and_survived[ age_and_survived["Survived"]==0 ]["Age"]

intervals = np.arange(0,101,10)
bins_survived = pd.cut(age_survived, intervals, right = False)
frequency_survived = bins_survived.value_counts(sort = False) / n_survived

bins_not_survived = pd.cut(age_not_survived, intervals, right = False)
frequency_not_survived = bins_not_survived.value_counts(sort = False) / n_not_survived

comparation_ages = pd.DataFrame()
comparation_ages["Survived"] = frequency_survived
comparation_ages["Not_Survived"] = frequency_not_survived

print()
print(comparation_ages)

Quantity of estimated ages: 33
Quantity of unique ages: 99

           Survived  Not_Survived
Age                              
[0, 10)    0.087129      0.047264
[10, 20)   0.110891      0.108209
[20, 30)   0.257426      0.266169
[30, 40)   0.192079      0.167910
[40, 50)   0.100990      0.104478
[50, 60)   0.055446      0.052239
[60, 70)   0.021782      0.026119
[70, 80)   0.000000      0.008706
[80, 90)   0.001980      0.000000
[90, 100)  0.000000      0.000000


# Feature selection

In [None]:
from sklearn.feature_selection import (
SelectKBest, 
f_classif)
