In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

df = pd.read_csv("data/train.csv")
df['Ticket'] = pd.factorize(df['Ticket'])[0]
df['Embarked'] = pd.factorize(df['Embarked'])[0]
df_test_result = pd.read_csv("data/test.csv")
df_test_result['Ticket'] = pd.factorize(df_test_result['Ticket'])[0]
df_test_result['Embarked'] = pd.factorize(df_test_result['Embarked'])[0]

In [12]:
y = df["Survived"]
X = df.drop(["Survived", "Cabin", "Name", "PassengerId"], axis = 1, inplace = False)
X['Sex'] = pd.factorize(X['Sex'])[0]

df_test_result['Sex'] = pd.factorize(df_test_result['Sex'])[0]
df_test_result = df_test_result.drop(["Cabin", "Name", "PassengerId"], axis = 1, inplace = False)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 23, train_size = 0.85)

# print(X_train.shape[0], y_train.size, X_test.shape[0], y_test.size)
print("Train size: {}\nTest size: {}\n".format(X_train.shape[0], X_test.shape[0]) ) 

print(X_train.columns)
print(X_train.describe())

Train size: 757
Test size: 134

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked'], dtype='object')
           Pclass         Sex         Age       SibSp       Parch      Ticket  \
count  757.000000  757.000000  611.000000  757.000000  757.000000  757.000000   
mean     2.301189    0.361955   29.582111    0.536328    0.392338  306.799207   
std      0.838341    0.480884   14.601585    1.141017    0.812009  197.560484   
min      1.000000    0.000000    0.420000    0.000000    0.000000    0.000000   
25%      2.000000    0.000000   20.000000    0.000000    0.000000  132.000000   
50%      3.000000    0.000000   28.000000    0.000000    0.000000  286.000000   
75%      3.000000    1.000000   38.000000    1.000000    0.000000  473.000000   
max      3.000000    1.000000   80.000000    8.000000    6.000000  680.000000   

             Fare    Embarked  
count  757.000000  757.000000  
mean    32.457678    0.361955  
std     49.177664    0.637065  
min      0.000

In [13]:
survived = df[ df["Survived"] == 1]
n_survived = survived.shape[0]
not_survived = df[ df["Survived"] == 0]
n_not_survived = not_survived.shape[0]

survived_stats = survived.describe()
not_survived_stats = not_survived.describe()

comparation_stats = pd.DataFrame()
comparation_stats["Survived_Mean"] = survived_stats.loc["mean"]
comparation_stats["Not_Survived_Mean"] = not_survived_stats.loc["mean"]
comparation_stats["Survived_Std"] = survived_stats.loc["std"]
comparation_stats["Not_Survived_Std"] = not_survived_stats.loc["std"]
comparation_stats = comparation_stats.drop("Survived", inplace = False, axis = 0)
print(comparation_stats)


             Survived_Mean  Not_Survived_Mean  Survived_Std  Not_Survived_Std
PassengerId     444.368421         447.016393    252.358840        260.640469
Pclass            1.950292           2.531876      0.863321          0.735805
Age              28.343690          30.626179     14.950952         14.172110
SibSp             0.473684           0.553734      0.708688          1.288399
Parch             0.464912           0.329690      0.771712          0.823166
Ticket          295.043860         314.220401    183.833314        205.040971
Fare             48.395408          22.117887     66.596998         31.388207
Embarked          0.441520           0.307832      0.659535          0.620478


# Analysing Age feature

In [14]:
age_series = X["Age"]
age_series.describe()

ascending_ages = age_series.sort_values(ascending = True)
unique_ages = pd.unique(age_series)
estimated_ages = age_series[age_series%1 == 0.5]
n_estimated = estimated_ages.size
n_unique = unique_ages.size
print("Quantity of estimated ages: {}\nQuantity of unique ages: {}".format(n_estimated, n_unique) )

age_and_survived = df[["Age", "Survived"]]
age_survived = age_and_survived[ age_and_survived["Survived"]==1 ]["Age"]
age_not_survived = age_and_survived[ age_and_survived["Survived"]==0 ]["Age"]

intervals = np.arange(0,101,10)
bins_survived = pd.cut(age_survived, intervals, right = False)
frequency_survived = bins_survived.value_counts(sort = False) / n_survived

bins_not_survived = pd.cut(age_not_survived, intervals, right = False)
frequency_not_survived = bins_not_survived.value_counts(sort = False) / n_not_survived

comparation_ages = pd.DataFrame()
comparation_ages["Survived"] = frequency_survived
comparation_ages["Not_Survived"] = frequency_not_survived

print()
print(comparation_ages)

Quantity of estimated ages: 18
Quantity of unique ages: 89

           Survived  Not_Survived
Age                              
[0, 10)    0.111111      0.043716
[10, 20)   0.119883      0.111111
[20, 30)   0.225146      0.260474
[30, 40)   0.213450      0.171220
[40, 50)   0.099415      0.100182
[50, 60)   0.058480      0.051002
[60, 70)   0.017544      0.023679
[70, 80)   0.000000      0.010929
[80, 90)   0.002924      0.000000
[90, 100)  0.000000      0.000000


In [15]:
# Dropping Age feature

X_train = X_train.drop("Age", axis = 1, inplace = False)
X_test = X_test.drop("Age", axis = 1, inplace = False)
X = X.drop("Age", axis = 1, inplace = False)
df_test_result = df_test_result.drop("Age", axis = 1, inplace = False)


# Dropping and replacing remaining NaNs

X_train = X_train.dropna()
y_train = y_train.dropna()
X_test = X_test.dropna()
y_test = y_test.dropna()

mean_values = df_test_result.mean()
df_test_result = df_test_result.fillna(mean_values)

# Normalizing data

In [16]:
X_train = (X_train - X_train.min()) / (X_train.max() - X_train.min())
y_train = (y_train - y_train.min()) / (y_train.max() - y_train.min())
X_test = (X_test - X_test.min()) / (X_test.max() - X_test.min())
y_test = (y_test - y_test.min()) / (y_test.max() - y_test.min())
df_test_result = (df_test_result - df_test_result.min()) / (df_test_result.max() - df_test_result.min())


X_train.to_csv("final_data/X_train.csv", index = False)
y_train.to_csv("final_data/y_train.csv", index = False)
X_test.to_csv("final_data/X_test.csv", index = False)
y_test.to_csv("final_data/y_test.csv", index = False)
df_test_result.to_csv("final_data/df_test_result.csv", index = False)