In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [None]:
# Read the training and test tables from the local data directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Put matplotlib in interactive mode so figures are drawn without blocking.
plt.ion()

# Preview the first 20 training rows.
train.head(n=20)


In [None]:
# Summary statistics of the training frame's columns.
train.describe()


In [None]:
# Boolean mask of the training frame: True wherever a value is missing.
missing_values = train.isna()

# Draw the mask as a heatmap so columns with many gaps stand out;
# row labels and the colour bar are switched off.
sns.heatmap(missing_values, cmap="viridis", cbar=False, yticklabels=False)

In [None]:
# Number of training rows for each value of `Survived`.
sns.countplot(data=train, x='Survived')

In [None]:
# Same count of `Survived`, with each bar split by `Pclass`.
sns.countplot(data=train, x='Survived', hue='Pclass')

In [None]:
# Distribution of `Age` within each `Pclass`, split by `Survived`.
sns.boxplot(
    data=train,
    x="Pclass",
    y="Age",
    hue="Survived",
    fill=False,
    gap=0.1,
)

In [None]:
# Distribution of `Age` within each `Pclass`.
sns.boxplot(
    data=train,
    x="Pclass",
    y="Age",
    fill=False,
    gap=0.1,
)

In [None]:
def impute_age(cols, class_ages=None, default_age=24):
    """Return a row's age, substituting a per-class value when it is missing.

    Meant for row-wise use, e.g.
    ``df[['Age', 'Pclass']].apply(impute_age, axis=1)``.

    Parameters
    ----------
    cols : mapping-like (typically a row ``pd.Series``)
        Must provide the keys ``'Age'`` and ``'Pclass'``.
    class_ages : dict, optional
        Replacement age keyed by ``Pclass`` value. Defaults to
        ``{1: 37, 2: 29}``.
        NOTE(review): the defaults look like they were read off the
        Age-by-Pclass box plot -- confirm against the data.
    default_age : number, optional
        Replacement used when ``Pclass`` has no entry in ``class_ages``.
        Defaults to 24.

    Returns
    -------
    The original ``Age`` when it is present, otherwise the replacement
    selected by the row's ``Pclass``.
    """
    # Build the default table per call to avoid a shared mutable default.
    if class_ages is None:
        class_ages = {1: 37, 2: 29}

    age = cols['Age']
    if not pd.isnull(age):
        return age

    # Float class values such as 1.0 hash and compare equal to the int keys,
    # so rows that were upcast to float by the NaN age still match.
    return class_ages.get(cols['Pclass'], default_age)

# Fill missing training ages row by row; impute_age only needs these two columns.
age_and_class = train[['Age', 'Pclass']]
train['Age'] = age_and_class.apply(impute_age, axis=1)

# Preview the first 20 rows after imputation.
train.head(n=20)

In [None]:
# Recompute the missing-value mask now that Age has been filled in.
missing_values = train.isna()
sns.heatmap(missing_values, cmap="viridis", cbar=False, yticklabels=False)

In [None]:
# Remove the Cabin column from the training frame.
# Rebinding with errors='ignore' (rather than dropping in place) keeps this
# cell re-runnable: a second run no longer raises KeyError for the column
# that is already gone.
train = train.drop(columns='Cabin', errors='ignore')

# Re-plot the missing-value mask to check what is still missing.
missing_values = train.isnull()
sns.heatmap(missing_values, yticklabels=False, cbar=False, cmap="viridis")

In [None]:
# Remove the Cabin column, mirroring the training frame. errors='ignore'
# keeps the cell re-runnable once the column is already gone.
test = test.drop(columns='Cabin', errors='ignore')

# Fill missing ages with the same per-class rule used for the training frame,
# so both frames are preprocessed consistently and rows are not discarded
# merely because Age is missing.
test['Age'] = test[['Age', 'Pclass']].apply(impute_age, axis=1)

# Any rows that still contain a missing value are dropped.
test = test.dropna()

# Plot the missing-value mask to confirm nothing is left.
missing_values = test.isnull()
sns.heatmap(missing_values, yticklabels=False, cbar=False, cmap="viridis")

In [None]:
# Indicator columns for the two categorical features; drop_first=True omits
# the first level of each so the remaining columns are not redundant.
sex = pd.get_dummies(train['Sex'], drop_first=True)
embark = pd.get_dummies(train['Embarked'], drop_first=True)

# Replace the raw categorical / identifier columns with the indicator columns.
train = pd.concat(
    [
        train.drop(columns=['Sex', 'Embarked', 'Ticket', 'Name', 'PassengerId']),
        sex,
        embark,
    ],
    axis=1,
)
train.head(n=20)

In [None]:
# Same encoding as for the training frame, applied to the test frame.
sex = pd.get_dummies(test['Sex'], drop_first=True)
embark = pd.get_dummies(test['Embarked'], drop_first=True)

# Replace the raw categorical / identifier columns with the indicator columns.
test = pd.concat(
    [
        test.drop(columns=['Sex', 'Embarked', 'Ticket', 'Name', 'PassengerId']),
        sex,
        embark,
    ],
    axis=1,
)
test.head(n=20)

In [None]:
# Feature matrix and target for the classifier.
feature_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'male', 'Q', 'S']
x = train[feature_cols]
y = train['Survived']

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

# fit() returns the estimator itself, so construction and training are chained.
logmodel = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred = logmodel.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)


In [None]:
# Held-out features side by side with the predicted and the actual label.
# y_pred is a plain array and is attached positionally; y_test is a Series
# and is aligned on the index it shares with x_test.
predict = x_test.assign(**{'Y Predict': y_pred, 'Y Real': y_test})
predict.head(n=50)
