In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures

In [12]:
# 1

df = pd.read_csv('titanic.csv')
df = df.drop(['PassengerId', 'Name', 'Parch', 'Ticket', 'Cabin'], axis=1)

df.Sex = df.Sex.map({'male': 0, 'female': 1})

df.Age.fillna(df.Age.mean(), inplace=True)
df.Age = df.Age.astype(int)

df.Embarked = df.Embarked.map({'S': 0, 'C': 1, 'Q': 2})
df.Embarked.fillna(df.Embarked.mean(), inplace=True)
df.Embarked = df.Embarked.astype(int)

df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Embarked
0,0,3,0,22,1,7.2500,0
1,1,1,1,38,1,71.2833,1
2,1,3,1,26,0,7.9250,0
3,1,1,1,35,1,53.1000,0
4,0,3,0,35,0,8.0500,0
...,...,...,...,...,...,...,...
886,0,2,0,27,0,13.0000,0
887,1,1,1,19,0,30.0000,0
888,0,3,1,29,1,23.4500,0
889,1,1,0,26,0,30.0000,1


In [3]:
df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
Survived,1.0,-0.338481,0.543351,-0.067809,0.257307,0.106811
Pclass,-0.338481,1.0,-0.1319,-0.335071,-0.5495,0.045702
Sex,0.543351,-0.1319,1.0,-0.082533,0.182333,0.116569
Age,-0.067809,-0.335071,-0.082533,1.0,0.093856,0.000234
Fare,0.257307,-0.5495,0.182333,0.093856,1.0,0.062142
Embarked,0.106811,0.045702,0.116569,0.000234,0.062142,1.0


In [7]:
target_y = df.Survived
df_test = df.drop('Survived', axis=1)

X_train, X_test, y_train, y_test \
    = train_test_split(df_test, target_y, test_size=0.3, random_state=42)

classifiers = [
    ('Random Forest Classifier', RandomForestClassifier(max_depth=5)),
    ('K-nearest neighbours', KNeighborsClassifier(n_neighbors=5)),
    ('Support Vector Machine', SVC(kernel='linear', C=1.0)),
    ('Naїve Bayes: Gaussian', GaussianNB()),
    ('Naїve Bayes: Multinomial', MultinomialNB()),
    ('Naїve Bayes: Bernoulli', BernoulliNB())
]

classifier_accuracies = list(map(
    lambda c: (c[0], round(c[1].fit(X_train, y_train).score(X_test, y_test), 6)),
    classifiers
))

sorted(classifier_accuracies, key=lambda x: x[1], reverse=True)

[('Random Forest Classifier', 0.817164),
 ('Naїve Bayes: Bernoulli', 0.798507),
 ('Support Vector Machine', 0.791045),
 ('Naїve Bayes: Gaussian', 0.791045),
 ('Naїve Bayes: Multinomial', 0.708955),
 ('K-nearest neighbors', 0.679104)]

In [8]:
linreg_classifiers = [
    ('Linear Regression: Standard Scaler', pd.DataFrame(StandardScaler().fit_transform(df_test), columns=df_test.columns)),
    ('Linear Regression: Min-Max Scaler', pd.DataFrame(MinMaxScaler().fit_transform(df_test), columns=df_test.columns)),
    ('Linear Regression: Polynomial', pd.DataFrame(PolynomialFeatures(degree=2, include_bias=False).fit_transform(df_test)))
]

def l(c):
    _X_train, _X_test, _y_train, _y_test = train_test_split(c[1], target_y, test_size=0.3, random_state=42)
    return c[0], round(LinearRegression().fit(_X_train, _y_train).score(_X_test, _y_test), 6)

linger_classifier_accuracies = list(map(
    lambda c: l(c),
    linreg_classifiers
))

sorted(linger_classifier_accuracies, key=lambda x: x[1], reverse=True)

[('Linear Regression: Standard Scaler', 0.433254),
 ('Linear Regression: Min-Max Scaler', 0.433254),
 ('Linear Regression: Polynomial', 0.41877)]

In [10]:
max(classifier_accuracies + linger_classifier_accuracies, key=lambda c:c[1])

('Random Forest Classifier', 0.817164)