In [224]:
# make it nice and dark
import pandas as pd
from jupyterthemes import jtplot
# Apply the jupyterthemes plot style (the "nice and dark" look).
jtplot.style()

# Read both CSV files from the current working directory, test file first.
test_df, train_df = pd.read_csv("test.csv"), pd.read_csv("train.csv")

In [225]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [226]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [227]:
# TODO: take this cell out (translated from the Dutch "haal uit")
#tit_df = pd.read_csv("titanic.csv")
#df=test_df[['Name', 'Ticket', 'PassengerId']].rename(columns={'Name':'name', 'Ticket':'ticket'})
#pd.merge(df ,tit_df, on=['name', 'ticket'], how='inner')
# (the merge above would give, more or less, the answers for the test set)

In [228]:
# Feature columns taken from the training frame.
X = train_df.loc[:, ['Pclass', 'Name', 'Sex', 'Age', 'SibSp',
                     'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]
# The list selector keeps the target as a one-column DataFrame, not a Series.
y = train_df.loc[:, ['Survived']]

In [229]:
# from previous chapters
from sklearn.base import BaseEstimator, TransformerMixin

# A class to select numerical or categorical columns
# since Scikit-Learn doesn't handle DataFrames yet


class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the requested columns of its input.

    Parameters
    ----------
    attribute_names : column label or list of column labels
        Stored as-is and used to index ``X`` in :meth:`transform`.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured ``attribute_names``."""
        wanted = self.attribute_names
        return X[wanted]

In [230]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: pick the four numeric columns, fill gaps with the column
# median, then standardise.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(
        attribute_names=['Age', 'SibSp', 'Parch', 'Fare'])),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [231]:
# Fit the numeric pipeline a single time and reuse the resulting array for
# both the preview and its shape, instead of fitting once per displayed item.
num_preview = num_pipeline.fit_transform(train_df)
num_preview, num_preview.shape

(array([[-0.56573646,  0.43279337, -0.47367361, -0.50244517],
        [ 0.66386103,  0.43279337, -0.47367361,  0.78684529],
        [-0.25833709, -0.4745452 , -0.47367361, -0.48885426],
        ...,
        [-0.1046374 ,  0.43279337,  2.00893337, -0.17626324],
        [-0.25833709, -0.4745452 , -0.47367361, -0.04438104],
        [ 0.20276197, -0.4745452 , -0.47367361, -0.49237783]]), (891, 4))

In [232]:
from sklearn.preprocessing import OneHotEncoder
# Categorical branch: pick the three categorical columns, fill gaps with the
# most frequent value, then one-hot encode into a dense array.
# NOTE(review): newer scikit-learn releases renamed the `sparse` keyword to
# `sparse_output`; confirm against the installed version before upgrading.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(
        attribute_names=['Pclass', 'Sex', 'Embarked'])),
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(sparse=False)),
])

In [233]:
# Fit the categorical pipeline a single time and reuse the resulting array for
# both the preview and its shape, instead of fitting once per displayed item.
cat_preview = cat_pipeline.fit_transform(train_df)
cat_preview, cat_preview.shape

(array([[0., 0., 1., ..., 0., 0., 1.],
        [1., 0., 0., ..., 1., 0., 0.],
        [0., 0., 1., ..., 0., 0., 1.],
        ...,
        [0., 0., 1., ..., 0., 0., 1.],
        [1., 0., 0., ..., 1., 0., 0.],
        [0., 0., 1., ..., 0., 1., 0.]]), (891, 8))

In [234]:
from sklearn.pipeline import FeatureUnion
# Run both branches on the same input and stack their outputs side by side
# (numeric columns first, then the one-hot columns).
preprocess_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [235]:
# Target labels: the 'Survived' column of the training frame.
y_train = train_df.loc[:, "Survived"]

# Fit the preprocessing on the training frame only, then apply the already
# fitted transformers to the test frame.
X_train = preprocess_pipeline.fit_transform(train_df)
X_test = preprocess_pipeline.transform(test_df)

# Empty table that the model cells below append their mean CV score to.
scores = pd.DataFrame(columns=['Score', 'Model'])

In [236]:
# from book
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


def plot_learning_curves(model, X, y):
    """Plot training and validation RMSE as the training subset grows.

    The data is split 80/20 (``random_state=10``). For every subset size ``m``
    from 2 up to (but not including) the size of the training split, ``model``
    is refitted on the first ``m`` training rows and its mean squared error is
    measured on those same rows and on the whole validation split. The square
    roots of both error series are drawn on the current matplotlib figure.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Refitted in place on every iteration.
    X, y : features and targets accepted by ``train_test_split`` that also
        support ``[:m]`` slicing.

    Returns
    -------
    None
    """
    # Local import: the square roots below need numpy, and the function must
    # not rely on `np` happening to exist in the kernel namespace.
    import numpy as np

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=10)
    train_errors, val_errors = [], []
    # Subset sizes start at 2, so position 0 on the x-axis is a 2-row fit.
    for m in range(2, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))

    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)   # not shown in the book
    plt.xlabel("Training set size", fontsize=14)  # not shown
    plt.ylabel("RMSE", fontsize=14)              # not shown

In [237]:
# softmax
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Multinomial ("softmax") logistic regression on the preprocessed features.
softmax_reg = LogisticRegression(
    multi_class="multinomial", solver="lbfgs", C=10, random_state=42)
print(softmax_reg.fit(X_train, y_train))

# Mean of the 10 cross-validation fold scores (estimator's default scorer).
softmax_cv_mean = cross_val_score(softmax_reg, X_train, y_train, cv=10).mean()
softmax_row = pd.DataFrame([[softmax_cv_mean, 'softmax']],
                           columns=['Score', 'Model'])
# Drop any row left by an earlier run of this cell so that re-running it does
# not append a duplicate 'softmax' entry to the shared table.
scores = pd.concat([scores[scores['Model'] != 'softmax'], softmax_row])
print(scores[scores['Model'] == 'softmax'])
print(softmax_reg.coef_)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
      Score    Model
0  0.799197  softmax
[[-0.25006007 -0.17709996 -0.03757076  0.0585666   0.52968566  0.07173743
  -0.54246169  0.70848315 -0.64952175  0.10154609  0.07290374 -0.11548844]]


In [238]:
# tree regression
from sklearn.tree import DecisionTreeRegressor

# Shallow regression tree fitted on the 0/1 target.
tree_reg = DecisionTreeRegressor(max_depth=2, random_state=42)
print(tree_reg.fit(X_train, y_train))

# NOTE: with no `scoring` argument cross_val_score uses the estimator's own
# `score` method; for a regressor that is R^2, not accuracy, so this value is
# not on the same scale as the classifier rows in `scores`.
tree_reg_cv_mean = cross_val_score(tree_reg, X_train, y_train, cv=10).mean()
tree_reg_row = pd.DataFrame([[tree_reg_cv_mean, 'tree_reg']],
                            columns=['Score', 'Model'])
# Drop any row left by an earlier run of this cell so that re-running it does
# not append a duplicate 'tree_reg' entry to the shared table.
scores = pd.concat([scores[scores['Model'] != 'tree_reg'], tree_reg_row])
print(scores[scores['Model'] == 'tree_reg'])

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')
      Score     Model
0  0.342846  tree_reg


In [239]:
# SVC
from sklearn.svm import SVC

# Support-vector classifier with gamma="auto" and otherwise default settings.
svm_clf = SVC(gamma="auto")
print(svm_clf.fit(X_train, y_train))

# Mean of the 10 cross-validation fold scores (estimator's default scorer).
svc_cv_mean = cross_val_score(svm_clf, X_train, y_train, cv=10).mean()
svc_row = pd.DataFrame([[svc_cv_mean, 'SVC']], columns=['Score', 'Model'])
# Drop any row left by an earlier run of this cell so that re-running it does
# not append a duplicate 'SVC' entry to the shared table.
scores = pd.concat([scores[scores['Model'] != 'SVC'], svc_row])
print(scores[scores['Model'] == 'SVC'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
      Score Model
0  0.826064   SVC


In [240]:
# RFC
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed for repeatable results.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
print(forest_clf.fit(X_train, y_train))

# Mean of the 10 cross-validation fold scores (estimator's default scorer).
rfc_cv_mean = cross_val_score(forest_clf, X_train, y_train, cv=10).mean()
rfc_row = pd.DataFrame([[rfc_cv_mean, 'RFC']], columns=['Score', 'Model'])
# Drop any row left by an earlier run of this cell so that re-running it does
# not append a duplicate 'RFC' entry to the shared table.
scores = pd.concat([scores[scores['Model'] != 'RFC'], rfc_row])
print(scores[scores['Model'] == 'RFC'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
      Score Model
0  0.814953   RFC


In [241]:
# Linear R
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regression fitted on the 0/1 target.
lin_reg = LinearRegression()
print(lin_reg.fit(X_train, y_train))

# NOTE: with no `scoring` argument cross_val_score uses the estimator's own
# `score` method; for a regressor that is R^2, not accuracy, so this value is
# not on the same scale as the classifier rows in `scores`.
lin_reg_cv_mean = cross_val_score(lin_reg, X_train, y_train, cv=10).mean()
lin_reg_row = pd.DataFrame([[lin_reg_cv_mean, 'Linear_R']],
                           columns=['Score', 'Model'])
# Drop any row left by an earlier run of this cell so that re-running it does
# not append a duplicate 'Linear_R' entry to the shared table.
scores = pd.concat([scores[scores['Model'] != 'Linear_R'], lin_reg_row])
print(scores[scores['Model'] == 'Linear_R'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
     Score     Model
0  0.36251  Linear_R


In [242]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 2 closest training points.
knn = KNeighborsClassifier(n_neighbors=2)
print(knn.fit(X_train, y_train))

# Mean of the 10 cross-validation fold scores (estimator's default scorer).
knn_cv_mean = cross_val_score(knn, X_train, y_train, cv=10).mean()
knn_row = pd.DataFrame([[knn_cv_mean, 'KNN']], columns=['Score', 'Model'])
# Drop any row left by an earlier run of this cell so that re-running it does
# not append a duplicate 'KNN' entry to the shared table.
scores = pd.concat([scores[scores['Model'] != 'KNN'], knn_row])
print(scores[scores['Model'] == 'KNN'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2, p=2,
           weights='uniform')
    Score Model
0  0.7958   KNN


In [243]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
gnb_clf = GaussianNB()
print(gnb_clf.fit(X_train, y_train))

# Mean of the 10 cross-validation fold scores (estimator's default scorer).
gnb_cv_mean = cross_val_score(gnb_clf, X_train, y_train, cv=10).mean()
gnb_row = pd.DataFrame([[gnb_cv_mean, 'gnb_clf']], columns=['Score', 'Model'])
# Drop any row left by an earlier run of this cell so that re-running it does
# not append a duplicate 'gnb_clf' entry to the shared table.
scores = pd.concat([scores[scores['Model'] != 'gnb_clf'], gnb_row])
print(scores[scores['Model'] == 'gnb_clf'])

GaussianNB(priors=None, var_smoothing=1e-09)
      Score    Model
0  0.784677  gnb_clf


In [244]:
# Rank the collected models from highest to lowest mean CV score.
scores.sort_values(by='Score', ascending=False)

Unnamed: 0,Score,Model
0,0.826064,SVC
0,0.814953,RFC
0,0.799197,softmax
0,0.7958,KNN
0,0.784677,gnb_clf
0,0.36251,Linear_R
0,0.342846,tree_reg
