In [520]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics

# titanic data

Najpierw zaloguj się do https://www.kaggle.com/ i przejdź do wyzwania https://www.kaggle.com/c/titanic, aby pobrać pliki `train.csv` i `test.csv`.

Zapisz je w katalogu datasets/titanic.

In [521]:
import os
# Relative directory ("datasets/titanic") used as the default location of the
# Titanic CSV files by load_titanic_data.
TITANIC_PATH = os.path.join("datasets", "titanic")

In [522]:
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Load one CSV file of the Titanic dataset into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, e.g. ``"train.csv"``.
    titanic_path : str
        Directory containing the file; defaults to ``TITANIC_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(os.path.join(titanic_path, filename))

In [523]:
# Prefer the Kaggle kernel input directory when it exists; otherwise fall back
# to the local TITANIC_PATH directory, so the notebook runs in both places.
KAGGLE_TITANIC_PATH = os.path.join("..", "input", "titanic")
titanic_dir = KAGGLE_TITANIC_PATH if os.path.isdir(KAGGLE_TITANIC_PATH) else TITANIC_PATH

train_data = load_titanic_data("train.csv", titanic_dir)
test_data = load_titanic_data("test.csv", titanic_dir)

* Dane są już podzielone na zestaw treningowy i zestaw testów. 
* Jednak dane testowe nie zawierają etykiet: Twoim celem jest wyszkolenie najlepszego modelu, który możesz wykorzystać w danych treningowych, następnie dokonanie swoich przewidywań na danych testowych i przesłanie ich do Kaggle, aby zobaczyć ostateczny wynik.

Rzućmy okiem na kilka pierwszych wierszy zestawu treningowego:

In [524]:
train_data.head()

The attributes have the following meaning:

* Survived: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
* Pclass: passenger class.
* Name, Sex, Age: self-explanatory
* SibSp: how many siblings & spouses of the passenger aboard the Titanic.
* Parch: how many children & parents of the passenger aboard the Titanic.
* Ticket: ticket id
* Fare: price paid (in pounds)
* Cabin: passenger's cabin number
* Embarked: where the passenger embarked the Titanic

Let's get more info to see how much data is missing:

In [525]:
train_data.info()

Atrybuty **Age**, **Cabin** oraz **Embarked** zawierają brakujące wartości (mniej niż 891 wartości niepustych), szczególnie **Cabin** (77% wartości pustych). Na razie zignorujemy **Cabin** i skupimy się na reszcie. Atrybut **Age** ma około 19% wartości pustych, więc będziemy musieli zdecydować, co z nimi zrobić. Zastąpienie wartości null medianą wieku wydaje się uzasadnione.

Atrybuty **Name** i **Ticket** mogą mieć pewną wartość, ale będą one nieco trudne do przekształcenia w użyteczne liczby. Na razie będziemy je ignorować.

Rzućmy okiem na atrybuty liczbowe:

In [526]:
train_data.describe()

In [527]:
# inspired from various titanic discussions
def clean_data(data, age_buckets=10, fare_buckets=4):
    """Return a cleaned copy of a raw Titanic frame.

    Steps, in order:

    * missing ``Age`` / ``Fare`` values are replaced by the median of the
      passenger's (``Pclass``, ``Sex``) group;
    * ``Age`` and ``Fare`` are replaced by the integer index of their quantile
      bucket (0 is the lowest bucket);
    * ``FamilySize`` (``Parch + SibSp + 1``) and the 0/1 flag ``Alone``
      (``FamilySize == 1``) are added;
    * ``Cabin``, ``Ticket``, ``Name``, ``Parch`` and ``SibSp`` are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns ``Pclass``, ``Sex``, ``Age``, ``Fare``,
        ``Parch``, ``SibSp``, ``Cabin``, ``Ticket`` and ``Name``.
    age_buckets : int, default 10
        Number of quantile buckets requested for ``Age``.
    fare_buckets : int, default 4
        Number of quantile buckets requested for ``Fare``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.

    Notes
    -----
    Bucket edges are computed from the frame that is passed in, so two frames
    cleaned by separate calls do not share the same edges.
    """
    cleaned = data.copy()
    group_keys = ['Pclass', 'Sex']

    for column, n_buckets in (('Age', age_buckets), ('Fare', fare_buckets)):
        # transform() keeps the original row index, so the per-group medians
        # line up with the column and can be used directly by fillna().
        group_median = cleaned.groupby(group_keys)[column].transform('median')
        filled = cleaned[column].fillna(group_median)
        # labels=False returns the integer bucket index and stays valid when
        # duplicates='drop' merges buckets (a fixed label list would not).
        cleaned[column] = pd.qcut(filled, q=n_buckets, labels=False, duplicates='drop')

    # replace parch and sibsp with new attribute: family size
    cleaned['FamilySize'] = cleaned['Parch'] + cleaned['SibSp'] + 1
    cleaned['Alone'] = (cleaned['FamilySize'] == 1).astype('int64')

    return cleaned.drop(columns=['Cabin', 'Ticket', 'Name', 'Parch', 'SibSp'])

In [528]:
# Apply the same cleaning to both frames. NOTE: the names are rebound to the
# cleaned frames, so re-running this cell without reloading the CSVs fails:
# clean_data reads the Parch/SibSp columns, which it has already dropped.
train_data=clean_data(train_data)
test_data=clean_data(test_data)

In [529]:
train_data.describe()

* Tylko 38% przeżyło: to wystarczająco blisko do 40%, więc **accuracy** będzie rozsądną miarą do oceny naszego modelu.

Sprawdźmy, czy etykiety przyjmują wartości 0 lub 1:

In [530]:
train_data.info()
train_data['Survived'].value_counts()

Nie zapomnij o etykietach:

In [531]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for the final evaluation. The target column
# is removed from the feature frame so no later step can read it by accident;
# the split itself (row count, random_state) is unchanged.
features = train_data.drop(columns=['Survived'])
labels = train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=42)
y_train.head()

Teraz rzućmy okiem na wszystkie atrybuty kategoryczne:

In [532]:
X_train['Sex'].value_counts()

In [533]:
X_train['Embarked'].value_counts()

Atrybut **Embarked** mówi nam, gdzie pasażer zaokrętował: C = Cherbourg, Q = Queenstown, S = Southampton.

Teraz zbudujmy nasze **pipeline** preprocessingu. 

Wykorzystamy DataframeSelector aby wybrać określone atrybuty z DataFrame:

In [534]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column-picking step so that the numeric and categorical pipelines can each
# start from the same DataFrame.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the requested DataFrame columns.

    Parameters
    ----------
    attribute_names : list of str
        Column labels used to index the frame in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``attribute_names``."""
        wanted = self.attribute_names
        return X[wanted]

Zbudujmy **pipeline** dla atrybutów numerycznych:

In [535]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# Numeric branch: select the numeric columns, then median-impute them.
num_pipeline = Pipeline(steps=[
    ("select_numeric", DataFrameSelector(attribute_names=['Pclass', 'FamilySize', 'Alone'])),
    ("imputer", SimpleImputer(strategy="median")),
])

In [536]:
num_pipeline.fit_transform(X_train)

Będziemy także potrzebować imputera dla kategorycznych kolumn tekstowych (zwykły Imputer nie działa na tych kolumnach):

In [537]:
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of each column with that column's most frequent value."""

    def fit(self, X, y=None):
        """Record, per column of ``X``, the first entry of ``value_counts()``."""
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the recorded values."""
        return X.fillna(self.most_frequent_)

Teraz możemy zbudować **pipeline** dla atrybutów kategorycznych:

In [538]:
# from future_encoders import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later releases dropped the old name, so try the new spelling first and
# fall back to the old one for older installations.
try:
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Categorical branch: select the columns, fill gaps with the most frequent
# value, then one-hot encode into a dense array.
cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(['Sex', 'Embarked', 'Age', 'Fare'])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", cat_encoder),
    ])

In [539]:
cat_pipeline.fit_transform(X_train)

Na koniec połączmy powyższe podejścia:

In [540]:
from sklearn.pipeline import FeatureUnion

# Combine the numeric and categorical branches into one preprocessing step.
preprocess_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

Teraz mamy fajny **pipeline** przetwarzania wstępnego, który pobiera dane wejściowe i zwraca dane wyjściowe złożone z liczb, które możemy podać do dowolnego modelu uczenia maszynowego.

# Zad

Robimy StratifiedKFold i znajdujemy optymalne parametry dla

* SVM z jądrem rbf
* SVM z jądrem poly
* SVM liniowego
* Regresji logistycznej

In [541]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified splitter, passed as `cv` to the grid searches below.
kfold = StratifiedKFold(n_splits=5)

In [542]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [543]:
# pipe = Pipeline([
#     ('preprocessing', preprocess_pipeline), 
#     ('classifier', SVC(kernel='linear'))])


# param_grid = {
#             'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
# }

# grid_1 = GridSearchCV(pipe, param_grid, cv=kfold)

# grid_1.fit(X_train, y_train)
# grid_1.best_params_

In [544]:
# pipe = Pipeline([
#     ('preprocessing', preprocess_pipeline),
#     ('classifier', SVC(kernel='poly'))])

# param_grid = {
#             'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
# }

# grid_2 = GridSearchCV(pipe, param_grid, cv=kfold)

# grid_2.fit(X_train, y_train)
# grid_2.best_params_

In [545]:
# pipe = Pipeline([
#     ('preprocessing', preprocess_pipeline),
#     ('classifier', SVC(kernel='linear'))])

# param_grid = {
#             'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
# }

# grid_3 = GridSearchCV(pipe, param_grid, cv=kfold)

# grid_3.fit(X_train, y_train)
# grid_3.best_params_

In [546]:
# from sklearn.linear_model import LogisticRegression

# pipe = Pipeline([
#     ('preprocessing', preprocess_pipeline),
#     ('classifier', LogisticRegression())])

# param_grid = {
#             'classifier__C': [0.001, 0.01, 0.1, 1, 10]
# }

# grid_4 = GridSearchCV(pipe, param_grid, cv=kfold)

# grid_4.fit(X_train, y_train)
# grid_4.best_params_

In [547]:
from sklearn.ensemble import RandomForestClassifier

# `max_features` is tuned by the grid below, so it is not set on the base
# estimator: the former 'auto' value was always overridden by the grid and is
# no longer an accepted value in recent scikit-learn releases.
pipe = Pipeline([
    ('preprocessing', preprocess_pipeline),
    ('classifier', RandomForestClassifier(random_state=420, criterion='gini'))])

param_grid = {
    'classifier__n_estimators': [100,150,200,300,400],
    'classifier__max_depth': [2,4,5,8,10],
    'classifier__max_features': [2,4,5,8,10]
}

# 5 x 5 x 5 = 125 parameter combinations, each cross-validated with `kfold`.
grid_5 = GridSearchCV(pipe, param_grid, cv=kfold)

grid_5.fit(X_train, y_train)
grid_5.best_params_

In [548]:
from sklearn import metrics


models = []
# models.append(('SVM linear', grid_1.best_estimator_))
# models.append(('SVM poly', grid_2.best_estimator_))
# models.append(('SVM linear', grid_3.best_estimator_))
# models.append(('Logistic regression', grid_4.best_estimator_))
models.append(('Random forest', grid_5.best_estimator_))


# Per-model scores on the held-out split, one entry per element of `models`.
# NOTE: these lists share their names with functions in sklearn.metrics; the
# functions are always reached through the `metrics.` prefix below.
precision_score = []
recall_score = []
f1_score = []
accuracy_score = []
for name, model in models:
    # Predict once per model and compute each metric once, then reuse the
    # values for both the printed report and the score lists.
    y_pred = model.predict(X_test)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)

    print(name)
    print("precision_score: {}".format(precision))
    print("recall_score: {}".format(recall))
    print("f1_score: {}".format(f1))
    print("accuracy_score: {}".format(accuracy))

    precision_score.append(precision)
    recall_score.append(recall)
    f1_score.append(f1)
    accuracy_score.append(accuracy)

In [549]:
# import pandas as pd
# d = {'precision_score': precision_score, 
#      'recall_score': recall_score, 
#      'f1_score': f1_score,
#      'accuracy_score' : accuracy_score
#     }
# df = pd.DataFrame(data=d)
# df.insert(loc=0, column='Method', value=['SVM rbf', 'SVM poly', 'SVM linear', 'Logistic Regression', 'Random forest'])
# print(df)

In [550]:
# Predict labels for the (cleaned) Kaggle test frame with the fitted grid search.
Y_pred = grid_5.predict(test_data)

In [551]:
# Build the two-column PassengerId / Survived frame and write it without the index.
submission_columns = {"PassengerId": test_data["PassengerId"], "Survived": Y_pred}
submission = pd.DataFrame(submission_columns)
submission.to_csv('./submission.csv', index=False)