# Титаник

## Зареждане на необходимите библиотеки

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import mglearn

from sklearn.linear_model import Lasso, LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

from sklearn.svm import SVC

from sklearn.preprocessing import \
    Imputer, \
    LabelBinarizer, \
    StandardScaler, \
    PolynomialFeatures

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

from IPython.display import display

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
sns.set()

In [None]:
# Read the train and test CSV files, using PassengerId as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ('data/titanic-train.csv', 'data/titanic-test.csv')
)

# Both frames together, so feature engineering can loop over them.
full_data = [train, test]

# Preview the first rows of the training frame.
train.head(5)

## Разглеждане на данните

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

Имаме липсващи данни.

In [None]:
# Per-column overview of the training frame (used here to spot missing values).
train.info()

In [None]:
# Missing values per column, sorted ascending, for train and then test.
for frame in full_data:
    missing_per_column = frame.isnull().sum()
    print(missing_per_column.sort_values())

Трябва да попълним липсващите данни, но преди това...

## Базови модели

In [None]:
# Baseline features: the raw numeric columns only, no feature engineering yet.
# .copy() makes X an independent frame, so filling Age below never touches `train`.
X = train[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']].copy()
y = train['Survived']

# Assign the filled column back instead of calling fillna(inplace=True) on a column
# selection: that form is chained in-place mutation, which triggers
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
X['Age'] = X['Age'].fillna(X['Age'].mean())

# Stratify on the target so both splits are drawn with respect to the class labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8, stratify=y)

In [None]:
# Fit three untuned baseline classifiers on the same split.
lr_model = LogisticRegression().fit(X_train, y_train)
# Seeded so the baseline forest gives the same scores on every run.
rf_model = RandomForestClassifier(random_state=88).fit(X_train, y_train)
svc_model = SVC(C=1).fit(X_train, y_train)

# Report train vs. test accuracy per model; a large gap indicates overfitting.
print("logistic regression train score:", lr_model.score(X_train, y_train))
print("logistic regression test score: ", lr_model.score(X_test, y_test), end='\n\n')

print("random forest train score:", rf_model.score(X_train, y_train))
print("random forest test score: ", rf_model.score(X_test, y_test), end='\n\n')

print("support vector classifier train score:", svc_model.score(X_train, y_train))
print("support vector classifier test score: ", svc_model.score(X_test, y_test), end='\n\n')

Доста овърфит при svm и random forest класификаторите.

# Малко помощни класове

Този клас ще one-hot-encode-ва стрингови (категорийни) фийчъри. Подходящ е за pipeline-ове.

In [None]:
class StringLabelBinerizer(LabelBinarizer):
    """LabelBinarizer that returns its encoding as a DataFrame with named columns.

    Columns are named ``<prefix>_<class label>``. When exactly two classes were
    seen during fit, the result is given the single column name ``<prefix>``.

    Parameters
    ----------
    prefix : str
        Prefix used to build the output column names.
    key : callable
        Stored on the instance as ``self.key``; not used by this class itself.
    **kwargs
        Forwarded to ``LabelBinarizer.__init__``.
    """

    def __init__(self, prefix='col', key=lambda x: x, **kwargs):
        self.prefix = prefix
        self.key = key
        super(StringLabelBinerizer, self).__init__(**kwargs)

    def map_input(self, y):
        """Hook applied to the input before fit/transform; returns ``y`` unchanged."""
        return y

    def fit(self, y, prefix='col', key=None):
        """Learn the class labels present in ``y``.

        Parameters
        ----------
        y : array-like
            Values to encode.
        prefix
            Accepted for backward compatibility and ignored; the prefix given to
            the constructor is the one used by ``transform``.
        key : callable, optional
            When given, replaces ``self.key``. When omitted, the key passed to the
            constructor is kept instead of being reset to the identity function.

        Returns
        -------
        self
        """
        if key is not None:
            self.key = key
        return super(StringLabelBinerizer, self).fit(self.map_input(y))

    def transform(self, y):
        """Encode ``y`` and return the result as a DataFrame with named columns."""
        # str() lets non-string class labels (ints, bools, ...) be concatenated too.
        cols = [self.prefix + '_' + str(label) for label in self.classes_]
        if len(cols) == 2:
            # Two classes: the encoding is named with the bare prefix as one column.
            cols = [self.prefix]
        encoded = super(StringLabelBinerizer, self).transform(self.map_input(y))
        result = pd.DataFrame(encoded, columns=cols)
        # Shift the default 0-based row index to start at 1.
        # NOTE(review): presumably meant to line up with PassengerId in the train
        # set; the index of the input itself is not used - confirm before relying on it.
        result.index += 1
        return result

    def fit_transform(self, y, *_):
        """Fit on ``y`` and return its encoding; extra positional arguments are ignored."""
        self.fit(y)
        return self.transform(y)

Тези класове са от лекции. Използват се при създаването на трансформиращия пайплайн.

In [None]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer step that selects a single item of its input by key."""

    def __init__(self, key):
        # Key of the item to select, e.g. a DataFrame column name.
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns the selector itself."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict`` indexed with the one-element list ``[self.key]``."""
        selected_keys = [self.key]
        return data_dict[selected_keys]

class StringImputer(TransformerMixin):
    """Fill missing values using the first row of the fitted data's ``mode()``."""

    def fit(self, X, *_):
        """Store the first row of ``X.mode()`` as ``self.modes``; returns self."""
        column_modes = X.mode()
        self.modes = column_modes.iloc[0]
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values filled from the stored modes."""
        filled = X.fillna(self.modes)
        return filled

# Малко feature engineering

In [None]:
# Add the family-based features to both the train and the test frame.
for frame in full_data:
    relatives_aboard = frame['SibSp'] + frame['Parch']
    # +1 counts the passenger themselves.
    frame['FamilySize'] = relatives_aboard + 1
    travels_alone = frame['FamilySize'] == 1
    frame['IsAlone'] = travels_alone.apply(int)

Липсваше и една стойност във Fare фийчъра на тест сет-а. 

In [None]:
# Fill only the missing Fare values in the test set with the training mean.
# A plain `test['Fare'] = <scalar>` would overwrite the fare of every test passenger.
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())

Да се опитаме да попълним липсващите Age стойности с линейна регресия.

In [None]:
# Predictors used to estimate a passenger's age.
age_feature_cols = ['Pclass', 'FamilySize', 'IsAlone', 'Fare']
X_age_train = train[age_feature_cols]
X_age_test = test[age_feature_cols]

# Only training rows with a recorded Age can be used to fit the regressor.
age_known = ~train['Age'].isnull()
X_age_non_empty = X_age_train[age_known]
y_age_non_empty = train['Age'][age_known]

# Feature mapper is fitted on the known-age rows and reused when predicting later.
poly_feature_mapper = PolynomialFeatures(degree=1)
poly_feature_mapper.fit(X_age_non_empty)
X_age_mapped = poly_feature_mapper.transform(X_age_non_empty)

age_implant_model = Lasso(random_state=0, alpha=0.01)
age_implant_model.fit(X_age_mapped, y_age_non_empty)

# 3-fold cross-validated score of the age model on the known-age rows.
scores = cross_val_score(age_implant_model, X_age_mapped, y_age_non_empty, cv=3)
print('Age inplant model score: '+ str(scores.mean()))

Не можах да докарам до по-добра оценка :D

In [None]:
# LearnedAge starts as a copy of Age; rows without an Age get the model's prediction.
train['LearnedAge'] = train['Age']
test['LearnedAge'] = test['Age']

train_age_missing = train['Age'].isnull()
test_age_missing = test['Age'].isnull()

X_age_empty_train = poly_feature_mapper.transform(X_age_train[train_age_missing])
X_age_empty_test = poly_feature_mapper.transform(X_age_test[test_age_missing])

# Write through .loc in one step. The chained form `df['col'][mask] = ...` assigns
# to an intermediate object, raises SettingWithCopyWarning, and does not update the
# frame under pandas copy-on-write.
train.loc[train_age_missing, 'LearnedAge'] = age_implant_model.predict(X_age_empty_train)
test.loc[test_age_missing, 'LearnedAge'] = age_implant_model.predict(X_age_empty_test)

In [None]:
# Flag passengers whose (possibly predicted) age is 18 or below.
for frame in full_data:
    is_minor = frame['LearnedAge'] <= 18
    frame['IsChild'] = is_minor.apply(int)

In [None]:
# Distribution of the recorded ages only (rows with a missing Age are dropped).
sns.distplot(train['Age'].dropna())

Да видим как изглежда дистрибуцията на годините след вкарването на новите данни.

In [None]:
# Distribution of ages after the missing values were filled with model predictions.
sns.distplot(train['LearnedAge'])

Изглежда сме добавили доста хора на по 25-30 години.

## Нека да видим зависимостите в числовите фийчъри 

In [None]:
def corr_map(data_frame):
    """Draw an annotated heatmap of the pairwise correlations of ``data_frame``.

    The frame is cast to float before ``.corr()`` is computed, so every column
    must be convertible to float. Nothing is returned; the plot is drawn on a
    newly created 14x12 figure.
    """
    plt.figure(figsize=(14, 12))
    plt.title('Correlation of features', y=1.05, size=15)
    # NOTE(review): sns.set is called on the seaborn module, so the smaller font
    # scale presumably also applies to plots drawn after this call - confirm.
    sns.set(font_scale=0.8)
    correlations = data_frame.astype(float).corr()
    sns.heatmap(correlations, square=True, annot=True, cmap=plt.cm.viridis_r)

# Numeric columns of the training frame, including the engineered features.
numeric_cols = ['Pclass', 'LearnedAge', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone', 'IsChild']
corr_map(train[numeric_cols])

# Създаване на pipeline за енкодване и скалиране на данните

In [None]:
def _select_step(column):
    """Return the named 'select' step that picks ``column`` out of the frame."""
    return ('select', ItemSelector(column))


def _passthrough_branch(column):
    """Branch that selects ``column`` and passes it on unchanged."""
    return Pipeline([_select_step(column)])


def _scaled_branch(column):
    """Branch that selects ``column`` and runs it through a StandardScaler."""
    return Pipeline([
        _select_step(column),
        ('scaler', StandardScaler()),
    ])


def _categorical_branch(column):
    """Branch that selects ``column``, imputes missing values, then encodes it."""
    return Pipeline([
        _select_step(column),
        ('imputer', StringImputer()),
        ('encoder', StringLabelBinerizer()),
    ])


# Branches are listed in the order they are handed to the FeatureUnion.
feature_branches = [
    ('age', _scaled_branch('LearnedAge')),
    ('family_size', _scaled_branch('FamilySize')),
    ('is_alone', _passthrough_branch('IsAlone')),
    ('is_child', _passthrough_branch('IsChild')),
    ('sibsp', _scaled_branch('SibSp')),
    ('parch', _scaled_branch('Parch')),
    ('Fare', _scaled_branch('Fare')),
    ('gender', _categorical_branch('Sex')),
    ('embarked', _categorical_branch('Embarked')),
]

transformer = Pipeline([
    ('union', FeatureUnion(feature_branches)),
])

In [None]:
svc_model = SVC(random_state=88)
logistic_model = LogisticRegression(random_state=888)
forest_model = RandomForestClassifier(random_state=88)

# Fit the transformer on the training features and encode them in one go.
X = transformer.fit_transform(train.drop(['Survived'], axis=1))
y = train['Survived']

# Mean 5-fold cross-validation score of each untuned model on the encoded features.
labelled_models = [
    ('SVC model', svc_model),
    ('Logistic model', logistic_model),
    ('Random forest model', forest_model),
]
for label, estimator in labelled_models:
    scores = cross_val_score(estimator, X, train['Survived'], cv=5)
    print(label, scores.mean())

# Намаляване на броя параметри

In [None]:
# Fit an extra-trees classifier on the encoded features; its feature_importances_
# are plotted in the next cell.
feature_importance_model = ExtraTreesClassifier(random_state=88)
feature_importance_model.fit(X, y)

In [None]:
def plot_feature_importance(model, feature_names=None):
    """Plot the feature importances of a fitted model as a bar chart, ascending.

    Parameters
    ----------
    model
        Any fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        One label per feature, in the same order as ``feature_importances_``.
        When omitted, the bars are labelled with the positional feature index.

    Returns
    -------
    None
    """
    features = pd.DataFrame(
        {'importance': model.feature_importances_},
        index=feature_names,
    )
    # Sort into a new frame rather than in place, then plot the sorted result.
    ordered = features.sort_values(by=['importance'], ascending=True)
    ordered.plot(kind='bar', figsize=(20, 5))
    
# Bar chart of the extra-trees feature importances.
plot_feature_importance(feature_importance_model)

За съжаление пайплайнът ни е изплюл data-та без лейбъли >:(, лош пайплайн.

# Търсене на добри хиперпараметри

Първо да подготвим моделите и параметрите за които ще търсим.

In [None]:
# One stratified 3-fold split object, shared by all three searches.
cross_validation = StratifiedKFold(y, n_folds=3)

# Support vector classifier: a linear-kernel grid and an RBF-kernel grid.
svc_model = SVC(random_state=88)
svc_params = [
    dict(C=[1, 10], kernel=['linear']),
    dict(C=[1, 10], gamma=[0.001, 0.0001], kernel=['rbf']),
]
svc_grid_search = GridSearchCV(svc_model, param_grid=svc_params, cv=cross_validation)

# Logistic regression: search over C only.
logistic_model = LogisticRegression(random_state=888)
logistic_params = dict(C=[0.01, 0.1, 1, 10])
logistic_grid_search = GridSearchCV(logistic_model, param_grid=logistic_params, cv=cross_validation)

# Random forest: tree depth and forest size, with the entropy criterion.
forest_model = RandomForestClassifier(random_state=88)
forest_params = dict(
    max_depth=[5, 6],
    n_estimators=[200, 210],
    criterion=['entropy'],
)
forest_grid_search = GridSearchCV(forest_model, param_grid=forest_params, cv=cross_validation)

Пускаме моделите да се тренират, т.е. да търсят добри параметри. **(Тази стъпка ще отнеме повечко време)**

In [None]:
# Run the three hyper-parameter searches on the encoded training data.
svc_grid_search.fit(X, y)
logistic_grid_search.fit(X, y)
forest_grid_search.fit(X, y)

# Report each search under its own model name, with score and params labelled apart.
fitted_searches = [
    ('SVC', svc_grid_search),
    ('Logistic', logistic_grid_search),
    ('Forest', forest_grid_search),
]
for model_name, search in fitted_searches:
    print(model_name + ' score', search.best_score_)
    print(model_name + ' params', search.best_params_, end='\n\n')

Форест моделът изглежда най-обещаващ:

Forest score {'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 210}

# Събмитване в кагел

Избираме най-добрия модел с най-оптимизираните параметри и предиктваме с него. Полученият резултат събмитваме в кагел.

In [None]:
def save_predictions(predictions, input_path='data/titanic-test.csv',
                     output_path='data/titanic-test-output.csv'):
    """Write a submission CSV pairing each test PassengerId with its prediction.

    Parameters
    ----------
    predictions : array-like
        One predicted ``Survived`` value per row of the file at ``input_path``,
        in the same row order.
    input_path : str
        CSV file the ``PassengerId`` column is read from.
    output_path : str
        Destination of the submission CSV (columns ``PassengerId``, ``Survived``).

    Returns
    -------
    pandas.DataFrame
        The first rows of the written frame, so a notebook cell ending in this
        call shows a preview. Previously ``frame.head()`` was evaluated inside
        the function and its result discarded.
    """
    # Only the id column is needed from the test file.
    passenger_ids = pd.read_csv(input_path, usecols=['PassengerId'])['PassengerId']
    frame = pd.DataFrame({
        'PassengerId': passenger_ids,
        'Survived': predictions,
    }).set_index('PassengerId')
    frame.to_csv(output_path)
    return frame.head()

In [None]:
# Use the fitted random-forest grid search as the final model.
model = forest_grid_search

# Encode the Kaggle test set with the already-fitted transformer and predict.
# NOTE(review): X_test / y_test reuse the names of the baseline split above;
# here y_test holds predictions, not ground-truth labels.
X_test = transformer.transform(test)
y_test = model.predict(X_test)

In [None]:
# Write the predictions for the Kaggle test set to the submission CSV.
save_predictions(y_test)

Your submission scored 0.78468. Keep trying!

# TITENIC

![titenic](data/titenic.jpg)