In [2]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

from random import randrange, choice





In [12]:
def cat2num(x):
    """Encode an 'embarked' value as an integer.

    'C' maps to 0 and 'Q' maps to 1. Every other value, including a
    missing value such as NaN or None, maps to 2.
    """
    for code, port in enumerate(('C', 'Q')):
        if x == port:
            return code
    # Nothing matched: use the catch-all code.
    return 2

def split4cv(data4split, k = 3):
    """Randomly partition a labelled frame into k folds for cross-validation.

    Parameters
    ----------
    data4split : pandas.DataFrame
        Must contain a 'survived' column, which is used as the target. All
        remaining columns are used as features.
    k : int, default 3
        Number of folds.

    Returns
    -------
    (k_folds, k)
        k_folds has the layout [[[X_rows, y_values]], ...]: every fold is a
        one-element list, so the features of fold i are k_folds[i][0][0] and
        its targets are k_folds[i][0][1]. k is returned unchanged.
    """
    X = list(data4split.drop('survived', axis=1).values)
    y = list(data4split['survived'].values)
    # Integer fold sizes that add up to exactly len(X). A fractional size
    # (len(X) / k) makes the early folds over-draw, so the last fold runs out
    # of rows and randrange(0) raises ValueError. The first `remainder` folds
    # take one extra row instead.
    base_size, remainder = divmod(len(X), k)
    k_folds = list() # k_folds = [fold1=[[X],[y]], fold2=[[X],[y]] ...]
    for i in range(k):
        fold_size = base_size + (1 if i < remainder else 0)
        tmp_X = list()
        tmp_y = list()
        for _ in range(fold_size):
            # Draw without replacement; X and y are popped at the same index
            # so every feature row stays paired with its target.
            index = randrange(len(X))
            tmp_X.append(X.pop(index))
            tmp_y.append(y.pop(index))
        k_folds.append([[tmp_X, tmp_y]])
    return k_folds, k
  
def cross_validation(data4cv,model,j):
    """Run k-fold cross-validation of `model` on `data4cv` and print the average risk.

    Parameters
    ----------
    data4cv : pandas.DataFrame
        Labelled data handed to split4cv, which requires a 'survived' column.
    model : estimator
        Object exposing fit(X, y) and predict(X); it is re-fitted once per fold.
    j : int
        Mode flag. 0 scores each fitted model on the folds it was trained on
        (training risk); any other value scores it on the held-out fold.

    Returns
    -------
    The per-fold risk averaged over the folds (the same value that is printed).

    Notes
    -----
    The per-fold risk is the SUM of squared differences between predictions
    and targets, not a mean, so it grows with the number of scored rows.
    Training and held-out values are therefore not directly comparable.
    """
    # Split the frame that was passed in. Reading the global train_set here
    # made the data4cv argument a no-op.
    k_folds, k = split4cv(data4cv)
    X = [k_folds[i][0][0] for i in range(k)]
    y = [k_folds[i][0][1] for i in range(k)]
    score_on_train = (j == 0)
    scores = []
    for i in range(k):
        # Train on every fold except fold i.
        X_train = np.concatenate([X[m] for m in range(k) if m != i])
        y_train = np.concatenate([y[m] for m in range(k) if m != i])
        model.fit(X_train, y_train)
        if score_on_train:
            X_eval, y_eval = X_train, y_train
        else:
            X_eval, y_eval = X[i], np.array(y[i])
        y_pred = model.predict(X_eval)
        scores.append(np.sum((y_pred - y_eval)**2))
    average_risk = np.mean(np.array(scores))
    if score_on_train:
        print('TRAIN AVERAGE EMPIRICAL RISK = ' + str(average_risk))
    else:
        print('TEST AVERAGE EMPIRICAL RISK = ' + str(average_risk))
    return average_risk
    

In [13]:

# Reading data
train_set = pd.read_csv('train.csv', index_col = 'PassengerId')
test_set = pd.read_csv('test.csv', index_col = 'PassengerId')

# Preprocessing dataset
# Delete some features
train_set = train_set.drop(['cabin', 'ticket', 'name'], axis=1)
test_set = test_set.drop(['cabin', 'ticket', 'name'], axis=1)

# Fill missing cells in 'age'.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection acts on an intermediate object and is not guaranteed to update
# the frame (it does not under pandas copy-on-write).
# The fill value is computed on the training data only and reused for the
# test data so that both frames are imputed identically.
age_fill_value = train_set['age'].mean()
train_set['age'] = train_set['age'].fillna(age_fill_value)
test_set['age'] = test_set['age'].fillna(age_fill_value)

# Encode 'embarked' as integers; cat2num also sends missing cells to its
# catch-all code, which is how missing 'embarked' values are filled.
train_set['embarked'] = train_set['embarked'].apply(cat2num)
test_set['embarked'] = test_set['embarked'].apply(cat2num)

# Encode categorical features to numerical.
# The encoder is fitted once, on the training data, and the same mapping is
# applied to both frames. Re-fitting on the test data could silently assign
# different integers if the set of categories differed.
label_encoder = LabelEncoder()
label_encoder.fit(train_set['sex'])
train_set['sex'] = label_encoder.transform(train_set['sex'])
test_set['sex'] = label_encoder.transform(test_set['sex'])

In [15]:
# Tuning the hyperparameters
print('Tuning the hyperparmeters:\n\n')
print('RandomForestClassifier')
n_forests = [8, 15, 30, 100, 1000, 3000] # n_estimators
n_level   = [5, 8, 10, 12, 25, 50]     # max_depth
# The two lists are walked in lockstep: six paired settings, not a full grid.
for n_trees, depth in zip(n_forests, n_level):
    # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is
    # deprecated and later removed in scikit-learn.
    randomforest = RandomForestClassifier(n_estimators = n_trees, criterion ='gini',
                               max_features = 'sqrt', min_samples_split=2, max_depth=depth, random_state=42, n_jobs=-1)
    print('n_estimators: ' + str(n_trees))
    print('max_depth:' + str(depth))
    # The third argument selects training-fold (0) or held-out-fold (1) risk.
    # Both are k-fold estimates on the labelled training data, so train_set
    # is passed to both calls.
    cross_validation(train_set,randomforest,0)
    cross_validation(train_set,randomforest,1)
    print('\n\n')
print('LogisticRegression')
# Same evaluation for every solver; only the solver name changes.
for solver_name in ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']:
    print('solver=' + solver_name)
    lr=LogisticRegression(solver=solver_name)
    cross_validation(train_set,lr,0)
    cross_validation(train_set,lr,1)
    print('\n\n')

Tuning the hyperparmeters:


RandomForestClassifier
n_estimators: 8
max_depth:5
TRAIN AVERAGE EMPIRICAL RISK = 91.33333333333333
TEST AVERAGE EMPIRICAL RISK = 58.333333333333336



n_estimators: 15
max_depth:8
TRAIN AVERAGE EMPIRICAL RISK = 53.333333333333336
TEST AVERAGE EMPIRICAL RISK = 56.666666666666664



n_estimators: 30
max_depth:10
TRAIN AVERAGE EMPIRICAL RISK = 28.333333333333332
TEST AVERAGE EMPIRICAL RISK = 52.333333333333336



n_estimators: 100
max_depth:12
TRAIN AVERAGE EMPIRICAL RISK = 17.666666666666668
TEST AVERAGE EMPIRICAL RISK = 56.333333333333336



n_estimators: 1000
max_depth:25
TRAIN AVERAGE EMPIRICAL RISK = 8.0
TEST AVERAGE EMPIRICAL RISK = 57.666666666666664



n_estimators: 3000
max_depth:50
TRAIN AVERAGE EMPIRICAL RISK = 8.333333333333334
TEST AVERAGE EMPIRICAL RISK = 55.666666666666664



LogisticRegression
solver=newton-cg
TRAIN AVERAGE EMPIRICAL RISK = 115.33333333333333
TEST AVERAGE EMPIRICAL RISK = 57.666666666666664



solver=lbfgs
TRAIN AVERAGE EMPIRI



TEST AVERAGE EMPIRICAL RISK = 93.0





In [18]:
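# The lowest empirical risk (measured on the training folds) is obtained with the RandomForest
# classifier using 1000 trees and a maximum depth of 25; for the LogisticRegression classifier
# the best value is obtained with solver=newton-cg.
# Note: in the output above, the lowest held-out ("TEST") risk for RandomForest is at
# 30 trees / depth 10, so the training-fold minimum alone may reflect overfitting.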