In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb
from catboost import CatBoostClassifier
%matplotlib inline

In [27]:
features = ["Age", "Pclass", "Sex_male", "Alone", "Fare", "Embarked_S", "Embarked_C", "Embarked_Q"]

def data_preprocessing(train, test):

    dataset = pd.concat([train, test], axis=0)

    # 家族人数を表すカラムを追加
    dataset['Family'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['Family'] == 0, 'Alone'] = 1
    dataset['Alone'] = dataset['Alone'].fillna(value=0)

    # Age, honorificの処理 *********************************************************************************
    # 最大分割回数 = 2で、名前を","または"."で分割
    dataset['honorific'] = dataset['Name'].str.split('[,.]', 2, expand = True)[1].str.strip()

    # データ可視化用の処理
    #hist_honorific = dataset[['honorific', 'Survived', 'PassengerId']].groupby(['honorific', 'Survived']).count().unstack()
    #hist_honorific.plot.bar(stacked=True)

    # 敬称でgroupbyし、平均年齢を算出
    average_age = dataset[['honorific', 'Age']].groupby(['honorific']).mean()

    # 置き換え用の一時カラム'abc'に、敬称に基づいた平均年齢を格納
    dataset['abc'] = dataset['honorific'].apply(lambda x: average_age.loc[x,'Age'])
    # 年齢がNaNについては敬称に基づいた平均年齢で補完
    dataset['Age'] = dataset['Age'].fillna(dataset['abc'])
    # 置き換え用の一時カラムを削除
    dataset = dataset.drop(columns = ['abc'])

    # 重要そうな敬称以外は'Other'に統一
    dataset['honorific'] = dataset['honorific'].where((dataset['honorific'] == 'Mr') | (dataset['honorific'] == 'Miss') | (dataset['honorific'] == 'Mrs') | (dataset['honorific'] == 'Master'), other = 'Other')
    
    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()
    dataset['honorific'] = encoder.fit_transform(dataset['honorific'])
    # **********************************************************************************************

    # Ageの欠損値にはpandaではなくscikit-learnのライブラリを使用
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
    dataset['Fare'] = imputer.fit_transform(dataset['Fare'].values.reshape(-1,1))
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
    dataset['Embarked'] = imputer.fit_transform(dataset['Embarked'].values.reshape(-1,1))

    dataset = pd.get_dummies(dataset, columns=['Embarked'], drop_first=False)
    # SexはLabelEncodingを用いると多重共線性が発生? OneHot Encodingへ変更
    dataset = pd.get_dummies(dataset, columns=['Sex'], drop_first=True)

    # Features Scaling
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    dataset['Age'] = scaler.fit_transform(dataset['Age'].values.reshape(-1,1))
    dataset['Fare'] = scaler.fit_transform(dataset['Fare'].values.reshape(-1,1))
    
    dataset_train = dataset.iloc[:train.shape[0], :]
    dataset_test = dataset.iloc[train.shape[0]:, :]

    return dataset_train, dataset_test

dataset_train = pd.read_csv('data/train.csv', sep=',')
dataset_test = pd.read_csv('data/test.csv', sep=',')
dataset_train, dataset_test = data_preprocessing(dataset_train, dataset_test)

X_train = dataset_train[features]
y_train = dataset_train['Survived']

Id = dataset_test['PassengerId']
X_test = dataset_test[features]


In [24]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logi_classifier = LogisticRegression()
logi_classifier.fit(X_train, y_train)
y_pred = logi_classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
def create_confusion_matrix(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(f'accuracy_score = {accuracy_score(y_test,y_pred):.5f}')

#create_confusion_matrix(y_test, y_pred)

In [54]:
# 過学習?のため結果はいまいち。EDAを見直す必要あり?

from sklearn.model_selection import train_test_split
X_fit, X_pred, y_fit, y_pred = train_test_split(X_train, y_train, test_size=0.2)

cat_classifier = CatBoostClassifier(iterations=5000, use_best_model=True, eval_metric='Accuracy', od_pval=0, learning_rate=0.0001, depth=10)
cat_classifier.fit(X_fit, y_fit, eval_set=(X_pred, y_pred))
prediction = cat_classifier.predict(X_pred)
cm = confusion_matrix(y_pred, prediction)
print(cm)
print(f'accuracy_score = {accuracy_score(y_pred,prediction):.5f}')

#y_pred = cat_classifier.predict(X_test)
#output = pd.DataFrame({'PassengerId': Id, 'Survived': y_pred})
#output.to_csv('my_submission.csv', index=False)
#print("Your submission was successfully saved!")


4776:	learn: 0.8426966	test: 0.8324022	best: 0.8491620 (2)	total: 42.9s	remaining: 2s
4777:	learn: 0.8426966	test: 0.8324022	best: 0.8491620 (2)	total: 42.9s	remaining: 1.99s
4778:	learn: 0.8426966	test: 0.8324022	best: 0.8491620 (2)	total: 42.9s	remaining: 1.98s
4779:	learn: 0.8426966	test: 0.8324022	best: 0.8491620 (2)	total: 42.9s	remaining: 1.98s
4780:	learn: 0.8426966	test: 0.8324022	best: 0.8491620 (2)	total: 42.9s	remaining: 1.97s
4781:	learn: 0.8426966	test: 0.8324022	best: 0.8491620 (2)	total: 43s	remaining: 1.96s
4782:	learn: 0.8426966	test: 0.8324022	best: 0.8491620 (2)	total: 43s	remaining: 1.95s
4783:	learn: 0.8426966	test: 0.8324022	best: 0.8491620 (2)	total: 43s	remaining: 1.94s
4784:	learn: 0.8426966	test: 0.8324022	best: 0.8491620 (2)	total: 43s	remaining: 1.93s
4785:	learn: 0.8426966	test: 0.8324022	best: 0.8491620 (2)	total: 43s	remaining: 1.92s
4786:	learn: 0.8426966	test: 0.8324022	best: 0.8491620 (2)	total: 43s	remaining: 1.91s
4787:	learn: 0.8426966	test: 0.8324

In [13]:
# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
k_classifier = KNeighborsClassifier(n_neighbors=2, metric='minkowski', p=1)
k_classifier.fit(X_train, y_train)
y_pred = k_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[137  11]
 [ 39  36]]
accuracy_score = 0.77578


In [14]:
# Support Vector Machine
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear', random_state=0)
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[131  17]
 [ 23  52]]
accuracy_score = 0.82063


In [15]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
naive_classifier = GaussianNB()
naive_classifier.fit(X_train, y_train)
y_pred = naive_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[128  20]
 [ 28  47]]
accuracy_score = 0.78475


In [39]:
# Random Forest Classification
from sklearn.ensemble import RandomForestClassifier
rndm_classifier = RandomForestClassifier(n_estimators=10, criterion='gini', bootstrap=False, max_features=2, min_samples_leaf=1, min_samples_split=20, random_state=0)
rndm_classifier.fit(X_train, y_train)
y_pred = rndm_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

NameError: name 'y_test' is not defined

In [221]:
# KFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=10, random_state=42)
result = cross_val_score = cross_val_score(RandomForestClassifier(), X_train, y_train, cv = kfold, scoring = 'accuracy')
print(result.mean())

0.8070104025327905


In [222]:
# grid search
'''
from sklearn.model_selection import GridSearchCV
grid_parameters = [
    {'n_estimators': [1, 2, 5, 10, 100, 1000], 
    'criterion': ['gini', 'entropy'],
    'max_features': [1, 2, 5, 10, 20],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10, 20],
    'bootstrap': [True, False],
    }
]

grid_search = GridSearchCV(RandomForestClassifier(), grid_parameters, cv=5, scoring='accuracy', n_jobs = -1)
grid_search.fit(X_train, y_train)
grid_search.best_params_

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-222-da4f042cd10a>, line 16)

In [18]:
# LightGBM
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)
parameter = {
    'objective': 'binary',
    'random_seed': 1234,
    'num_iterations': 100,
    'max_depth': -1,
    'num_leaves':20,
    'max_bin': 500,
    'min_data_in_leaf': 57
}
model = lgb.train(parameter, train_set=lgb_train, valid_sets=lgb_test, num_boost_round=200,
                    early_stopping_rounds=20, verbose_eval=10)


[LightGBM] [Info] Number of positive: 267, number of negative: 401
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 193
[LightGBM] [Info] Number of data points in the train set: 668, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.399701 -> initscore=-0.406713
[LightGBM] [Info] Start training from score -0.406713
Training until validation scores don't improve for 20 rounds
[10]	valid_0's binary_logloss: 0.465456
[20]	valid_0's binary_logloss: 0.433973
[30]	valid_0's binary_logloss: 0.4236
[40]	valid_0's binary_logloss: 0.416612
[50]	valid_0's binary_logloss: 0.417427
[60]	valid_0's binary_logloss: 0.416141
[70]	valid_0's binary_logloss: 0.414786
[80]	valid_0's binary_logloss: 0.413357
[90]	valid_0's binary_logloss: 0.413727
[100]	valid_0's binary_logloss: 0.413648
Did not meet early stopping. Best iteration is:
[86]	valid_0's binary_logloss: 0.411205


In [21]:
y_pred = model.predict(X_test)
metric = []
range_index = 10000
for x in range(0, range_index, 1):
    y_adjust = np.where(y_pred > x/range_index, 1, 0)
    metric.append(accuracy_score(y_test, y_adjust))
#create_confusion_matrix(y_test, y_pred)
print(f'\n\nmax accuracy is {max(metric):.4f} at {metric.index(max(metric))/range_index}')

y_pred = np.where(y_pred > 0.5429, 1, 0)
create_confusion_matrix(y_test, y_pred)



max accuracy is 0.8520 at 0.5429
[[142   6]
 [ 27  48]]
accuracy_score = 0.85202


In [223]:
# Predict
test_dataset = pd.read_csv('data/test.csv')
X, y, test_dataset = data_preprocessing(test_dataset)
y_pred = model.predict(X)
y_pred = np.where(y_pred > 0.4128, 1, 0)

output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Survived': y_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
