In [23]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb
%matplotlib inline

In [24]:
dataset = pd.read_csv('data/train.csv', sep=',')
#print(dataset)

#features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# notebookに倣ってfeaturesを絞ってみる
features = ["Pclass", "Sex_male", "SibSp", "Parch", "Fare", "Embarked_S", "Embarked_C", "Embarked_Q"]

# 欠損値の確認
#print(dataset.isnull().sum())
def data_preprocessing(dataset):
    # Embarkedの欠損率は約2%... 消すか --> 消さない！
    #dataset = dataset.dropna(subset=['Embarked'])

    # 家族人数を表すカラムを追加
    dataset['Family'] = dataset['SibSp'] + dataset['Parch']

    # Ageの欠損値にはpandaではなくscikit-learnのライブラリを使用
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
    dataset['Age'] = imputer.fit_transform(dataset['Age'].values.reshape(-1,1))
    dataset['Fare'] = imputer.fit_transform(dataset['Fare'].values.reshape(-1,1))
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
    dataset['Embarked'] = imputer.fit_transform(dataset['Embarked'].values.reshape(-1,1))

    # Sex column, Embarkedをエンコーディング
    '''
    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder()
    dataset['Embarked'] = label_encoder.fit_transform(dataset['Embarked'])
    dataset['Sex'] = label_encoder.fit_transform(dataset['Sex'])
    '''
    dataset = pd.get_dummies(dataset, columns=['Embarked'], drop_first=False)
    # SexはLabelEncodingを用いると多重共線性が発生? OneHot Encodingへ変更
    dataset = pd.get_dummies(dataset, columns=['Sex'], drop_first=True)

    # Features Scaling
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    dataset['Age'] = scaler.fit_transform(dataset['Age'].values.reshape(-1,1))
    dataset['Fare'] = scaler.fit_transform(dataset['Fare'].values.reshape(-1,1))


    X = dataset.loc[:,features].values
    if 'Survived' in dataset.columns:
        y = dataset.loc[:,'Survived'].values
    else:
        y = np.nan

    # それ以外の欠損はimpute
    X = imputer.fit_transform(X)
    
    return X, y, dataset

X, y, dataset = data_preprocessing(dataset)
print(dataset)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name       Age  SibSp  \
0                              Braund, Mr. Owen Harris -0.565736      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  0.663861      1   
2                               Heikkinen, Miss. Laina -0.258337      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  0.433312      1   
4                             Allen, Mr. William Henry  0.433312      0   
..                                                 ...       ...    ...   
886               

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [26]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logi_classifier = LogisticRegression()
logi_classifier.fit(X_train, y_train)
y_pred = logi_classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
def create_confusion_matrix(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(f'accuracy_score = {accuracy_score(y_test,y_pred):.5f}')

create_confusion_matrix(y_test, y_pred)

[[115  19]
 [ 26  63]]
accuracy_score = 0.79821


In [27]:
# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
k_classifier = KNeighborsClassifier(n_neighbors=2, metric='minkowski', p=1)
k_classifier.fit(X_train, y_train)
y_pred = k_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[119  15]
 [ 39  50]]
accuracy_score = 0.75785


In [28]:
# Support Vector Machine
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear', random_state=0)
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[111  23]
 [ 27  62]]
accuracy_score = 0.77578


In [29]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
naive_classifier = GaussianNB()
naive_classifier.fit(X_train, y_train)
y_pred = naive_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[110  24]
 [ 25  64]]
accuracy_score = 0.78027


In [30]:
# Random Forest Classification
from sklearn.ensemble import RandomForestClassifier
rndm_classifier = RandomForestClassifier(n_estimators=100, criterion='gini', bootstrap=False, max_features=2, min_samples_leaf=5, min_samples_split=10, random_state=0)
rndm_classifier.fit(X_train, y_train)
y_pred = rndm_classifier.predict(X_test)

create_confusion_matrix(y_test, y_pred)

[[122  12]
 [ 34  55]]
accuracy_score = 0.79372


In [31]:
# grid search
'''
from sklearn.model_selection import GridSearchCV
grid_parameters = [
    {'n_estimators': [1, 2, 5, 10, 100, 1000], 
    'criterion': ['gini', 'entropy'],
    'max_features': [1, 2, 5, 10, 20],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10, 20],
    'bootstrap': [True, False],
    }
]

grid_search = GridSearchCV(RandomForestClassifier(), grid_parameters, cv=5, scoring='accuracy', n_jobs = -1)
grid_search.fit(X_train, y_train)
grid_search.best_params_
'''

"\nfrom sklearn.model_selection import GridSearchCV\ngrid_parameters = [\n    {'n_estimators': [1, 2, 5, 10, 100, 1000], \n    'criterion': ['gini', 'entropy'],\n    'max_features': [1, 2, 5, 10, 20],\n    'min_samples_split': [1, 2, 5, 10, 20],\n    'min_samples_leaf': [1, 2, 5, 10, 20],\n    'bootstrap': [True, False],\n    }\n]\n\ngrid_search = GridSearchCV(RandomForestClassifier(), grid_parameters, cv=5, scoring='accuracy', n_jobs = -1)\ngrid_search.fit(X_train, y_train)\ngrid_search.best_params_\n"

In [47]:
# LightGBM
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)
parameter = {
    'objective': 'binary',
    'random_seed': 1234
}
model = lgb.train(parameter, train_set=lgb_train, valid_sets=lgb_test, num_boost_round=200,
                    early_stopping_rounds=20, verbose_eval=10)


[LightGBM] [Info] Number of positive: 253, number of negative: 415
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 145
[LightGBM] [Info] Number of data points in the train set: 668, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.378743 -> initscore=-0.494889
[LightGBM] [Info] Start training from score -0.494889
Training until validation scores don't improve for 20 rounds
[10]	valid_0's binary_logloss: 0.454588
[20]	valid_0's binary_logloss: 0.41203
[30]	valid_0's binary_logloss: 0.405831
[40]	valid_0's binary_logloss: 0.408208
Early stopping, best iteration is:
[28]	valid_0's binary_logloss: 0.40484
[1 1 1 1 0 0 1 1 0 1 1 1 1 0 1 1 0 0 0 1 0 1 0 1 0 0 1 1 0 0 0 0 1 1 1 1 0
 1 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 1
 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 1
 0 1 0 0 1 1 0 1 1 0 0 1 1 1 0 0 0 

In [55]:
y_pred = model.predict(X_test)
y_pred = np.where(y_pred > 0.3, 1, 0)
create_confusion_matrix(y_test, y_pred)

[[ 32 102]
 [ 78  11]]
accuracy_score = 0.19283


In [48]:
# Predict
test_dataset = pd.read_csv('data/test.csv')
X, y, test_dataset = data_preprocessing(test_dataset)
y_pred = rndm_classifier.predict(X)

output = pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Survived': y_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
