<a href="https://colab.research.google.com/github/iwatsuki-yuuki/Matsuo-GCI/blob/main/conpetion1%EF%BC%88%E6%8F%90%E5%87%BA%E7%94%A8%E3%82%B3%E3%83%BC%E3%83%89%EF%BC%89.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab) so the data files stored under MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Library imports
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# pandas deprecation and scikit-learn convergence warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# pandas-profilingを使ってみる
!pip install ydata-profiling

In [None]:
# Load the training and test data sets from the Google Drive data directory.
path = '/content/drive/MyDrive/Colab Notebooks/GCI/conp/'

df, df_test = (pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Get a first feel for the data: print the shape of the training frame, then the test frame.
for frame in (df, df_test):
    print(frame.shape)

In [None]:
# The training data contains the Perished column (whether the passenger died);
# predicting it is the task of this notebook.
df.head()

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# Check the dtype of each column.
df.dtypes

In [None]:
# Count missing values per column in the training data.
df.isnull().sum()

In [None]:
# Count missing values per column in the test data.
df_test.isnull().sum()

In [None]:
# Check the class balance of the target: number of rows for each Perished value.
df['Perished'].value_counts()

In [None]:
# Visualize the same counts as a pie chart with percentage labels.
df['Perished'].value_counts().plot.pie(figsize=(5,5), autopct='%1.1f%%')

In [None]:
# Imported here rather than in the top import cell because the package is installed
# by the pip cell, which runs after the top import cell.
from ydata_profiling import ProfileReport
# Build the profiling report for the training data
profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)

# Render the report inline in the notebook
profile.to_notebook_iframe()

In [None]:
# Extract the honorific (Mr, Mrs, Ms or Miss) from Name with a regular expression.
# The titles are then used to group passengers and compute a mean age per title,
# which fills the missing Age values.
# Names that contain none of these four titles get a missing Title.
df['Title'] = df['Name'].str.extract(r'\b(Mr|Mrs|Ms|Miss)\b', expand=False)
df.head()

In [None]:
# Mean Age per Title computed on the training data; used as the imputation lookup table.
title_age_means = df.groupby('Title')['Age'].mean()
title_age_means

In [None]:
# Impute missing ages
def fill_age(row):
    """Return the Age to use for one row.

    A present Age is returned unchanged. A missing Age is replaced by the mean age
    of the row's Title when that title has an entry in ``title_age_means``;
    otherwise the (missing) Age is returned as is.
    """
    age = row['Age']
    if pd.notnull(age):
        return age
    title = row['Title']
    return title_age_means[title] if title in title_age_means else age

# Apply the title-based imputation to every training row, then re-check the missing counts.
df['Age'] = df.apply(fill_age, axis=1)
df.isnull().sum()

In [None]:
# Fill the Age values that are still missing after the title-based fill with the overall mean Age.
all_mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(all_mean_age)
df.isnull().sum()

In [None]:
# Apply the same Age imputation to the test data, reusing the statistics computed on
# the training data (title_age_means, all_mean_age).
df_test['Title'] = df_test['Name'].str.extract(r'\b(Mr|Mrs|Ms|Miss)\b', expand=False)

# Title-based fill first (titles without an entry in title_age_means map to NaN and
# stay missing), then fall back to the overall training mean for whatever is left.
df_test['Age'] = (
    df_test['Age']
    .fillna(df_test['Title'].map(title_age_means))
    .fillna(all_mean_age)
)
df_test.isnull().sum()

In [None]:
# Cabin has many missing values; count missing (True) vs present (False) in the training data.
df['Cabin'].isnull().value_counts()

In [None]:
# Same check for the test data: missing (True) vs present (False) Cabin values.
df_test['Cabin'].isnull().value_counts()

In [None]:
# Cabin is missing for most rows, so remove the column from both data sets.
df = df.drop(columns=['Cabin'])
df_test = df_test.drop(columns=['Cabin'])
df.isnull().sum()

In [None]:
# Inspect the frequency of each Embarked value to choose a fill value for the missing ones.
df['Embarked'].value_counts()

In [None]:
# Fill every missing Embarked value with 'S'.
# The result is assigned back instead of calling fillna(inplace=True) on df['Embarked']:
# the in-place call acts on a temporary column selection, which emits a FutureWarning in
# pandas 2.x and leaves the frame unchanged once Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna('S')
df_test['Embarked'] = df_test['Embarked'].fillna('S')

df.isnull().sum()

In [None]:
# Test rows with a missing Fare must not be dropped (that would change the size of the
# test data), so the value is imputed with a mean instead.
# The mean is taken from the training data, like the Age imputation and the Fare scaler,
# so that no statistic is derived from the test set.
df_test['Fare'] = df_test['Fare'].fillna(df['Fare'].mean())
df_test.isnull().sum()

In [None]:
# Categorical data handling.
# Plan for the following cells: Fare is standardized, Sex (non-numeric) and Pclass
# (numeric but categorical) are expanded with get_dummies. This cell one-hot encodes Embarked.
# Train and test values are encoded together so both frames receive the same dummy
# columns, then the encoded rows are split back by position.
embarked = pd.concat([df['Embarked'], df_test['Embarked']])

embarked_ohe = pd.get_dummies(embarked)

# Split at the actual number of training rows rather than a hard-coded 891, so the
# cell stays correct if the training set size changes.
n_train = len(df)
embarked_ohe_train = embarked_ohe.iloc[:n_train]
embarked_ohe_test = embarked_ohe.iloc[n_train:]

df = pd.concat([df, embarked_ohe_train], axis=1)
df_test = pd.concat([df_test, embarked_ohe_test], axis=1)

# Reassign instead of drop(inplace=True) to keep the frame lineage explicit.
df = df.drop(columns=['Embarked'])
df_test = df_test.drop(columns=['Embarked'])
df.head()

In [None]:
# One-hot encode Sex (non-numeric) and Pclass (numeric but categorical) with get_dummies.
# NOTE(review): train and test are encoded separately, so the resulting columns match
# only if both frames contain the same Sex/Pclass categories — confirm.
df = pd.get_dummies(df, columns=['Sex', 'Pclass'])
df_test = pd.get_dummies(df_test, columns=['Sex', 'Pclass'])
df.head()

In [None]:
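# Handling of the ticket number was based on this article:
# https://qiita.com/seri28/items/ae98aa1965fc29cd864d
# The tickets could not be converted to numbers cleanly, so the Ticket column is left unused.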

In [None]:
# Standardize Fare
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training data only, then apply that fitted scaler to both frames.
scaler = StandardScaler().fit(df[['Fare']])

df['Fare'] = scaler.transform(df[['Fare']])
df_test['Fare'] = scaler.transform(df_test[['Fare']])
df.head()

In [None]:
# Convert every boolean column to 0/1 integers.
# Done for both frames: converting only the training frame leaves the test frame with
# bool columns and therefore a different dtype layout than the training frame.
bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)

test_bool_cols = df_test.select_dtypes(include='bool').columns
df_test[test_bool_cols] = df_test[test_bool_cols].astype(int)
df.head()

In [None]:
# List the available columns to decide which ones to use as explanatory variables.
df.columns

In [None]:
# Build the model inputs.
# Name, Ticket and Title are text columns and Perished is the target.
# PassengerId is excluded as well: it is the key column of the submission file and
# presumably a plain row identifier in train/test (TODO confirm), so it carries no
# information about the passenger and should not be fed to the models.
non_feature_cols = ['PassengerId', 'Name', 'Ticket', 'Title', 'Perished']
feature_cols = [col for col in df.columns if col not in non_feature_cols]

X = df[feature_cols].values
y = df['Perished']

# Select the test columns by name, in the training order, so that a missing column
# raises a KeyError instead of silently feeding misaligned features to the models.
X_test = df_test[feature_cols].values

In [None]:
# Hold-out split: 30% of the labelled rows are kept for validation, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Logistic regression
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Accuracy on the training split and on the hold-out validation split
# (the second line is labelled "Test Score" in the printed output).
lr_train_acc = round(lr.score(X_train, y_train), 3)
lr_valid_acc = round(lr.score(X_valid, y_valid), 3)
print('Train Score: {}'.format(lr_train_acc))
print(' Test Score: {}'.format(lr_valid_acc))

In [None]:
# Random forest (1): this fitted model is the one used by the ensemble cell.
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=1,
                             random_state=42, n_jobs=-1)
rfc.fit(X_train, y_train)

# Hyper-parameter candidates (also passed to the grid search cell).
param_grid = {'max_depth': [3, 5, 7],
              'min_samples_leaf': [1, 2, 4]}

# Manual sweep: fit one forest per combination and report train / validation accuracy.
depth_leaf_pairs = [(depth, leaf)
                    for depth in param_grid['max_depth']
                    for leaf in param_grid['min_samples_leaf']]
for max_depth, min_samples_leaf in depth_leaf_pairs:
    rfc_grid = RandomForestClassifier(n_estimators=100, max_depth=max_depth,
                                      min_samples_leaf=min_samples_leaf,
                                      random_state=42, n_jobs=-1)
    rfc_grid.fit(X_train, y_train)
    print('max_depth: {}, min_samples_leaf: {}'.format(max_depth, min_samples_leaf))
    train_acc = round(rfc_grid.score(X_train, y_train), 3)
    valid_acc = round(rfc_grid.score(X_valid, y_valid), 3)
    print('    Train Score: {}, Test Score: {}'.format(train_acc, valid_acc))

In [None]:
# Random forest tuned by cross-validation: 10-fold grid search over param_grid,
# run on the full labelled data (X, y).
rf_estimator = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rfc_gs = GridSearchCV(rf_estimator, param_grid, cv=10)
rfc_gs.fit(X, y)

best_params = rfc_gs.best_params_
best_cv_score = round(rfc_gs.best_score_, 3)
print('Best Parameters: {}'.format(best_params))
print('CV Score: {}'.format(best_cv_score))

In [None]:
# Multi-layer perceptron with hidden layers of 100, 100 and 10 units
mlpc = MLPClassifier(hidden_layer_sizes=(100, 100, 10), random_state=0).fit(X_train, y_train)

mlp_train_acc = round(mlpc.score(X_train, y_train), 3)
mlp_valid_acc = round(mlpc.score(X_valid, y_valid), 3)
print('Multilayer Perceptron \n')
print('Train Score: {}'.format(mlp_train_acc))
print(' Test Score: {}'.format(mlp_valid_acc))

In [None]:
# Ensembling: average the class probabilities of the random forest and the logistic regression.
# The MLP is not part of the average (its predict_proba call is disabled):
# mlpc_pred = mlpc.predict_proba(X_test)
rfc_pred = rfc.predict_proba(X_test)
lr_pred = lr.predict_proba(X_test)

pred_proba = 0.5 * (rfc_pred + lr_pred)
# The predicted label is the column index with the highest averaged probability.
pred = np.argmax(pred_proba, axis=1)

In [None]:
# Alternative without ensembling: use the logistic regression probabilities alone.
# Running this cell replaces the pred_proba / pred values produced by the ensemble cell.
pred_proba = lr_pred
pred = np.argmax(pred_proba, axis=1)

In [None]:
# Load the sample submission file, which serves as the template for the output.
path =  '/content/drive/MyDrive/Colab Notebooks/GCI/conp/'

submission = pd.read_csv(path + 'gender_submission.csv')
submission

In [None]:
# Check the number of predictions before writing them into the submission frame.
pred.shape

In [None]:
# Write the predictions into the Perished column of the template and preview the result.
submission['Perished'] = pred
submission

In [None]:
# Save the submission (with the Perished column) to Google Drive, without the index.
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/GCI/conp/submit03.csv', index=False)

In [None]:
# Reformat the submission for Kaggle, which uses a Survived column.
# Survived is the complement of Perished (Perished == 1 means the passenger died), so
# the labels are flipped; renaming the column alone would submit inverted predictions.
submission['Survived'] = 1 - submission['Perished']
submission = submission.drop(columns=['Perished'])

In [None]:
# Keep only the PassengerId and Survived columns, in that order.
submission = submission[['PassengerId', 'Survived']]

In [None]:
# Save the Kaggle-format submission to Google Drive, without the index.
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/GCI/conp/submit04.csv', index=False)