In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import re
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from sklearn import tree
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score

print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

## 1.データを確認

In [None]:
# Load the training CSV from the input directory.
train_data = pd.read_csv('../input/train.csv')

In [None]:
# Load the test CSV from the input directory.
test_data = pd.read_csv('../input/test.csv')

In [None]:
train_data.shape, test_data.shape

In [None]:
train_data.info()

In [None]:
test_data.info()

In [None]:
# Combine the training and test data into a single frame so that every
# imputation / encoding step below is applied to both.
# DataFrame.append was removed in pandas 2.0; pd.concat with the same
# ignore_index / sort arguments is the equivalent call and already returns
# a new frame, so no extra .copy() is needed.
all_data = pd.concat([train_data, test_data], ignore_index=True, sort=True)

In [None]:
all_data.info()

In [None]:
all_data.head()

## 2.欠損値を補完する

### 2.1 Embarkedを補完

In [None]:
all_data[all_data.Embarked.isnull()]

In [None]:
all_data[~all_data.Embarked.isnull()]

In [None]:
# Most frequent Embarked value(s) among the rows where Embarked is present.
embarked_known = all_data['Embarked'].dropna()
embarked_mode = embarked_known.mode().values
embarked_mode

In [None]:
# Fill the missing Embarked entries with the most frequent value
# (the original note states this is 'S').
most_common_port = embarked_mode[0]
all_data['Embarked'] = all_data['Embarked'].fillna(most_common_port)
all_data.info()

### 2.2 年齢を補完

In [None]:
# Extract the title from Name: the text between the first comma and the
# following period, with surrounding whitespace removed.
after_comma = all_data['Name'].str.split(',').map(lambda parts: parts[1])
before_period = after_comma.str.split('.').map(lambda parts: parts[0])
all_data['title'] = before_period.str.strip()

In [None]:
# Number of rows with a missing Age, per title.
missing_age_rows = all_data.loc[all_data['Age'].isnull()]
age_na_titles = missing_age_rows['title'].value_counts()
age_na_titles

In [None]:
# 年齢が欠損しているtitleがMrで結婚している行数
all_data[all_data.Age.isnull() & (all_data.SibSp >0) & (all_data.title == 'Mr')]

In [None]:
all_data[all_data.Age.isnull() & (all_data.SibSp == 0) & (all_data.Parch == 0) & (all_data.title == 'Mr')]

In [None]:
# Keep only the titles that have at least one missing Age, then count per
# title how many of those rows do have an Age.
titles_with_missing_age = list(age_na_titles.index)
age_na_title_df = all_data[all_data['title'].isin(titles_with_missing_age)]
age_na_title_df.loc[age_na_title_df['Age'].notnull(), 'title'].value_counts()

In [None]:

# Mean of the known ages per title, as a {title: mean_age} dict.
known_age_rows = age_na_title_df[age_na_title_df['Age'].notnull()]
age_mean = known_age_rows.groupby('title')['Age'].mean().to_dict()
age_mean

In [None]:
# Age histograms, one facet per title and coloured by Survived, with the
# title's mean age drawn as a dashed vertical line.
# FacetGrid creates its own figure, so no plt.figure() call precedes it
# (that call only produced an extra, empty figure).
g = sns.FacetGrid(age_na_title_df, col="title", hue='Survived')
g.map(plt.hist, "Age")
for ax in g.axes[0]:
    # The facet title is expected to look like "title = Mr"; take the token
    # after '=' as the key into age_mean.
    title = ax.get_title().split('=')[1].split()[0]
    ax.axvline(age_mean[title], ls='--')
plt.show()

In [None]:
# Fill each missing Age with the mean Age of that passenger's title.
# The missing-Age mask is computed once and used on both sides of the
# assignment; the former title value_counts() expression was dropped because
# its result was never displayed or stored.
age_missing = all_data['Age'].isna()
all_data.loc[age_missing, 'Age'] = all_data.loc[age_missing, 'title'].map(age_mean)

### 2.3 Fareを補完

In [None]:
all_data[all_data.Fare.isnull()]

In [None]:
# Median Fare of the rows with Pclass == 3, Embarked == 'S', Parch == 0 and
# SibSp == 0; used below to fill the missing Fare.
solo_third_class_from_s = (
    (all_data['Pclass'] == 3)
    & (all_data['Embarked'] == 'S')
    & (all_data['Parch'] == 0)
    & (all_data['SibSp'] == 0)
)
fare_median = all_data.loc[solo_third_class_from_s, 'Fare'].dropna().median()
fare_median

In [None]:
# 上のデータを見ると、74番目の Bing, Mr. Lee、1258番目の Riihivouri, Miss. Susanna Juhantytar "Sanni"、1273番目の Risien, Mrs. Samuel (Emma) の Fare が異常に高い。そこで、この三人の名前について調べる。
# all_data[all_data.Name.str.contains('Risien') | all_data.Name.str.contains('Riihivouri') | all_data.Name.str.contains('Lee')]


In [None]:
# 1258番目の Riihivouri, Miss. Susanna Juhantytar "Sanni" は一人で乗船しているが、Fare がかなり高い。ネットで調べたところ、Panula 家の近所の人で、Panula 家と一緒に乗船していたことが分かった。
# all_data[all_data.Name.str.contains('Risien') | all_data.Name.str.contains('Riihivouri') | all_data.Name.str.contains('Panula') | all_data.Name.str.contains('Lee')]

**6/10 夜ここまで**

In [None]:
# Replace the missing Fare values with the median computed above.
all_data['Fare'] = all_data['Fare'].fillna(fare_median)

In [None]:
all_data.info()

### 2.4 キャビンの欠損値

In [None]:
# Indicator feature: 1 when a Cabin value is recorded, 0 when it is missing.
all_data['has_cabin'] = all_data['Cabin'].notna().astype('int64')

In [None]:
'''
combine = [all_data]
for train in combine: 
    all_data['Cabin_Lett'] = all_data['Cabin'].apply(lambda x: str(x)[0]) 
    all_data['Cabin_Lett'] = all_data['Cabin_Lett'].apply(lambda x: str(x)) 
    all_data['Cabin_Lett'] = np.where(
        (all_data['Cabin_Lett']).isin(
            [ 'F', 'E', 'D', 'C', 'B', 'A']
        ),
        all_data['Cabin_Lett'],
        np.where(
            (all_data['Cabin_Lett']).isin(
                ['W', '4', '7', '6', 'L', '5', '8']
            ), 
            '0',
            '0')
    )
del all_data['Cabin'] 
all_data['Cabin_Lett']=all_data['Cabin_Lett'].replace("A",1).replace("B",2).replace("C",1).replace("0",0).replace("D",2).replace("E",2).replace("F",1)

'''

In [None]:
# all_data['Cabin_Lett']

In [None]:
# Bar chart per has_cabin value: total passenger count with the survivor count
# drawn over it (the visible remainder of the first bar is the non-survivors).
# Counts and survivor sums come from a single groupby so that x, y_all and
# y_survi share one ordering; unique(), value_counts() and groupby() each
# order their results differently and only matched by coincidence.
fig, ax = plt.subplots()
data = all_data[~all_data['Survived'].isna()]  # rows with a known label only
cabin_stats = data.groupby('has_cabin')['Survived'].agg(['size', 'sum'])
x = cabin_stats.index.tolist()
y_all = cabin_stats['size'].tolist()
y_survi = cabin_stats['sum'].tolist()
labels = x

ax.bar(x, y_all, tick_label=labels, label='died')
ax.bar(x, y_survi, label='survived')
ax.set_xlabel('has_cabin')
ax.set_ylabel('passengers')
ax.legend()
plt.show()

In [None]:
# The raw Cabin column is no longer needed once has_cabin exists.
all_data = all_data.drop(columns='Cabin')

In [None]:
all_data.info()

In [None]:
#cabinを削除
#all_data.drop(axis=1, columns=['Cabin'], inplace=True)

In [None]:
#nullでないCabinの頭文字を取る
#cabin = all_data[~all_data.Cabin.isna()].Cabin
#cabin.sample(10)

In [None]:
#reg = re.compile(r'^([A-Z])\d*')
#all_data['cabin_class'] = cabin.str.split(' ').map(lambda x: reg.match(x[0]).groups()[0])


**2019/6/13**

In [None]:
#家族がいる欠損値を家族のメンバーのCabin頭文字で埋める
#all_data[~all_data.cabin_class.isna()]

In [None]:
#Fareが同じのグループのCabin頭文字は同じ

In [None]:
#同じ港から登船した客の頭文字titleごとに近いはず

## 2－2　カテゴリ変数のエンコーディング

1. Ticketを処理

1.1 Ticketを数字だけのものとアルファベットが混ざるものに分離する

In [None]:
# Split the rows by ticket format: tickets that begin with digits versus
# tickets that begin with upper-case letters.
# Raw strings keep '\d' from being an invalid string escape, and .copy()
# makes both frames independent of all_data because later cells assign into
# their Ticket column.
number_ticket = all_data[all_data['Ticket'].str.match(r'\d+')].copy()
num_alpha_ticket = all_data[all_data['Ticket'].str.match(r'[A-Z]+.+')].copy()

1.2 数字だけのチケットの分布を確認

In [None]:
# Convert the digit-only ticket strings to integers.
# assign() builds a new frame instead of writing into number_ticket, which
# may still be a slice of all_data at this point.
number_ticket = number_ticket.assign(Ticket=number_ticket['Ticket'].astype(int))
number_ticket['Ticket'].head()

In [None]:
# Distribution of the numeric ticket numbers over their full range.
# Sort by ticket number (rebinding rather than sorting a possible slice in
# place) and plot the values against their rank: passing the Series itself
# would use its shuffled original index as the x axis.
number_ticket = number_ticket.sort_values('Ticket')
plt.figure()
plt.ylim(0, 3300000)
plt.plot(number_ticket['Ticket'].to_numpy(), '-o')
plt.xlabel('rank (sorted by ticket number)')
plt.ylabel('ticket number')
plt.show()

In [None]:
# Same plot, zoomed to ticket numbers below 500000.
# The values are plotted against their rank: passing the Series itself would
# use its shuffled original index as the x axis.
number_ticket = number_ticket.sort_values('Ticket')
plt.figure()
plt.ylim(0, 500000)
plt.plot(number_ticket['Ticket'].to_numpy(), '-o')
plt.xlabel('rank (sorted by ticket number)')
plt.ylabel('ticket number')
plt.show()

In [None]:
# Split the numeric tickets into five ranges by ticket number.
# Upper bounds are inclusive so the ranges are the same ones the encoding cell
# uses (<= 200000, <= 300000, <= 400000); with strict '<' a ticket exactly on
# a boundary belonged to no group.
# NOTE(review): numbers in (400000, 3000000] belong to no group here or in
# the encoding cell; confirm that no such tickets exist.
x = [1, 2, 3, 4, 5]
ticket_no = number_ticket['Ticket']
number_ticket_group1 = number_ticket[ticket_no <= 100000]
number_ticket_group2 = number_ticket[(ticket_no > 100000) & (ticket_no <= 200000)]
number_ticket_group3 = number_ticket[(ticket_no > 200000) & (ticket_no <= 300000)]
number_ticket_group4 = number_ticket[(ticket_no > 300000) & (ticket_no <= 400000)]
number_ticket_group5 = number_ticket[ticket_no > 3000000]

In [None]:
# Mean of Survived for each of the five numeric-ticket ranges, as a bar chart.
ticket_range_groups = (
    number_ticket_group1,
    number_ticket_group2,
    number_ticket_group3,
    number_ticket_group4,
    number_ticket_group5,
)
y = [group['Survived'].mean() for group in ticket_range_groups]
plt.figure()
plt.bar(x, y)
plt.xlabel('ticket number')
plt.ylabel('Survived')
plt.show()

1.3アルファベットが入ったチケット 

In [None]:
num_alpha_ticket.info()

In [None]:
num_alpha_ticket['Ticket'].str.split(' ').map(lambda x: x[0]).value_counts()

In [None]:
# Survival rate per alphabetic ticket prefix.
#
# Each ticket goes to the FIRST pattern it matches, in the same priority order
# the encoding cell applies its replacements, so the groups are mutually
# exclusive. Before, C_ticket also contained the CA tickets, SO_ticket also
# contained the SOTON tickets, and the "other" pattern (made only of optional
# groups followed by '.+') matched every ticket.
# Patterns are raw strings so that '\.' is not an invalid string escape.
prefix_patterns = [
    ('A', r'A.+'),
    ('CA', r'C\.*A\.*.+'),
    ('PC', r'PC.+'),
    ('PP', r'PP.+'),
    ('SOTON', r'SOTON.+'),
    ('STON', r'STON.+'),
    ('LINE', r'LINE.*'),
    ('FC', r'F\.C\.(C\.)*.+'),
    ('W', r'W.+'),
    ('C', r'C.+'),
    ('SC', r'S(\.)*C.+'),
    ('SO', r'S(\.)*O.+'),
]
ticket_text = num_alpha_ticket['Ticket']
ticket_groups = {}
unassigned = np.ones(len(num_alpha_ticket), dtype=bool)
for group_name, pattern in prefix_patterns:
    hit = ticket_text.str.match(pattern).to_numpy(dtype=bool) & unassigned
    ticket_groups[group_name] = num_alpha_ticket[hit]
    unassigned &= ~hit

# Keep the per-prefix frames under their original names for later cells.
A_ticket = ticket_groups['A']
CA_ticket = ticket_groups['CA']
PC_ticket = ticket_groups['PC']
PP_ticket = ticket_groups['PP']
SOTON_ticket = ticket_groups['SOTON']
STON_ticket = ticket_groups['STON']
LINE_ticket = ticket_groups['LINE']
FC_ticket = ticket_groups['FC']
W_ticket = ticket_groups['W']
C_ticket = ticket_groups['C']
SC_ticket = ticket_groups['SC']
SO_ticket = ticket_groups['SO']
# Everything that matched none of the prefixes above.
other_ticket = num_alpha_ticket[unassigned]

group_labels = [group_name for group_name, _ in prefix_patterns] + ['other']
x = [i for i in range(1, 14)]
y = [ticket_groups[group_name]['Survived'].mean() for group_name, _ in prefix_patterns]
y.append(other_ticket['Survived'].mean())
plt.figure()
plt.bar(x, y, tick_label=group_labels)
plt.xticks(rotation=45)
plt.ylabel('survived')
plt.show()

In [None]:
W_ticket['Ticket'].shape

In [None]:
# Replace every Ticket by an integer category code and rebuild all_data.

# Work on copies so the assignments below never write into a slice of all_data.
number_ticket = number_ticket.copy()
num_alpha_ticket = num_alpha_ticket.copy()

# Numeric tickets: one code per ticket-number range. Every code is <= 100000,
# so an already-coded row can only satisfy the first rule, which has run by
# the time the code is written.
# NOTE(review): numbers in (400000, 3000000] match no rule and keep their raw
# value; confirm that no such tickets exist.
number_ticket.loc[number_ticket['Ticket'] <= 100000, 'Ticket'] = 14
number_ticket.loc[(number_ticket['Ticket'] > 100000) & (number_ticket['Ticket'] <= 200000), 'Ticket'] = 15
number_ticket.loc[(number_ticket['Ticket'] > 200000) & (number_ticket['Ticket'] <= 300000), 'Ticket'] = 13
number_ticket.loc[(number_ticket['Ticket'] > 300000) & (number_ticket['Ticket'] <= 400000), 'Ticket'] = 5
number_ticket.loc[number_ticket['Ticket'] > 3000000, 'Ticket'] = 6

# Alphabetic tickets: (prefix pattern, code) pairs applied in order. Each
# pattern is matched against the CURRENT column, so a ticket keeps the code of
# the first pattern it matches (codes are digit strings and no pattern matches
# a string that starts with a digit). The last pattern is the catch-all for
# any remaining ticket starting with a non-digit.
# Patterns are raw strings so that '\.' and '\d' are not invalid escapes.
alpha_ticket_codes = [
    (r'A.+', "1"),
    (r'C\.*A\.*.+', "8"),
    (r'PC.+', "16"),
    (r'PP.+', "18"),
    (r'SOTON.+', "3"),
    (r'STON.+', "11"),
    (r'LINE.*', "7"),
    (r'F\.C\.(C\.)*.+', "17"),
    (r'W.+', "4"),
    (r'C.+', "9"),
    (r'S(\.)*C.+', "12"),
    (r'S(\.)*O.+', "2"),
    (r'[^\d](Fa)*(P/PP)*(S\.P)*(S\.*W)*.+', "10"),
]
for pattern, code in alpha_ticket_codes:
    num_alpha_ticket.loc[num_alpha_ticket['Ticket'].str.match(pattern), 'Ticket'] = code
num_alpha_ticket['Ticket'] = num_alpha_ticket['Ticket'].astype(int)

# all_data now consists of exactly the rows of the two ticket frames (numeric
# tickets first, in number_ticket's current order), keeping their index labels.
all_data = pd.concat([number_ticket, num_alpha_ticket])

**Fareの処理**

1. PclassごとにFareが同じ人（家族、または知り合い）を見つける

In [None]:
# Median Fare of the Pclass == 3 rows with Parch == 0 and SibSp == 0.
solo_pclass3 = (all_data['Pclass'] == 3) & (all_data['Parch'] == 0) & (all_data['SibSp'] == 0)
fare_median_Pclass3 = all_data.loc[solo_pclass3, 'Fare'].dropna().median()
fare_median_Pclass3

In [None]:
# Pclass == 3 rows whose Fare exceeds twice that median ("abnormal" fares).
fare_limit_Pclass3 = fare_median_Pclass3 * 2
fare_yijo_Pclass3 = all_data[(all_data['Pclass'] == 3) & (all_data['Fare'] > fare_limit_Pclass3)]
fare_yijo_Pclass3

In [None]:
# Median Fare of the Pclass == 2 rows with Parch == 0 and SibSp == 0.
solo_pclass2 = (all_data['Pclass'] == 2) & (all_data['Parch'] == 0) & (all_data['SibSp'] == 0)
fare_median_Pclass2 = all_data.loc[solo_pclass2, 'Fare'].dropna().median()
fare_median_Pclass2

In [None]:
# Pclass == 2 rows whose Fare exceeds twice that median ("abnormal" fares).
fare_limit_Pclass2 = fare_median_Pclass2 * 2
fare_yijo_Pclass2 = all_data[(all_data['Pclass'] == 2) & (all_data['Fare'] > fare_limit_Pclass2)]
fare_yijo_Pclass2.shape

In [None]:
# Median Fare of the Pclass == 1 rows with Parch == 0 and SibSp == 0.
solo_pclass1 = (all_data['Pclass'] == 1) & (all_data['Parch'] == 0) & (all_data['SibSp'] == 0)
fare_median_Pclass1 = all_data.loc[solo_pclass1, 'Fare'].dropna().median()
fare_median_Pclass1

In [None]:
# Pclass == 1 rows whose Fare exceeds twice that median ("abnormal" fares).
fare_limit_Pclass1 = fare_median_Pclass1 * 2
fare_yijo_Pclass1 = all_data[(all_data['Pclass'] == 1) & (all_data['Fare'] > fare_limit_Pclass1)]
fare_yijo_Pclass1

2. 異常Fareの中で、PclassごとにFareが同じ人をgroupbyする

In [None]:
# Pclass 3: count how many abnormal-fare rows share each Fare and attach that
# count to every such row as group_size.
fare_group_3 = (
    fare_yijo_Pclass3.groupby('Fare')['Name']
    .count()
    .reset_index()
    .rename(columns={'Name': 'group_size'})
)
fare_yijo_Pclass3 = fare_yijo_Pclass3.merge(fare_group_3, on='Fare').copy()
fare_yijo_Pclass3.info()

In [None]:
# Pclass 2: count how many abnormal-fare rows share each Fare and attach that
# count to every such row as group_size.
fare_group_2 = (
    fare_yijo_Pclass2.groupby('Fare')['Name']
    .count()
    .reset_index()
    .rename(columns={'Name': 'group_size'})
)
fare_yijo_Pclass2 = fare_yijo_Pclass2.merge(fare_group_2, on='Fare').copy()
fare_yijo_Pclass2.info()

In [None]:
# Pclass 1: count how many abnormal-fare rows share each Fare and attach that
# count to every such row as group_size.
fare_group_1 = (
    fare_yijo_Pclass1.groupby('Fare')['Name']
    .count()
    .reset_index()
    .rename(columns={'Name': 'group_size'})
)
fare_yijo_Pclass1 = fare_yijo_Pclass1.merge(fare_group_1, on='Fare').copy()
fare_yijo_Pclass1.info()

In [None]:
# Stack the three per-class abnormal-fare frames and keep one row per
# (Fare, Pclass) pair, so the frame can serve as a lookup of group_size.
abnormal_fare_frames = [fare_yijo_Pclass1, fare_yijo_Pclass2, fare_yijo_Pclass3]
fare_error = pd.concat(abnormal_fare_frames).drop_duplicates(subset=['Fare', 'Pclass'])

In [None]:
# Left-join the (Fare, Pclass) lookup onto all_data. Columns present in both
# frames come back from the right side with a '_drop' suffix and are removed
# again, so the join effectively adds only group_size.
merged = all_data.merge(fare_error, on=['Fare', 'Pclass'], how='left', suffixes=('', '_drop'))
duplicate_cols = merged.filter(regex='.*_drop', axis=1).columns
all_data = merged.drop(columns=list(duplicate_cols))

In [None]:
# Rows whose (Fare, Pclass) had no match in the left merge have no
# group_size; treat them as a group of one.
# Assign the result back instead of calling fillna(inplace=True) on the column
# accessor: that chained in-place form does not update the frame under pandas
# copy-on-write.
all_data['group_size'] = all_data['group_size'].fillna(1)

In [None]:
all_data

In [None]:
# Bar chart per group_size: total passenger count with the survivor count
# drawn over it (the visible remainder of the first bar is the non-survivors).
# Counts and survivor sums come from a single groupby so that x, y_all and
# y_survi share one ordering. Before, x followed order of appearance, y_all
# was sorted by frequency and y_survi by group size, so bars (and the ratios
# printed from these lists) were attributed to the wrong group size.
fig, ax = plt.subplots()
data = all_data[~all_data['Survived'].isna()]  # rows with a known label only
group_stats = data.groupby('group_size')['Survived'].agg(['size', 'sum'])
x = group_stats.index.tolist()
y_all = group_stats['size'].tolist()
y_survi = group_stats['sum'].tolist()
labels = x

ax.bar(x, y_all, tick_label=labels, label='died')
ax.bar(x, y_survi, label='survived')
ax.set_xlabel('group_size')
ax.set_ylabel('passengers')
ax.legend()
plt.show()

In [None]:
# Print survivors divided by passenger count for each entry of x.
for size, survivors, total in zip(x, y_survi, y_all):
    rate = survivors / total
    print(size, ':', rate)

In [None]:
# Bucket group_size into four 0/1 indicator columns:
# <= 2, 3-4, 5-8 and >= 9.
group_size = all_data['group_size']
groupDf = pd.DataFrame({
    'Group_Small': (group_size <= 2).astype('int64'),
    'Group_Middle1': group_size.between(3, 4).astype('int64'),
    'Group_Middle2': group_size.between(5, 8).astype('int64'),
    'Group_Large': (group_size >= 9).astype('int64'),
})
groupDf.head()

In [None]:
# Append the group-size indicator columns to all_data (column-wise concat,
# aligned on the index).
all_data=pd.concat([all_data,groupDf],axis=1)

In [None]:
# The raw group_size column is replaced by its indicator columns.
all_data = all_data.drop(columns=['group_size'])

In [None]:
all_data.head()

In [None]:
#all_data[(all_data.Pclass == 3) & (all_data.Embarked == 'S') & (all_data.Parch == 0) & (all_data.SibSp == 0) & (all_data.Fare < 10)]

In [None]:
#fare_mean2 = all_data[(all_data.Pclass == 3) & (all_data.Embarked == 'S') & (all_data.Parch == 0) & (all_data.SibSp == 0) & (all_data.Fare < 10)].Fare.dropna().mean()
#fare_mean2

In [None]:
#all_data['lastname'] = all_data.Name.str.split(',').map(lambda x: x[0]).str.strip()

In [None]:
#all_data['cabin_count'] = all_data[~all_data.Cabin.isna()].Cabin.str.split(' ').map(lambda x: len(x))

In [None]:
#notna_cabin = all_data[~all_data.Cabin.isna()]
#sns.distplot(notna_cabin.Fare)

In [None]:
# Fare, Pclass, cabin_countを利用してCabinタイプを推測するモデルを作って欠損値を設定
#sub_cabin = all_data[['Cabin', 'Fare', 'Pclass', 'cabin_count']].copy()

In [None]:
#sub_cabin_train = sub_cabin[~sub_cabin.Cabin.isna()]

In [None]:
#sub_cabin_pred = sub_cabin[sub_cabin.Cabin.isna()]

In [None]:
#pattern = r'(\[A-G])\d'
#result = re.match(pattern, string)

In [None]:
#sub_cabin_train.Cabin.str.split(' ').map(lambda x: x)

## 性別を数字化

In [None]:
# Encode Sex numerically: male -> 1, female -> 0.
sex_mapDict = dict(male=1, female=0)
all_data['Sex'] = all_data['Sex'].map(sex_mapDict)

## 乗船港をダミー化

In [None]:
# One-hot encode Embarked into Embarked_* columns.
embarkedDf = pd.get_dummies(all_data['Embarked'], prefix='Embarked')
embarkedDf.head()

In [None]:
# Swap the raw Embarked column for its dummy columns.
with_embarked_dummies = pd.concat([all_data, embarkedDf], axis=1)
all_data = with_embarked_dummies.drop(columns='Embarked')
all_data.head()

## Pclass（客室クラス）をダミー化

In [None]:
# One-hot encode Pclass into Pclass_* columns.
pclassDf = pd.get_dummies(all_data['Pclass'], prefix='Pclass')
pclassDf.head()

In [None]:
# Swap the raw Pclass column for its dummy columns.
with_pclass_dummies = pd.concat([all_data, pclassDf], axis=1)
all_data = with_pclass_dummies.drop(columns='Pclass')
all_data.head()

## Nameのタイトルをまとめる

In [None]:
"""
name1='Braund, Mr. Owen Harris'
str1=name1.split(',')[1]
#Mr.
str2=str1.split('.')[0]
str3=str2.strip()

def getTitle(name):
    str1=name.split(',')[1]
    str2=str1.split('.')[0]
    str3=str2.strip()
    
    return str3
titleDf =pd.DataFrame()
titleDf['Title']=train_data['Name'].map(getTitle)
titleDf.head()
"""

In [None]:
# Collapse the raw titles extracted from Name into six broader categories and
# one-hot encode them.
# The keys must match the extracted title exactly. The extraction keeps the
# original capitalisation, so the countess's title is "the Countess"; the
# former lower-case key never matched and left that row without any title
# dummy.
# NOTE: a title missing from this dict maps to NaN and therefore gets all-zero
# dummy columns.
title_mapDict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}
all_data['Title'] = all_data['title'].map(title_mapDict)
titleDf = pd.get_dummies(all_data['Title'])
titleDf.head()

In [None]:
# Add the title dummies and drop the text columns they were derived from.
with_title_dummies = pd.concat([all_data, titleDf], axis=1)
all_data = with_title_dummies.drop(columns=['Name', 'title', 'Title'])

In [None]:
all_data.head()

In [None]:
'''
familyDf=pd.DataFrame()
familyDf['FamilySize']=all_data['Parch']+ all_data['SibSp']+1
familyDf['Family_Single']=familyDf['FamilySize'].map(lambda s : 1 if s == 1 else 0)
familyDf['Family_Small'] =familyDf['FamilySize'].map(lambda s : 1 if 2 <= s <= 4 else 0)
familyDf['Family_Large'] =familyDf['FamilySize'].map(lambda s : 1 if 5 <= s else 0)
familyDf.head()
'''

In [None]:
#all_data=pd.concat([all_data,familyDf],axis=1)
#all_data.head()

In [None]:
#all_data.drop('Cabin',axis=1,inplace=True)

In [None]:
#all_data=pd.concat([all_data, groupDf], axis=1)


In [None]:
# Pairwise correlation matrix of all columns of all_data (pandas default
# method); the next cell reads its 'Survived' column.
corrDf_train=all_data.corr()
corrDf_train

In [None]:
corrDf_train['Survived'].sort_values(ascending=False)

In [None]:

# Modelling frame: all_data without PassengerId and without Embarked_Q.
excluded_from_model = ['PassengerId', 'Embarked_Q']
tree_df = all_data.drop(columns=excluded_from_model).copy()

In [None]:
# Rows with a Survived value are the training set; rows without one are the
# set to predict, with the empty Survived column removed.
has_label = tree_df['Survived'].notna()
tree_train_df = tree_df[has_label]
tree_test_df = tree_df[~has_label].drop(columns='Survived').copy()

In [None]:
tree_train_df.info()

In [None]:
tree_test_df.info()

In [None]:
# Feature column names: every training column except the target.
X_col = [column for column in tree_train_df.columns if column != 'Survived']
X_col

In [None]:
# Same feature list under the name used by the following cells.
x = [column for column in tree_train_df.columns.tolist() if column != 'Survived']
x

In [None]:
tree_train_df.shape

In [None]:
# Feature matrix and target vector for training.
tree_train_Y = tree_train_df['Survived']
tree_train_X = tree_train_df.loc[:, x]
print(tree_train_X.shape)

In [None]:
tree_train_X

In [None]:
# Hold out 20% of the labelled rows for validation.
# random_state is fixed (same seed as the random forest below) so the split,
# and therefore every reported score, is reproducible across runs.
t_train_X, t_test_X, t_train_y, t_test_y = train_test_split(
    tree_train_X, tree_train_Y, train_size=.8, random_state=1234
)

In [None]:
t_train_X.shape, t_test_X.shape

## ランダムフォレストモデルを作成

In [None]:

from sklearn.ensemble import RandomForestClassifier

# Grid-search a random forest over max_features, max_leaf_nodes and the split
# criterion (GridSearchCV's default cross-validation), then keep the best
# refitted estimator.
random_forest = RandomForestClassifier(random_state=1234, min_samples_leaf=5, min_samples_split=10, n_estimators=500)
parameters = {
    'max_features': [0.2, 0.3, 0.4, 0.5],
    'max_leaf_nodes': [10, 50, 100, 200],
    'criterion': ['gini', 'entropy']
}
clf = GridSearchCV(random_forest, parameters, n_jobs=4)
clf.fit(X=t_train_X, y=t_train_y)
random_forest = clf.best_estimator_
print(clf.best_score_, clf.best_params_)

# Evaluate on the held-out split.
# ROC / AUC need a continuous score: use the predicted probability of the
# positive class (second column of predict_proba). Hard 0/1 predictions give
# a curve with a single operating point and understate the AUC.
t_test_pred = random_forest.predict(t_test_X)
t_test_proba = random_forest.predict_proba(t_test_X)[:, 1]
fpr, tpr, thresholds = roc_curve(t_test_y, t_test_proba, pos_label=1)
print(roc_auc_score(t_test_y, t_test_proba))
plt.clf()
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show()

In [None]:
# Predict Survived for the unlabelled rows with the tuned random forest.
# tree_test_df is tree_df minus 'Survived', so its columns are in the same
# order as the training features.
pred_Y = random_forest.predict(tree_test_df)

## 決定木モデルを作成

In [None]:
'''parameters = {'max_depth':range(3,20)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, n_jobs=4)
clf.fit(X=t_train_X, y=t_train_y)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_) '''

In [None]:
'''predictions = tree_model.predict_proba(t_test_X)

print(roc_auc_score(t_test_y, predictions[:,1]))

fpr, tpr, _ = roc_curve(t_test_y, predictions[:,1])

plt.clf()
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show()'''

In [None]:
#pred_Y = tree_model.predict(tree_test_df)

In [None]:
# Cast the predictions to integers and collect the PassengerId of every
# unlabelled row of all_data.
pred_Y = pred_Y.astype(int)
is_unlabelled = all_data['Survived'].isna()
passenger_id = all_data.loc[is_unlabelled, 'PassengerId']

In [None]:
# Submission frame: one row per unlabelled passenger with its prediction.
submission_columns = {'PassengerId': passenger_id, 'Survived': pred_Y}
predDF = pd.DataFrame(submission_columns)

In [None]:
# Write the predictions to titanic_pred.csv in the working directory,
# without the index column.
predDF.to_csv('titanic_pred.csv', index = False)

In [None]:
"""
full_data_X=pd.concat([titleDf,
                       pclassDf,
                       familyDf,
                       train_data['Fare'],
                       embarkedDf,
                       train_data['Sex'],
                       train_data['Parch']
                       ],axis=1)
full_data_X.head()
"""

In [None]:
#titleDf.shape

In [None]:
#full_data_X.shape
#train_data_X = full_data_X.loc[0:890, :]
#pred_X = full_data_X.loc[891:,:]
#train_data_Y = train_data.loc[0:890, 'Survived']

In [None]:
#pred_X['Fare']=pred_X['Fare'].fillna(full_data_X['Fare'].mean())

In [None]:
#pred_X.info()

In [None]:
#from sklearn.model_selection import train_test_split

In [None]:
#train_X, test_X, train_y, test_y=train_test_split(train_data_X, train_data_Y, train_size=.8)

In [None]:
#from sklearn.linear_model import LogisticRegression
#model=LogisticRegression()
#model.fit(train_X, train_y)

In [None]:
#model.score(test_X, test_y)

In [None]:
#pred_Y = model.predict(pred_X)

In [None]:
#pred_Y=pred_Y.astype(int)
#passenger_id = train_data.loc[891:, 'PassengerId']

In [None]:
"""
predDF = pd.DataFrame(
    {
        'PassengerId': passenger_id,
        'Survived': pred_Y
    }
)
"""

In [None]:
#predDF.to_csv('titanic_pred.csv', index = False)

In [None]:
#top_title = all_data[all_data.title.isin(list(all_data[all_data.Age.isnull()].title.value_counts().index))]