# Standard Approach for Kaggle 

## import

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import re as re

from sklearn import tree

## read csv

In [2]:
train = pd.read_csv('../csv/train.csv')
test = pd.read_csv('../csv/test.csv')

### confirming shape

In [5]:
train_shape = train.shape
test_shape = test.shape

In [6]:
print(train_shape)
print(test_shape)

(891, 12)
(418, 11)


## Confirming null values

In [9]:
def null_table(df):
    null_val = df.isnull().sum()
    null_ratio = null_val/len(df) * 100
    null_table = pd.concat([null_val, null_ratio], axis=1)
    null_table_ren_columns = null_table.rename(
    columns = {0 : "Null value", 1 : "%"})
    return null_table_ren_columns

In [10]:
train_null = null_table(train)
test_null = null_table(test)
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
test_null

Unnamed: 0,Null value,%
PassengerId,0,0.0
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
Age,86,20.574163
SibSp,0,0.0
Parch,0,0.0
Ticket,0,0.0
Fare,1,0.239234
Cabin,327,78.229665


## Pre-processing

In [12]:
train["Age"] = train["Age"].fillna(train["Age"].median()) #中央値を代入(1)
test["Age"] = test["Age"].fillna(test["Age"].median())
train["Embarked"] = train["Embarked"].fillna("S") #最頻出のSを代入(1)
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

In [13]:
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2

test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Sex"][train["Sex"] == "male"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Sex"][train["Sex"] == "female"] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Embarked"][train["Embarked"] == "S"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Embarked"][train["Embarked"] 

## Predictive model

### Decision tree

In [44]:
#trainの目的変数と説明変数の値を取得
target = train["Survived"].values
train_features_one = train[["Pclass", "Sex", "Age", "Fare"]].values
train_features_two = train[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "Embarked"]].values

#決定木の作成
max_depth = 10
min_samples_split = 5
my_tree_one = tree.DecisionTreeClassifier() #とりまデフォルト設定 
my_tree_two = tree.DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split) #過学習抑制

my_tree_one = my_tree_one.fit(train_features_one, target)
my_tree_two = my_tree_two.fit(train_features_two, target)

#testの説明変数の値を取得
test_features_one = test[["Pclass", "Sex", "Age", "Fare"]].values
test_features_two = test[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "Embarked"]].values

#testの説明変数を使ってmy_tree_oneの構築モデルで予測する
predict_one = my_tree_one.predict(test_features_one)
predict_two = my_tree_two.predict(test_features_two)

## save

In [46]:
#PassengerIdを取得
PassengerId = np.array(test["PassengerId"]).astype(int)

#predictionとPassengerIdをDataFrameに落とし込む
my_solution_one = pd.DataFrame(predict_one, PassengerId, columns=["Survived"])
my_solution_two = pd.DataFrame(predict_two, PassengerId, columns=["Survived"])

#save
my_solution_one.to_csv("my_tree_one.csv", index_label=["PassengerId"])
my_solution_two.to_csv("my_tree_two.csv", index_label=["PassengerId"])

Unnamed: 0,Survived
892,0
893,0
894,1
895,1
896,1
...,...
1305,0
1306,1
1307,0
1308,0
