In [1]:
import numpy 
import pandas

# Load train.csv and immediately narrow it to the columns this notebook uses.

##############################
# Data preprocessing
# Extract necessary items
##############################
# Keep the target 'Survived' and the features 'Pclass', 'Sex', 'Fare'
df = pandas.read_csv('train.csv')[['Survived', 'Pclass', 'Sex', 'Fare']]
df.head()

Unnamed: 0,Survived,Pclass,Sex,Fare
0,0,3,male,7.25
1,1,1,female,71.2833
2,1,3,female,7.925
3,1,1,female,53.1
4,0,3,male,8.05


In [3]:
##############################
# Data preprocessing 2
# Handle missing values
##############################

# Count the missing values in each column
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Fare        0
dtype: int64

In [4]:
##############################
# Data preprocessing
# Digitize labels
##############################
from sklearn.preprocessing import LabelEncoder
# Replace the 'Sex' strings with integer codes produced by a LabelEncoder.
# The fitted encoder stays bound to `encoder_sex` because test.csv is
# transformed with the same encoder further down.
encoder_sex = LabelEncoder()
df['Sex'] = encoder_sex.fit_transform(df['Sex'].to_numpy())
df.head()


Unnamed: 0,Survived,Pclass,Sex,Fare
0,0,3,1,7.25
1,1,1,0,71.2833
2,1,3,0,7.925
3,1,1,0,53.1
4,0,3,1,8.05


In [5]:
##############################
# Data preprocessing
# Standardize numbers
##############################
from sklearn.preprocessing import StandardScaler

# Fit the scaler on 'Pclass' and 'Fare' and wrap the scaled array in a frame.
# The fitted scaler stays bound to `standard`; it is reused for test.csv below.
standard = StandardScaler()
df_std = pandas.DataFrame(
    standard.fit_transform(df[['Pclass', 'Fare']]),
    columns=['Pclass', 'Fare'],
)

# Replace BOTH 'Pclass' and 'Fare' in df with their standardized versions
df = df.assign(Pclass=df_std['Pclass'], Fare=df_std['Fare'])

df.head()

Unnamed: 0,Survived,Pclass,Sex,Fare
0,0,0.827377,1,-0.502445
1,1,-1.566107,0,0.786845
2,1,0.827377,0,-0.488854
3,1,-1.566107,0,0.42073
4,0,0.827377,1,-0.486337


In [None]:
from sklearn.model_selection import train_test_split

# Target: 'Survived', kept as a single-column DataFrame.
# Features: every remaining column of df.
y = df[['Survived']]
x = df.drop('Survived', axis=1)

In [None]:
#######################################
# Split training data and test data
#######################################
# Hold out 30% of the rows; shuffling uses a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=1,
)
# Flatten the single-column targets into 1-D arrays
y_train, y_test = numpy.ravel(y_train), numpy.ravel(y_test)

In [None]:
#######################################
# Evaluate the model
#######################################
from sklearn.svm import LinearSVC
# Fit a linear SVM on the training split, then score it on the held-out split
model = LinearSVC(random_state=1).fit(x_train, y_train)
score = model.score(x_test, y_test)
score

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
###############################################
# Tuning RandomForestClassifier parameters with grid search
###############################################
# Base estimator handed to the search. The name `pipe_svc` is historical:
# this is a random forest, not an SVC pipeline.
pipe_svc = RandomForestClassifier(random_state=1)

# 2 * 5 * 4 * 4 * 2 = 320 parameter combinations, each cross-validated,
# so this cell is expensive to run.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [25, 100, 500, 1000, 2000],
    # scikit-learn reads a float here as a fraction of the samples (0.5 = 50%)
    # and an int as an absolute sample count.
    'min_samples_split': [0.5, 2, 4, 10],
    'min_samples_leaf': [1, 2, 4, 10],
    'bootstrap': [True, False],
}

# Search over the estimator defined above instead of building a second,
# identical one inline.
grid = GridSearchCV(estimator=pipe_svc, param_grid=param_grid)
grid = grid.fit(x_train, y_train)

# Best mean cross-validated score and the parameter set that achieved it
print(grid.best_score_)
print(grid.best_params_)

In [None]:
##############################
# Model building
##############################
from sklearn.ensemble import RandomForestClassifier

# Generate a random forest with explicitly chosen hyperparameters
model = RandomForestClassifier(
    criterion='entropy',
    n_estimators=500,
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=True,
    random_state=1,
)

##############################
# Training
##############################
# Flatten the target to 1-D, then fit on the full (x, y) data rather than
# on the x_train / y_train split.
y = numpy.ravel(y)
model.fit(x, y)

##############################
# Convert test.csv
##############################
# Load test.csv and replace missing 'Fare' values with 0
df_test = pandas.read_csv('test.csv').fillna({'Fare': 0})

# Set 'PassengerId' aside (it is combined with the predictions later)
df_test_index = df_test[['PassengerId']]

# Keep only the feature columns: 'Pclass', 'Sex', 'Fare'
df_test = df_test[['Pclass', 'Sex', 'Fare']]

# Standardize with the already-fitted scaler `standard` (transform only, no refit)
df_test_std = pandas.DataFrame(
    standard.transform(df_test[['Pclass', 'Fare']]),
    columns=['Pclass', 'Fare'],
)

# Swap in the standardized columns and label-encode 'Sex' with the
# already-fitted `encoder_sex`
df_test = df_test.assign(
    Pclass=df_test_std['Pclass'],
    Sex=encoder_sex.transform(df_test['Sex'].values),
    Fare=df_test_std['Fare'],
)

##############################
# Predict results
##############################
# `model` was fitted on a DataFrame, so predict on the DataFrame as well:
# scikit-learn (>= 1.0) warns "X does not have valid feature names" when a
# bare array is passed to an estimator fitted with column names, and passing
# the frame lets it verify that the columns match those seen during fit.
# `x_test` is still bound to the raw array, exactly as before.
# NOTE(review): x_test / y_test rebind the names produced by train_test_split;
# re-running the hold-out evaluation cell after this one would score against
# these predictions instead of the true labels.
x_test = df_test.values
y_test = model.predict(df_test)

# Combine the PassengerId frame and the predictions column-wise
df_output = pandas.concat(
    [df_test_index, pandas.DataFrame(y_test, columns=['Survived'])],
    axis=1,
)

# Write result.csv to the current directory (without the index column)
df_output.to_csv('result.csv', index=False)

In [None]:
# Display the submission frame: 'PassengerId' alongside the predicted 'Survived'
df_output