<a href="https://colab.research.google.com/github/hui509/Titanic_Survival_Prediction/blob/main/%E5%B0%88%E9%A1%8C%E5%AF%A6%E4%BD%9C%EF%BD%9C%E9%90%B5%E9%81%94%E5%B0%BC%E8%99%9F%E5%AD%98%E6%B4%BB%E9%A0%90%E6%B8%AC%E5%88%86%E6%9E%90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **載入資料**

In [None]:
# Load the training set, the test set, and the sample submission file
import pandas as pd

train_url = 'https://raw.githubusercontent.com/hui509/Titanic_Survival_Prediction/main/raw_data/train.csv'
test_url = 'https://raw.githubusercontent.com/hui509/Titanic_Survival_Prediction/main/raw_data/test.csv'
submit_url = 'https://raw.githubusercontent.com/hui509/Titanic_Survival_Prediction/main/raw_data/gender_submission.csv'

# Read the three CSV files in the same order as the URLs above
train, test, submit = (
    pd.read_csv(csv_url) for csv_url in (train_url, test_url, submit_url)
)

# **認識資料**

1. 訓練集｜891 筆資料，12 個欄位，包含 Survived 欄位；其中 Age、Cabin、Embarked 有缺失值。

2. 測試集｜418 筆資料，11 個欄位，缺少 Survived 欄位；其中 Age、Fare、Cabin 有缺失值。

In [None]:
# Inspect the training set: print the DataFrame.info() summary for `train`
train.info()

In [None]:
# Inspect the test set: print the DataFrame.info() summary for `test`
test.info()

In [None]:
# Count the missing values in each column of the training set
train.isna().sum()

In [None]:
# Count the missing values in each column of the test set
test.isna().sum()

# **合併訓練集與測試集**

In [None]:
# Stack the training and test rows into a single frame so that every
# preprocessing step below is applied to both sets in the same way.
# pd.concat is the public API for this; DataFrame._append is a private method
# and may change or disappear without notice. ignore_index=True gives the
# combined frame a fresh 0..n-1 index (same result as reset_index(drop=True)).
data = pd.concat([train, test], ignore_index=True)

# **資料清理與型態轉換**

1. 缺失值｜Age、Fare 以平均數填補

2. 型態轉換｜Sex、Embarked







In [None]:
# Age and Fare are numeric, so their missing values are filled with the column
# mean (computed over the combined train + test rows held in `data`).
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on data['col']: that chained in-place form operates on
# an intermediate object, triggers a FutureWarning in pandas 2.x, and leaves
# `data` unchanged when Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

In [None]:
# One-hot encode Embarked with pandas.get_dummies: the Embarked column is
# replaced by one integer indicator column per category. dummy_na is left at
# its default (False), so no indicator column is created for missing values.
data = pd.get_dummies(
    data=data,
    columns=['Embarked'],
    dtype=int,
)

In [None]:
# Encode Sex as an integer feature: male -> 1, female -> 0.
# Series.map is used instead of replace because replacing strings with ints in
# an object column relies on implicit downcasting, which pandas has deprecated.
# Any label other than 'male'/'female' maps to NaN, and astype(int) then raises,
# so unexpected values surface here rather than inside the model.
data['Sex'] = data['Sex'].map({'male': 1, 'female': 0}).astype(int)

In [None]:
# Drop the columns the model cannot use directly.
# `columns=` already selects the axis, so the redundant axis=1 is removed, and
# the result is reassigned instead of mutating the frame in place.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# **模型分析**

In [None]:
# 載入模型
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Split the combined frame back apart: rows with a Survived value form the
# training set, rows where Survived is missing form the test set.
has_label = data['Survived'].notna()
Train = data[has_label].copy()
Test = data[~has_label].copy()

In [None]:
# X_Train holds the feature columns of the training set,
# Y_Train holds its label column (Survived).
Y_Train = Train['Survived']
X_Train = Train.drop(columns='Survived')

# Remove the (all-missing) Survived column from the test set, in place
Test.drop(columns='Survived', inplace=True)

In [None]:
# Fit a logistic regression on the full training set, then report the mean
# accuracy over 5-fold cross-validation.
model = LogisticRegression(max_iter=3000, random_state=0)
model.fit(X_Train, Y_Train)

fold_accuracies = cross_val_score(
    model,
    X_Train,
    Y_Train.values.ravel(),
    scoring='accuracy',
    cv=5,
)
scores = fold_accuracies.mean()
print(scores)

In [None]:
# Predict Survived for every row of the test set with the fitted model
Test_pred = model.predict(X=Test)
print(Test_pred)

In [None]:
# Build the submission file: write the predictions (cast to int) into the
# Survived column of the sample submission frame and save it as CSV.
submission_path = 'Titanic_LogisticRegression_raw.csv'
predicted_labels = Test_pred.astype(int)

submit['Survived'] = predicted_labels
submit.to_csv(submission_path, index=False)

print('預測結果：')
print(submit)

In [None]:
# Download the submission file to the local machine.
# google.colab only exists inside Google Colab; the import is guarded so that
# running this notebook elsewhere does not end with a ModuleNotFoundError.
try:
    from google.colab import files
except ImportError:
    print('Not running in Google Colab; skipping the browser download.')
else:
    files.download('Titanic_LogisticRegression_raw.csv')