<a href="https://colab.research.google.com/github/ice4869/titanic-analysis/blob/main/%E9%90%B5%E9%81%94%E5%B0%BC%E8%99%9F%E5%AD%98%E6%B4%BB%E9%A0%90%E6%B8%AC%E5%B0%88%E9%A1%8C%E5%AF%A6%E4%BD%9C%EF%BC%883%EF%BC%89.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

鐵達尼號存活預測專題實作（3）

1. 請問資料中有哪些「欄位」你覺得是多餘的，在分析過程中應該被排除的呢？為什麼？

在分析過程中，可能有些欄位是多餘的或對分析結果沒有太大影響的。以下是一些可能被排除的欄位：

PassengerId：乘客ID，通常是用來標識乘客的唯一值，對生存與否沒有實際意義。

Name：乘客姓名，姓名本身對生存與否通常不具有直接的影響；不過其中的稱謂（例如 Mr.、Mrs.、Miss）可以在特徵工程時擷取成新變數（見第 4 題的 Title），擷取完之後即可排除原始欄位。

Ticket：船票號碼，對於生存與否沒有太大的關聯。

Cabin：客艙號碼，缺失值較多（891 筆資料中只有 204 筆有值），而且與生存與否的關聯度較低。

這些欄位可能在分析中不會提供有用的資訊，因此可以考慮在分析過程中將它們排除。



2. 針對「有缺失值」的欄位，嘗試不同的處理策略（例如：常數、中位數、平均數填補）比較結果。

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the Titanic training set.
df = pd.read_csv('https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv')

# Columns that contain missing values, grouped by kind. Age is numeric and
# Cabin is text; passing both to one SimpleImputer forces an object array, so
# Age would come back as object dtype (and the constant strategy would write
# the string 'Missing' into it). Each group is therefore imputed separately.
columns_with_missing = ['Age' , 'Cabin']
numeric_missing = ['Age']
text_missing = ['Cabin']

# Constant fill: a 'Missing' label for text, an out-of-range sentinel for Age.
df_constant = df.copy()
constant_imputer = SimpleImputer(strategy = 'constant' , fill_value = 'Missing')
df_constant[text_missing] = constant_imputer.fit_transform(df_constant[text_missing])
age_constant_imputer = SimpleImputer(strategy = 'constant' , fill_value = -1)
df_constant[numeric_missing] = age_constant_imputer.fit_transform(df_constant[numeric_missing])

# Most-frequent fill, again one dtype group at a time so Age stays numeric.
df_most_frequent = df.copy()
for column_group in (numeric_missing , text_missing):
  most_frequent_imputer = SimpleImputer(strategy = 'most_frequent')
  df_most_frequent[column_group] = most_frequent_imputer.fit_transform(df_most_frequent[column_group])

# Inspect the imputed frames.
print(df_constant.head())
print(df_most_frequent.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare    Cabin Embarked  
0      0         A/5 21171   7.2500  Missing        S  
1      0          PC 17599  71.2833      C85        C  
2      0  STON/O2. 3101282   7.9250  Missing        S  
3      0            113803  53.1000     C123        S  
4      0            373450   8.0500  Mis

3. 針對「非數值型」的欄位，嘗試不同的處理策略（例如：LabelEncoder 或 OneHotEncoder）比較結果。

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the Titanic training set.
df = pd.read_csv('https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv')

# Non-numeric columns to encode.
non_numeric_columns = ['Sex' , 'Embarked']

# Strategy 1: LabelEncoder. Each column gets its own encoder, kept in
# label_encoders, so the fitted classes of every column remain available.
df_label_encoded = df.copy()
label_encoders = {}
for column in non_numeric_columns:
  label_encoder = LabelEncoder()
  df_label_encoded[column] = label_encoder.fit_transform(df_label_encoded[column])
  label_encoders[column] = label_encoder

# Strategy 2: OneHotEncoder on the same columns; all other columns pass through.
df_one_hot_encoded = df.copy()
one_hot_encoder = ColumnTransformer(transformers = [('encoder' , OneHotEncoder(drop = 'first' ) , non_numeric_columns) ] , remainder='passthrough')
df_encoded = one_hot_encoder.fit_transform(df_one_hot_encoded)

# Rebuild a DataFrame from the encoded array. ColumnTransformer puts the
# encoder output first and appends the passthrough columns to its right, so
# the trailing names are every input column that was NOT encoded, in their
# original order (not simply "the columns after the first two").
column_names = one_hot_encoder.named_transformers_['encoder'].get_feature_names_out(non_numeric_columns)
passthrough_columns = [name for name in df_one_hot_encoded.columns if name not in non_numeric_columns]
df_one_hot_encoded = pd.DataFrame(df_encoded , columns = list(column_names) + passthrough_columns)


# Inspect the encoded frames.
print(df_label_encoded.head())
print(df_one_hot_encoded.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    1  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0      1      0   
2                             Heikkinen, Miss. Laina    0  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0      1      0   
4                           Allen, Mr. William Henry    1  35.0      0      0   

             Ticket     Fare Cabin  Embarked  
0         A/5 21171   7.2500   NaN         2  
1          PC 17599  71.2833   C85         0  
2  STON/O2. 3101282   7.9250   NaN         2  
3            113803  53.1000  C123         2  
4            373450   8.0500   NaN         2  
  Sex_male Emb

4. 請你嘗試不同的特徵工程手法，產生可以幫助分析結果的新變數。

In [15]:
import pandas as pd
import numpy as np

# Load the Titanic training set.
df = pd.read_csv('https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv')

# Work on an explicit copy so the raw frame `df` is left untouched.
data = df.copy()

# Print dtypes and non-null counts before cleaning.
data.info()

# Fill missing values. The result is assigned back to the column instead of
# calling fillna(inplace=True) on a selected column: that chained in-place
# form is deprecated in recent pandas and does not update `data` under
# Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
data['Cabin'] = data['Cabin'].fillna('None')

# Engineered features.
# Family_Size counts the passenger plus siblings/spouses and parents/children.
data['Family_Size'] = data['SibSp'] + data['Parch'] + 1
# IsAlone is 1 when the passenger has no relatives aboard, otherwise 0.
data['IsAlone'] = 0
data.loc[data['Family_Size'] == 1 , 'IsAlone'] = 1
# Title is the text between ", " and the first "." of the name.
data['Title'] = data['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
# Fare split into 4 quantile bins; Age (truncated to int) into 5 equal-width bins.
data['FareBin'] = pd.qcut(data['Fare'] , 4)
data['AgeBin'] = pd.cut(data['Age'].astype(int) , 5)

# Drop the columns that are not used as features.
data.drop(['Cabin'] , axis = 1 , inplace = True)
data.drop(['PassengerId' , 'Name' , 'Ticket'] , axis = 1 , inplace = True)
data.info()

# Expose the prepared frame as df_train for model training (same object as data).
df_train = data
df_train.info()

# Label-encode the feature columns of df_train.
from sklearn.preprocessing import LabelEncoder

# 'Survived' is listed first and skipped by the [1:] slice below, so the
# target column is left as it is.
columns_to_encode = ['Survived' , 'Age' , 'Embarked' , 'Family_Size' , 'Title' , 'FareBin' , 'AgeBin' , 'Pclass' , 'Sex','SibSp' ,'Parch' , 'IsAlone' ]

# Encode column by column with a fresh LabelEncoder each time. (The encoder
# is fitted on each column's values; fitting it on the list of column names
# has no effect on the data and was removed.)
for col in columns_to_encode[1:]:
  le = LabelEncoder()
  df_train[col] = le.fit_transform(df_train[col])
# Cross-validate a logistic regression model on df_train.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 'Survived' is the target; every other column of df_train is a feature.
columns_y = ['Survived']
columns_X = df_train.columns.tolist()
columns_X.remove('Survived')

train_X = df_train[columns_X]
train_y = df_train[columns_y]

# 5-fold cross-validated accuracy; the one-column target frame is flattened
# to a 1-D array before being passed to scikit-learn.
log = LogisticRegression(max_iter=3000, random_state=0)
scores = cross_val_score(
    log,
    train_X,
    train_y.to_numpy().ravel(),
    scoring='accuracy',
    cv=5,
)

print(scores)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Survived     891 non-null    int64   
 1   Pclas