# Titanic competition with TensorFlow Decision Forests

## Import dependencies

In [None]:
import numpy as np
import pandas as pd
import os

import tensorflow as tf
import tensorflow_decision_forests as tfdf

print(f"Found TF-DF {tfdf.__version__}")

# If the required packages are not installed yet, uncomment and run the line below:
# (若尚未安裝以下套件，請先取消註解並執行下一行:)
# !pip install tensorflow tensorflow_decision_forests

## Load dataset

In [None]:
# Load the Titanic training and test (serving) datasets from CSV files.
train_df = pd.read_csv("../data/train.csv")
serving_df = pd.read_csv("../data/test.csv")

# Display the first 10 rows of the training data.
train_df.head(10)

## Prepare dataset

In [None]:
def preprocess(df):
    """Return a cleaned copy of a Titanic DataFrame with extra ticket columns.

    The caller's DataFrame is copied first and is never modified. On the copy:

    * ``Name``: every space-separated word has leading/trailing commas,
      parentheses, square brackets, periods and quote characters stripped.
    * ``Ticket_number``: the last whitespace-separated piece of ``Ticket``.
    * ``Ticket_item``: the pieces of ``Ticket`` before the last one, joined
      with ``_``; the literal string ``"NONE"`` when the ticket has exactly
      one piece.

    Args:
        df: DataFrame with string columns ``Name`` and ``Ticket``.

    Returns:
        A new DataFrame with ``Name`` rewritten and the two ticket columns
        appended.
    """
    # Characters removed from both ends of each word in a name.
    noise_chars = ",()[].\"'"

    def _clean_name(raw_name):
        # Split on single spaces (not arbitrary whitespace) and re-join.
        words = raw_name.split(" ")
        return " ".join(word.strip(noise_chars) for word in words)

    def _last_piece(raw_ticket):
        pieces = raw_ticket.split()
        return pieces[-1]

    def _leading_pieces(raw_ticket):
        pieces = raw_ticket.split()
        return "NONE" if len(pieces) == 1 else "_".join(pieces[:-1])

    cleaned = df.copy()
    ticket_column = cleaned["Ticket"]
    cleaned["Name"] = cleaned["Name"].apply(_clean_name)
    cleaned["Ticket_number"] = ticket_column.apply(_last_piece)
    cleaned["Ticket_item"] = ticket_column.apply(_leading_pieces)
    return cleaned

# Run preprocess() on train_df and serving_df separately to obtain two cleaned datasets.
preprocessed_train_df = preprocess(train_df)
preprocessed_serving_df = preprocess(serving_df)

# Display the first 5 rows of the preprocessed training data.
preprocessed_train_df.head(5)

In [None]:
# Start from every column of the preprocessed training data, then drop the
# columns that are not used as model inputs: the raw ticket string, the row
# identifier, and the label.
input_features = list(preprocessed_train_df.columns)
for excluded_column in ("Ticket", "PassengerId", "Survived"):
    input_features.remove(excluded_column)
# "Ticket_number" is currently kept as a feature (its removal is left disabled).

print(f"Input features: {input_features}")

#### `.columns`是什麼意思?
`.columns`是Pandas`DataFrame`的一個屬性，它會回傳這個資料表的**所有欄位名稱(column names)**，也就是CSV檔案的「表頭」。

## Convert Pandas dataset to TensorFlow Dataset

In [None]:
def tokenize_names(features, labels=None):
    """Split the "Name" feature into tokens; TF-DF can consume text tokens natively.

    Args:
        features: Mapping of feature name to value; its "Name" entry is
            replaced in place with the result of ``tf.strings.split``.
        labels: Optional labels, passed through unchanged (``None`` when the
            dataset has no label).

    Returns:
        The ``(features, labels)`` pair, with ``features["Name"]`` tokenized.
    """
    name_tokens = tf.strings.split(features["Name"])
    features["Name"] = name_tokens
    return features, labels

# Convert both preprocessed DataFrames to TensorFlow datasets and tokenize the
# "Name" feature of every element. Only the training set declares a label
# ("Survived"); the serving set is converted without one.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_train_df, label="Survived").map(tokenize_names)
serving_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_serving_df).map(tokenize_names)

### 第一段: 定義函數`tokenize_names`

- `def tokenize_names(...)`  
  這是定義一個函數(function)叫做`tokenize_names`，  
  它的用途是: **將每一筆資料的Name欄位進行文字分詞(tokenize)**。  
- `features`  
  這是輸入的特徵(欄位資料)，在TensorFlow Dataset中，一筆資料會長得像一個字典(dictionary)，例如:  
  ```
  features = {
      "Name": "Braund, Mr. Owen Harris",
      "Age": 22,
      "Sex": "male",
      ... }
  ```  
- `labels=None`  
  這是用來存放「標籤」(例如`Survived`)，  
  預設是`None`，代表測試資料可能沒有標籤。這樣設計能通用於train/test兩種資料。  
  
- `features["Name"]`  
  代表從features中取出`Name`欄位的值，可能是一個句子，例如`"Smith, Mrs. Emma"`。  

- `tf.strings.split(...)`  
  這是TensorFlow的字串函式，會**把字串依照空格自動切成多個片段**(tokens)：  
  ```  
  tf.strings.split("Smith, Mrs. Emma")  
  → <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'Smith,', b'Mrs.', b'Emma'], dtype=object)>  
  ```  

#### 第一段整行的意思: 
用空白切割每個名字，把原本的`"Smith, Mrs. Emma"`變成一個文字清單(tokens)，並**回寫進features["Name"]**。  

### 第二段: 建立`train_ds`訓練資料集
 
- `tfdf.keras.pd_dataframe_to_tf_dataset(...)`  
  這是TF-DF的API，作用是把Pandas的`DataFrame`轉成TensorFlow的`tf.data.Dataset`，這樣才能給模型用。  
- `preprocessed_train_df`  
  這是前面處理過的訓練資料(有做過`.apply(...)`等前處理)。  
- `label="Survived"`  
  告訴TF-DF: 這份資料的「標籤」欄位是`Survived`，模型會以這個欄位作為預測目標。  
- `.map(tokenize_names)`  
  這是TensorFlow的Dataset方法，會讓每一筆資料都經過`tokenize_names`函數處理一次 → 做名字切割。  

### 第三段: 建立`serving_ds`測試(預測用)資料集
- `preprocessed_serving_df`  
  這是處理後的測試資料 (沒有`Survived`欄位)。  
- **沒有label**  
  因為測試資料沒有`Survived`欄位可以提供，所以不需要指定`label`參數。  
- `.map(tokenize_names)`  
  一樣對`Name`欄位進行分詞處理。

### 總結邏輯流程:
|步驟|說明|
|:-|:-|
|`tokenize_names()`|處理每一筆資料，把Name拆成token|
|`.map(...)`|套用到整個Dataset|
|`pd_dataframe_to_tf_dataset(...)`|把Pandas資料框轉成TF-DF能用的Dataset格式|
|`train_ds / serving_ds`|建立兩個TensorFlow Dataset，後面會傳給模型訓練與預測使用|

## Train model with default parameters

In [None]:
# Part 1: build a Gradient Boosted Trees model.
model = tfdf.keras.GradientBoostedTreesModel(
    verbose=0,    # Very few logs (keeps the notebook output clean)
    features=[tfdf.keras.FeatureUsage(name=n) for n in input_features],
    exclude_non_specified_features=True,   # Only use the features listed in "features"
    random_seed=1234,
)

# Part 2: train the model and report its self-evaluation.
model.fit(train_ds)

# NOTE(review): the inspector's evaluation is the model's own self-evaluation,
# not a score on serving_ds — confirm in the TF-DF docs which data it is computed on.
self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

# Accuracy: fraction of examples the model predicts correctly.
# Loss: value of the loss function; smaller means better predictions.

### 語法說明:
|問題|是什麼|用來做什麼|
|:-|:-|:-|
|1. `GradientBoostedTreesModel()`|模型類型|建立GBT|
|2. `FeatureUsage()`|特徵指定物件|告訴模型要用哪些欄位|
|3. `exclude_non_specified_features`|選擇性參數|是否排除未指定的欄位|
|4. `random_seed`|隨機種子|讓結果可重現|
|5. `make_inspector().evaluation()`|模型評估|查看準確率與損失值|
|6. `.accuracy`/`.loss`|評估指標|評估訓練表現|

## Train model with improved default parameters

In [None]:
# Re-train the Gradient Boosted Trees model on the same feature list, this time
# overriding several hyper-parameters explicitly. This rebinds the global
# `model`. Lines that are commented out are alternatives kept for experiments.
model = tfdf.keras.GradientBoostedTreesModel(
    verbose=0,    # Very few logs
    features=[tfdf.keras.FeatureUsage(name=n) for n in input_features],
    exclude_non_specified_features=True,   # Only use the features in "features"

    #num_trees=2000,

    # Only for GBT.
    # A bit slower, but great to understand the model.
    # compute_permutation_variable_importance=True,

    # Alternative: use a predefined hyper-parameter template instead of the
    # explicit values below.
    # hyperparameter_template="benchmark_rank1@v1",

    #num_trees=1000,
    #tuner=tuner

    # Explicit hyper-parameter overrides in effect for this run.
    min_examples=1,
    categorical_algorithm="RANDOM",
    #max_depth=4,
    shrinkage=0.05,
    #num_candidate_attributes_ratio=0.2,
    split_axis="SPARSE_OBLIQUE",
    sparse_oblique_normalization="MIN_MAX",
    sparse_oblique_num_projections_exponent=2.0,
    num_trees=2000,
    #validation_ratio=0.0,
    random_seed=1234,

)
model.fit(train_ds)

# Report the self-evaluation of the re-trained model.
self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

In [None]:
# Print the summary report of the most recently trained `model`.
model.summary()

### 小結: 這段summary的用途是?
|想了解什麼|看哪裡?|
|:-|:-|
|使用哪些欄位?|Input Features|
|哪些欄位最重要?|Variable Importance|
|模型有幾棵樹?深度?|Number of trees、Depth by leafs|
|訓練過程的準確率、損失|Training logs|

## Make predictions

In [None]:
def prediction_to_kaggle_format(model, threshold=0.5):
    """Turn a model's predictions on ``serving_ds`` into a Kaggle submission table.

    Args:
        model: Object whose ``predict(serving_ds, verbose=0)`` result can be
            indexed with ``[:, 0]``.
        threshold: Values of that first column greater than or equal to this
            are encoded as 1, all others as 0.

    Returns:
        A DataFrame with the ``PassengerId`` column taken from the global
        ``serving_df`` and an integer ``Survived`` column.
    """
    raw_output = model.predict(serving_ds, verbose=0)
    survival_score = raw_output[:, 0]
    survived_flag = (survival_score >= threshold).astype(int)
    submission = pd.DataFrame({
        "PassengerId": serving_df["PassengerId"],
        "Survived": survived_flag,
    })
    return submission

def make_submission(kaggle_predictions):
    """Write the predictions to ``submission.csv`` and print where they went.

    The file goes to ``/kaggle/working/submission.csv`` when the
    ``/kaggle/working`` directory exists, and to ``../output/submission.csv``
    (relative to the current working directory) otherwise. The destination
    directory is created if it is missing.

    Args:
        kaggle_predictions: Object exposing ``to_csv(path, index=False)``,
            typically the DataFrame built by ``prediction_to_kaggle_format``.
    """
    import os

    # The previous check looked for "/kagle/working" (typo), so the Kaggle
    # branch could never be selected.
    if os.path.exists("/kaggle/working"):
        path = "/kaggle/working/submission.csv"
    else:
        path = "../output/submission.csv"

    # to_csv does not create missing parent directories; make sure it exists.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    kaggle_predictions.to_csv(path, index=False)
    print(f"Submission exported to {path}")

# Build the Kaggle-format predictions from the current `model` and export them.
kaggle_predictions = prediction_to_kaggle_format(model)
make_submission(kaggle_predictions)
# Notebook shell escape: preview the first lines of the exported file.
# NOTE(review): this path is hard-coded to the local output folder, so the
# preview will not find the file if the submission was written elsewhere.
!head ../output/submission.csv

## Training a model with hyperparameter tuning

In [None]:
# Configure a random-search tuner and declare the candidate values for each
# hyper-parameter it may pick from.
# NOTE(review): num_trials=1000 requests a large number of trials — expect a
# long run; confirm this is intended.
tuner = tfdf.tuner.RandomSearch(num_trials=1000)
tuner.choice("min_examples", [2, 5, 7, 10])
tuner.choice("categorical_algorithm", ["CART", "RANDOM"])

# Hyper-parameters declared on the object returned by choice() are attached to
# that specific choice: max_depth goes with the "LOCAL" growing strategy.
local_search_space = tuner.choice("growing_strategy", ["LOCAL"])
local_search_space.choice("max_depth", [3, 4, 5, 6, 8])

# merge=True is used when "growing_strategy" has already been declared above;
# max_num_nodes goes with the "BEST_FIRST_GLOBAL" strategy.
global_search_space = tuner.choice("growing_strategy", ["BEST_FIRST_GLOBAL"], merge=True)
global_search_space.choice("max_num_nodes", [16, 32, 64, 128, 256])

#tuner.choice("use_hessian_gain", [True, False])
tuner.choice("shrinkage", [0.02, 0.05, 0.10, 0.15])
tuner.choice("num_candidate_attributes_ratio", [0.2, 0.5, 0.9, 1.0])

# Same pattern for the split axis: the sparse-oblique settings are attached to
# the "SPARSE_OBLIQUE" choice only.
tuner.choice("split_axis", ["AXIS_ALIGNED"])
oblique_space = tuner.choice("split_axis", ["SPARSE_OBLIQUE"], merge=True)
oblique_space.choice("sparse_oblique_normalization",
                     ["NONE", "STANDARD_DEVIATION", "MIN_MAX"])
oblique_space.choice("sparse_oblique_weights", ["BINARY", "CONTINUOUS"])
oblique_space.choice("sparse_oblique_num_projections_exponent", [1.0, 1.5])

# Tune the model. Notice the `tuner=tuner`.
# NOTE(review): unlike the other models in this notebook, no `features=` list
# is passed here — confirm that using the model's default feature selection
# is intended.
tuned_model = tfdf.keras.GradientBoostedTreesModel(tuner=tuner)
tuned_model.fit(train_ds, verbose=0)

# Report the self-evaluation of the tuned model.
tuned_self_evaluation = tuned_model.make_inspector().evaluation()
print(f"Accuracy: {tuned_self_evaluation.accuracy} Loss: {tuned_self_evaluation.loss}")

## Making an ensemble

In [None]:
# Ensemble: train 100 Gradient Boosted Trees models that differ only in their
# random seed, average the first column of their predictions on serving_ds,
# and threshold the average at 0.5.
predictions = None
num_predictions = 0

for i in range(100):
    print(f"i:{i}")
    # Possible models: GradientBoostedTreesModel or RandomForestModel.
    # The hyper-parameter overrides tried in the single-model cell
    # (min_examples, categorical_algorithm, shrinkage, sparse oblique splits,
    # num_trees=2000, ...) are left disabled here.
    model = tfdf.keras.GradientBoostedTreesModel(
        verbose=0,  # Very few logs
        features=[
            tfdf.keras.FeatureUsage(name=column) for column in input_features
        ],
        exclude_non_specified_features=True,  # Only use the features in "features"
        random_seed=i,
        honest=True,
    )
    model.fit(train_ds)

    sub_predictions = model.predict(serving_ds, verbose=0)[:, 0]
    num_predictions += 1
    if predictions is not None:
        # Accumulate in place on top of the running sum.
        predictions += sub_predictions
        continue
    # First iteration: the running sum starts as this model's output.
    predictions = sub_predictions

# Turn the running sum into the mean prediction.
predictions /= num_predictions

kaggle_predictions = pd.DataFrame(
    {
        "PassengerId": serving_df["PassengerId"],
        "Survived": (predictions >= 0.5).astype(int),
    }
)

# Export the ensemble predictions through the shared helper.
make_submission(kaggle_predictions)

# NOTE(review): the lines below repeat the export inline (they check
# "/kaggle/working" directly and call to_csv again), so the file is written a
# second time — looks redundant with make_submission(); confirm before removing.
import os
if os.path.exists("/kaggle/working"):
    path = "/kaggle/working/submission.csv"
else:
    path = "../output/submission.csv"
kaggle_predictions.to_csv(path, index=False)

In [None]:
# Report the self-evaluation of `tuned_model` (the hyper-parameter-tuned
# model). Note that this is not an evaluation of the averaged ensemble.
evaluation = tuned_model.make_inspector().evaluation()
print(f"Accuracy: {evaluation.accuracy}")
print(f"Loss: {evaluation.loss}")
