In [1]:
import numpy as np
import pandas as pd

In [None]:
# 自動化資料清理

In [3]:
# Toy roster written one tuple per student; np.nan marks a missing age.
_FIELDS = ('姓名', '年齡', '性別', '讀書小時', '考試成績')
_ROWS = [
    ('Alice', 25, '女', 10, 85),
    ('Bob', np.nan, '男', 5, 60),
    ('Charlie', 30, '男', 2, 45),
    ('David', 45, '女', 20, 95),
    ('Eve', np.nan, '女', 15, 88),
]

# Pivot the rows into the column-name -> list-of-values mapping.
data = {field: list(values) for field, values in zip(_FIELDS, zip(*_ROWS))}

df_dirty = pd.DataFrame(data)
print("原始髒數據：")
df_dirty

原始髒數據：


Unnamed: 0,姓名,年齡,性別,讀書小時,考試成績
0,Alice,25.0,女,10,85
1,Bob,,男,5,60
2,Charlie,30.0,男,2,45
3,David,45.0,女,20,95
4,Eve,,女,15,88


In [4]:
# One-hot encode the gender column. drop_first=True drops the first category,
# so only a single indicator column is added in place of the original one.
df_clean = pd.get_dummies(
    df_dirty,
    columns=['性別'],
    drop_first=True,
)
df_clean

Unnamed: 0,姓名,年齡,讀書小時,考試成績,性別_男
0,Alice,25.0,10,85,False
1,Bob,,5,60,True
2,Charlie,30.0,2,45,True
3,David,45.0,20,95,False
4,Eve,,15,88,False


In [5]:
# Mean imputation: missing ages are replaced with the average of the known ages.
age_column = df_clean['年齡']
mean_age = age_column.mean()
df_clean['年齡'] = age_column.fillna(mean_age)

print("\n清理後的數據：")
df_clean


清理後的數據：


Unnamed: 0,姓名,年齡,讀書小時,考試成績,性別_男
0,Alice,25.0,10,85,False
1,Bob,33.333333,5,60,True
2,Charlie,30.0,2,45,True
3,David,45.0,20,95,False
4,Eve,33.333333,15,88,False


In [None]:
# 標準化 (Scaling)

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# Standardize the two numeric feature columns; the scaler is fitted on the
# same rows it then transforms.
cols_to_scal = ['年齡' , '讀書小時']
scaler = StandardScaler().fit(df_clean[cols_to_scal])
df_clean[cols_to_scal] = scaler.transform(df_clean[cols_to_scal])

print("\n最終標準化後的數據：")
df_clean


最終標準化後的數據：


Unnamed: 0,姓名,年齡,讀書小時,考試成績,性別_男
0,Alice,-1.265924,-0.061256,85,False
1,Bob,0.0,-0.826961,60,True
2,Charlie,-0.50637,-1.286384,45,True
3,David,1.772294,1.470153,95,False
4,Eve,0.0,0.704448,88,False


In [None]:
# 隨機森林 (Random Forest)

In [10]:
from sklearn.ensemble import RandomForestRegressor

In [11]:
# Feature matrix and target taken from the cleaned, scaled frame.
X_final = df_clean[['年齡' , '讀書小時' , '性別_男']]
y_final = df_clean['考試成績']

# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestRegressor(n_estimators=100 , random_state=42).fit(X_final , y_final)

# NOTE: this R2 is computed on the same rows the forest was fitted on, so it is
# an in-sample score, not an estimate of performance on unseen data.
train_r2 = rf_model.score(X_final, y_final)
print(f"隨機森林的 R2 分數: {train_r2:.4f}")

隨機森林的 R2 分數: 0.9639


In [12]:
# day25 & 26_隨機森林模型的「可解釋性(Interpretability)」& 分析分類資料

In [13]:
# Pair every input column with the importance the fitted forest assigned to it,
# then rank from most to least important.
feature_names = X_final.columns
importances = rf_model.feature_importances_

importance_df = (
    pd.DataFrame({'特徵': feature_names, '重要性': importances})
    .sort_values(by='重要性', ascending=False)
)

print("隨機森林認為的影響權重：")
importance_df

隨機森林認為的影響權重：


Unnamed: 0,特徵,重要性
1,讀書小時,0.515127
2,性別_男,0.377291
0,年齡,0.107583


In [None]:
# 自動化 Pipeline

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Column groups routed to each preprocessing branch.
numeric_features = ['年齡', '讀書小時']
categorical_features = ['性別']

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; handle_unknown='ignore' means categories
# not seen during fit do not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# End-to-end estimator: preprocessing followed by a random-forest regressor.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', regressor),
])

# Train directly on the raw (uncleaned) frame; the pipeline does the cleaning.
X_dirty = df_dirty[numeric_features + categorical_features]
y_dirty = df_dirty['考試成績']
clf.fit(X_dirty, y_dirty)

print("自動化工廠建立完成！現在你可以直接輸入原始資料進行預測。")

自動化工廠建立完成！現在你可以直接輸入原始資料進行預測。


In [17]:
# A single unseen student, written as one record with the raw column names
# the pipeline was fitted on.
test_student = pd.DataFrame([
    {'年齡': 35, '讀書小時': 12, '性別': '女'},
])

# The pipeline applies its own preprocessing before predicting.
Result = clf.predict(test_student)
print(Result)

[85.96]


In [None]:
# 交叉驗證 (Cross-Validation)

In [19]:
from sklearn.model_selection import cross_val_score

# Synthetic 100-student dataset; seeding the global NumPy RNG makes every draw
# below reproducible. The order of the random calls matters for the results.
np.random.seed(42)
big_data = {
    '年齡': np.random.randint(10, 81, size=100),
    '讀書小時': np.random.uniform(0, 20, size=100),
    '性別': np.random.choice(['男', '女'], size=100),
}
df_big_dirty = pd.DataFrame(big_data)

# Target = 25 points per study hour + a gender-dependent offset + Gaussian
# noise (standard deviation 10). Age does not enter the formula.
study_points = df_big_dirty['讀書小時'] * 25
gender_offset = df_big_dirty['性別'].map({'男': 10, '女': 15})
noise = np.random.normal(0, 10, size=100)
df_big_dirty['考試成績'] = study_points + gender_offset + noise

# Everything except the target column is a feature.
X_final_big = df_big_dirty.drop(columns='考試成績')
y_final_big = df_big_dirty['考試成績']

# 5-fold cross-validation of the whole preprocessing + model pipeline.
scores_final = cross_val_score(estimator=clf, X=X_final_big, y=y_final_big, cv=5)

print(f"百人數據的 5 次考驗分數：\n{scores_final}")
print(f"\n平均 R2 分數：{scores_final.mean():.4f}")

百人數據的 5 次考驗分數：
[0.9929171  0.99049935 0.98869996 0.98715366 0.9935535 ]

平均 R2 分數：0.9906


In [None]:
# 模型管理與存檔

In [21]:
import joblib

In [22]:
# cross_val_score only fits internal clones of the estimator, so `clf` itself
# still holds the fit from the earlier clf.fit(X_dirty, y_dirty) call on the
# five-row toy frame. Refit on the full 100-row dataset so that the pipeline
# written to disk is the one the cross-validation scores describe.
clf.fit(X_final_big, y_final_big)

# Persist the whole fitted pipeline (preprocessing + regressor) as one artifact.
joblib.dump(clf, 'super_stable_model_v2.pkl')
print("V2 版預訓練模型已存檔")

V2 版預訓練模型已存檔
