In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## 填補 missing value：使用 pandas 填補

In [14]:
# Read the car-sales data set that still contains missing values,
# then preview its first ten rows.
missing_data_csv = "./data set/car-sales-extended-missing-data.csv"
car_sales_missing = pd.read_csv(missing_data_csv)
car_sales_missing.head(10)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
5,Honda,Red,42652.0,4.0,23883.0
6,Toyota,Blue,163453.0,4.0,8473.0
7,Honda,White,,4.0,20306.0
8,,White,130538.0,4.0,9374.0
9,Honda,Blue,51029.0,4.0,26683.0


- 查詢缺失值總數

In [15]:
# Tally how many values are missing in each column.
missing_per_column = car_sales_missing.isna().sum()
missing_per_column

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

- 針對【Make】，缺失值改用"missing"代替。
- 針對【Colour】，缺失值改用"missing"代替。

In [17]:
# Replace missing Make and Colour entries with the placeholder string "missing".
for text_column in ("Make", "Colour"):
    car_sales_missing[text_column] = car_sales_missing[text_column].fillna(value="missing")

In [18]:
# Re-count the missing values per column after filling Make and Colour.
missing_after_text_fill = car_sales_missing.isna().sum()
missing_after_text_fill

Make              0
Colour            0
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

- 針對【Doors】，我們知道一般車子的門數為4
- 那這邊改用【眾數】去填它

In [20]:
# NOTE: look up the mode first by counting how often each door count occurs.
door_counts = car_sales_missing["Doors"].value_counts()
door_counts

Doors
4.0    811
5.0     75
3.0     64
Name: count, dtype: int64

In [22]:
# Fill missing door counts with 4, the most frequent value in this column.
doors_filled = car_sales_missing["Doors"].fillna(value=4)
car_sales_missing["Doors"] = doors_filled
car_sales_missing.isna().sum()  # missing values left per column

Make              0
Colour            0
Odometer (KM)    50
Doors             0
Price            50
dtype: int64

- 針對【Odometer (KM)】，我們選用平均數去填

In [23]:
# Replace missing odometer readings with the mean of the column.
# The mean is taken before the fill, from the values that are present.
odometer_mean = car_sales_missing["Odometer (KM)"].mean()
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(value=odometer_mean)
car_sales_missing.isna().sum()  # missing values left per column

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

- 針對【Price】，我們就把缺失值刪掉

In [25]:
# Remove only the rows whose Price label is missing. Restricting dropna to the
# Price column keeps this step from silently discarding rows that have a gap
# in any other column (for example when an earlier fill cell was not run).
car_sales_missing = car_sales_missing.dropna(subset=["Price"])
print(car_sales_missing.isna().sum())  # missing values left per column
print()
# Also report how many rows remain after the missing-value handling.
print("剩餘資料：", len(car_sales_missing))


Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

剩餘資料 950


## 填補missing value：使用SimpleImputer(data imputation)
- SimpleImputer 是 scikit-learn 提供的一個工具，用於處理數據中的缺失值。
- 當數據集中出現 NaN（Not a Number）或其他缺失值時，SimpleImputer 可以根據指定的策略來填補這些值。
- 平均值（Mean）
- 中位數（Median）
- 最頻繁值（Most Frequent）
- 常數（指定固定值）
- 使用方式
```python
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")  # 填補缺失值的策略
imputer.fit(X)  # 學習數據中的統計量（如平均值）
X_filled = imputer.transform(X)  # 用學到的統計量填補缺失值
```

In [58]:
# Reload the raw CSV so this section starts again from the unfilled data,
# then report the missing values per column.
car_sales_missing = pd.read_csv("./data set/car-sales-extended-missing-data.csv")
print(car_sales_missing.isna().sum())  # missing values per column

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64


- 先處理 Price

In [59]:
# Drop, in place, every row that has no Price value, then show what is still
# missing in the remaining columns.
car_sales_missing.dropna(subset=["Price"], inplace=True)
missing_after_price_drop = car_sales_missing.isna().sum()
print(missing_after_price_drop)

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64


### split data

In [60]:
# Separate the target column (Price) from the feature columns.
y = car_sales_missing["Price"]
X = car_sales_missing.drop(columns="Price")

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (80% train / 20% test).
# A fixed random_state makes the shuffle, and therefore every downstream
# result, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

# View the data shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((760, 4), (190, 4), (760,), (190,))

In [49]:
from sklearn.compose import ColumnTransformer  # applies different transformers to different columns
from sklearn.impute import SimpleImputer  # fills in missing values

# NOTE: columns grouped by the fill strategy they receive
categorical_features = ["Make", "Colour"]
door_feature = ["Doors"]
numerical_feature = ["Odometer (KM)"]

# NOTE: one imputer per fill strategy
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")  # gaps become "missing"
door_imputer = SimpleImputer(strategy="constant", fill_value=4)  # gaps become 4
num_imputer = SimpleImputer(strategy="mean")  # gaps become the column mean

# NOTE: bind each imputer to the columns it should handle
imputation_steps = [
    ("cat_imputer", cat_imputer, categorical_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, numerical_feature),
]
imputer = ColumnTransformer(imputation_steps)


In [62]:
# Learn the fill values from the training split only, then apply those same
# values to both the training and the test split.
imputer.fit(X_train)
filled_X_train = imputer.transform(X_train)
filled_X_test = imputer.transform(X_test)

In [63]:
# Wrap the imputed arrays in DataFrames, naming the columns in the order the
# imputer's transformers were listed (Make, Colour, Doors, Odometer (KM)).
imputed_columns = ["Make", "Colour", "Doors", "Odometer (KM)"]
filled_X_train_df = pd.DataFrame(filled_X_train, columns=imputed_columns)
filled_X_test_df = pd.DataFrame(filled_X_test, columns=imputed_columns)

filled_X_train_df

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Toyota,Blue,4.0,126078.0
1,Nissan,Blue,4.0,141962.0
2,missing,White,4.0,113250.0
3,Honda,White,4.0,130783.0
4,Toyota,Blue,4.0,42459.0
...,...,...,...,...
755,Honda,Blue,4.0,40912.0
756,Honda,Green,4.0,37606.0
757,BMW,Blue,3.0,130817.0
758,BMW,missing,5.0,131528.194215


In [64]:
# Count the missing values per column in the imputed training set
train_missing = filled_X_train_df.isna().sum()
train_missing

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

- 接下來，一樣要針對字串部分進行編碼(one-hot encoding)

In [65]:
from sklearn.preprocessing import OneHotEncoder
# Converts each categorical column into one 0/1 indicator column per category
from sklearn.compose import ColumnTransformer


categorical_features = ["Make", "Colour", "Doors"]

# handle_unknown="ignore": a category that appears only in the test split is
# encoded as all zeros instead of making transform() raise a ValueError.
one_hot = OneHotEncoder(handle_unknown="ignore")
# NOTE: build the ColumnTransformer instance
transformer = ColumnTransformer(
    [
        ("one_hot",  # name of this transformer step
         one_hot,  # encode with OneHotEncoder
         categorical_features)  # columns to encode
    ],
    remainder="passthrough"  # keep the columns not listed above unchanged
)

# Fit the encoder on the training split only, then apply it to both splits
transformed_X_train = transformer.fit_transform(filled_X_train_df)
transformed_X_test = transformer.transform(filled_X_test_df)

print(transformed_X_train)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3040 stored elements and shape (760, 15)>
  Coords	Values
  (0, 3)	1.0
  (0, 6)	1.0
  (0, 12)	1.0
  (0, 14)	126078.0
  (1, 2)	1.0
  (1, 6)	1.0
  (1, 12)	1.0
  (1, 14)	141962.0
  (2, 4)	1.0
  (2, 9)	1.0
  (2, 12)	1.0
  (2, 14)	113250.0
  (3, 1)	1.0
  (3, 9)	1.0
  (3, 12)	1.0
  (3, 14)	130783.0
  (4, 3)	1.0
  (4, 6)	1.0
  (4, 12)	1.0
  (4, 14)	42459.0
  (5, 1)	1.0
  (5, 9)	1.0
  (5, 12)	1.0
  (5, 14)	171260.0
  (6, 3)	1.0
  :	:
  (753, 14)	131528.19421487604
  (754, 3)	1.0
  (754, 6)	1.0
  (754, 12)	1.0
  (754, 14)	38544.0
  (755, 1)	1.0
  (755, 6)	1.0
  (755, 12)	1.0
  (755, 14)	40912.0
  (756, 1)	1.0
  (756, 7)	1.0
  (756, 12)	1.0
  (756, 14)	37606.0
  (757, 0)	1.0
  (757, 6)	1.0
  (757, 11)	1.0
  (757, 14)	130817.0
  (758, 0)	1.0
  (758, 10)	1.0
  (758, 13)	1.0
  (758, 14)	131528.19421487604
  (759, 3)	1.0
  (759, 9)	1.0
  (759, 12)	1.0
  (759, 14)	26655.0


- 嘗試整合進模型~

In [66]:
from sklearn.ensemble import RandomForestRegressor

# Fix the seed so the forest's randomised training, and therefore the
# reported score, is reproducible from run to run.
model = RandomForestRegressor(random_state=42)

# Make sure to use the transformed data (filled and one-hot encoded X data)
model.fit(transformed_X_train, y_train)
model.score(transformed_X_test, y_test)  # R^2 on the held-out test split

0.17285125555888892