In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
import torch

In [2]:
# Load the public test set.
path = 'D:/永豐攻房戰/30_Public Dataset_Public Sumission Template_v2/public_dataset.csv'
public_df = pd.read_csv(path)

# Target transform: log1p forward, expm1 inverse (used later to undo the log on predictions).
scaler_y = FunctionTransformer(np.log1p, np.expm1)

# Split the rows by the city/county column ('縣市'): one frame for each of the
# six main cities, plus one frame holding every other city/county.
city_group = public_df.groupby('縣市')
main_six = ['台北市', '新北市', '桃園市', '台中市', '台南市', '高雄市']
taipei_df, newtaipei_df, taoyuan_df, taichung_df, tainan_df, kaoshung_df = (
    city_group.get_group(city) for city in main_six
)
others_df = pd.concat(
    [city_group.get_group(group) for group in city_group.groups if group not in main_six]
)

# Same order as `names` below; also used later to look up the IDs of each group.
df_lst = [taipei_df, newtaipei_df, taoyuan_df, taichung_df, tainan_df, kaoshung_df, others_df]

# Report the number of rows in every group.
for label, frame in zip(main_six + ['其他'], df_lst):
    print(f'{label}:', len(frame))

# Standardize the 12 numeric features of each group and store the result in a
# dict keyed by group name; each entry is the input of the matching network.
# NOTE(review): the scaler is fit on the public data of each group itself. If
# the networks were trained on features scaled with training-set statistics,
# the inputs here are scaled differently — confirm against the training code.
input_dict = {}
names = ['taipei', 'newtaipei', 'taoyuan', 'taichung', 'tainan', 'kaoshung', 'others']
feature_cols = [
    '土地面積', '移轉層次', '總樓層數', '屋齡', '建物面積', '車位面積',
    '車位個數', '橫坐標', '縱坐標', '主建物面積', '陽台面積', '附屬建物面積',
]  # 12 features
for name, df in zip(names, df_lst):
    numeric_data = df[feature_cols].to_numpy()
    scaler_X = StandardScaler()
    X_scaled = scaler_X.fit(numeric_data).transform(numeric_data)
    input_dict[name] = X_scaled
    

台北市: 1219
新北市: 2197
桃園市: 549
台中市: 485
台南市: 214
高雄市: 669
其他: 543


In [3]:
# Load the per-group models.
from ANN_model import Taipei_features12_NN, NewTaipei_features12_NN, Taoyuan_features12_NN, Taichung_features12_NN, Tainan_features12_NN, Kaoshung_features12_NN, Others_features12_NN


def _load_region_model(model_cls, weights_path):
    """Build ``model_cls()``, load its saved state dict and return it in eval mode.

    ``map_location='cpu'`` makes the checkpoint loadable on a machine without a
    GPU even if its tensors were saved from one; the prediction loop below works
    on CPU tensors only.
    """
    model = model_cls()
    model.load_state_dict(torch.load(weights_path, map_location='cpu'))
    model.eval()
    return model


model1 = _load_region_model(Taipei_features12_NN, 'Models/taipei_features12_model1best.pt')
model2 = _load_region_model(NewTaipei_features12_NN, 'Models/newtaipei_features12_model1best.pt')
model3 = _load_region_model(Taoyuan_features12_NN, 'Models/taoyuan_features12_model3best.pt')
model4 = _load_region_model(Taichung_features12_NN, 'Models/taichung_features12_model1best.pt')
model5 = _load_region_model(Tainan_features12_NN, 'Models/tainan_features12_model5best.pt')
model6 = _load_region_model(Kaoshung_features12_NN, 'Models/kaoshung_features12_model2best.pt')
model7 = _load_region_model(Others_features12_NN, 'Models/others_features12_model3best.pt')

# Feed every network the scaled input of its own group and collect one
# (ID, predicted_price) frame per group.
pred_df_lst = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for i, model in enumerate([model1, model2, model3, model4, model5, model6, model7]):
        # Named `features` rather than `input` so the builtin is not shadowed
        # for the rest of the notebook session.
        features = torch.from_numpy(input_dict[names[i]]).float()
        y_pred = scaler_y.inverse_transform(model(features).numpy())  # undo log1p
        y_pred_df = pd.DataFrame({'predicted_price': y_pred.reshape(-1)}).astype(float).reset_index(drop=True)
        id_df = df_lst[i][['ID']].astype(str).reset_index(drop=True)
        print(len(y_pred_df))
        print(len(id_df))
        # A length mismatch would otherwise be silently NaN-padded by the concat.
        assert len(y_pred_df) == len(id_df), f'{names[i]}: prediction/ID row count mismatch'
        y_idpred_df = pd.concat([id_df, y_pred_df], axis=1)
        pred_df_lst.append(y_idpred_df)
merged_pred_df = pd.concat(pred_df_lst, axis=0)  # stack the per-group prediction frames

def custom_sort(id_str):
    """Return the integer sort key of an ID such as ``'PU-12'``.

    The ID is split on ``'-'`` and the second field is converted with ``int``.
    """
    fields = id_str.split('-')
    numeric_field = fields[1]
    return int(numeric_field)

# Order the merged predictions by the numeric part of their ID:
#   1. replace each ID by its integer key (custom_sort),
#   2. sort the rows on that key,
#   3. rebuild the 'PU-<n>' ID strings,
#   4. renumber the index from 0.
merged_pred_df = (
    merged_pred_df
    .assign(ID=merged_pred_df['ID'].map(custom_sort))
    .sort_values(by='ID')
    .assign(ID=lambda frame: 'PU-' + frame['ID'].astype(str))
    .reset_index(drop=True)
)
merged_pred_df

1219
1219
2197
2197
549
549
485
485
214
214
669
669
543
543


Unnamed: 0,ID,predicted_price
0,PU-1,1.244084
1,PU-2,1.732396
2,PU-3,2.300274
3,PU-4,1.471471
4,PU-5,3.383839
...,...,...
5871,PU-5872,1.601394
5872,PU-5873,1.548872
5873,PU-5874,2.034641
5874,PU-5875,2.854741


In [4]:
# Write the sorted predictions to the submission CSV.
from Utils import save_with_unique_name

SAVING_FILE = 'Submissions/public_submission_5.csv'

# The path actually written is whatever the project helper returns for
# SAVING_FILE (presumably a non-clashing file name — confirm in Utils).
file_name = save_with_unique_name(SAVING_FILE)
merged_pred_df.to_csv(path_or_buf=file_name, index=False)