In [None]:
%%time 
from datetime import datetime 
# Record the wall-clock start time so that the total processing time can be measured
start_real = datetime.now()

import pandas as pd
# Read the training data and the test data (tab-separated files)
train_df = pd.read_table('../input/mercari/train.tsv')
test_df = pd.read_table('../input/mercari/test.tsv')
# Show the (rows, columns) shape of both frames
print(train_df.shape, test_df.shape)


In [None]:
# Remove training rows whose price is below 3.0, then display the remaining shape
low_price_rows = train_df.index[train_df['price'] < 3.0]
train_df = train_df.drop(index=low_price_rows)
train_df.shape

In [None]:
%%time
 # 確認商品名稱與商品敘述的單詞數量

def wordCount(text):
    """
    Count the space-separated words of an item name or item description.

    Parameters:
        text(str): item name or item description

    Returns:
        int: number of pieces obtained by splitting `text` on a single space.
        0 is returned when `text` is the placeholder 'No description yet'
        or when it is not a string at all (e.g. the NaN of a missing cell).
    """
    # An explicit type check replaces the former bare `except:`, which also
    # swallowed unrelated errors (including KeyboardInterrupt).
    if not isinstance(text, str) or text == 'No description yet':
        return 0
    # Split on exactly one space (not on arbitrary whitespace) so that the
    # resulting counts stay identical to the previous implementation.
    return len(text.split(" "))

# Store the word count of 'name' in 'name_len' and the word count of
# 'item_description' in 'desc_len', for the training and the test frame alike
for frame in (train_df, test_df):
    frame['name_len'] = frame['name'].apply(wordCount)
    frame['desc_len'] = frame['item_description'].apply(wordCount)


In [None]:
%%time
import numpy as np

# Log-transform the training price: target = log(1 + price)
train_df["target"] = np.log1p(train_df["price"])


In [None]:
%%time

def split_cat(text):
    """
    Split a category path such as "A/B/C" into exactly three levels.

    Parameters:
        text(str): category name whose levels are separated by "/"

    Returns:
        tuple: (level_0, level_1, level_2)
          · non-string input (e.g. the NaN of a missing category) gives
            three "No Label" entries
          · a path with fewer than three levels is padded with "No Label"
          · levels beyond the third are dropped
    """
    # Explicit type check instead of a bare `except:` that hid every error.
    if not isinstance(text, str):
        return ("No Label", "No Label", "No Label")
    # Always hand back three entries: the result is unpacked into three
    # columns, so a shorter path must not shrink the number of values.
    levels = text.split("/")[:3]
    levels += ["No Label"] * (3 - len(levels))
    return tuple(levels)

# Spread the category levels over 'subcat_0', 'subcat_1' and 'subcat_2',
# first for the training data and then for the test data
for frame in (train_df, test_df):
    levels = zip(*frame['category_name'].apply(split_cat))
    frame['subcat_0'], frame['subcat_1'], frame['subcat_2'] = levels


In [None]:
%%time
# Combine train_df and test_df
full_set = pd.concat([train_df, test_df])
# Build the list of all brand names that occur anywhere in the data.
# Missing brands (NaN) are left out: they are not brand names.
all_brands = set(full_set['brand_name'].dropna().unique())

# Replace the missing values (NaN) of 'brand_name' with 'missing'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selected from the frame is chained assignment, which leaves the
# frame untouched when pandas Copy-on-Write is active.
train_df['brand_name'] = train_df['brand_name'].fillna('missing')
test_df['brand_name'] = test_df['brand_name'].fillna('missing')

# Number of rows without a brand in the training data
train_premissing = len(train_df.loc[train_df['brand_name'] == 'missing'])
# Number of rows without a brand in the test data
test_premissing = len(test_df.loc[test_df['brand_name'] == 'missing'])

def brandfinder(line):
    """
    Choose the brand value of one row from its (brand name, item name) pair.

    Parameters:
        line: row with the brand name first and the item name second
              (a pandas Series when called through DataFrame.apply(axis=1))

    Returns:
        · the item name, when the brand is 'missing' and at least one
          space-separated word of the item name is in the brand list
          `all_brands`
        · the item name, when the complete item name is in the brand list
        · the unchanged brand name in every other case
    """
    # Take the two fields by iteration order. Integer subscripts such as
    # line[0] on a Series with string labels are a deprecated positional
    # fallback in pandas.
    fields = list(line)
    brand = fields[0]  # 1st field: brand name
    name = fields[1]   # 2nd field: item name

    if brand == 'missing':
        # NOTE(review): the whole item name is returned here, not the word
        # that matched a brand. This agrees with the documented behaviour,
        # but confirm that it is really intended.
        if any(word in all_brands for word in name.split(' ')):
            return name
    if name in all_brands:
        # The complete item name is itself a known brand
        return name

    return brand  # nothing matched: keep the current brand name

# Rewrite the brand name of every row with brandfinder()
for frame in (train_df, test_df):
    frame['brand_name'] = frame[['brand_name', 'name']].apply(brandfinder, axis=1)

# How many 'missing' brands were replaced = missing before - missing after
train_found = train_premissing - len(train_df[train_df['brand_name'] == 'missing'])
test_found = test_premissing - len(test_df[test_df['brand_name'] == 'missing'])
print(train_premissing)  # training data: missing brands before the rewrite
print(train_found)       # training data: missing brands that were replaced
print(test_premissing)   # test data: missing brands before the rewrite
print(test_found)        # test data: missing brands that were replaced


In [None]:
%%time
# 將訓練用的資料框架以99:1的比例分割為訓練資料跟驗證資料
from sklearn.model_selection import train_test_split 
import gc  

# 99% of the rows are used for training, 1% for validation; fixed seed
train_dfs, dev_dfs = train_test_split(train_df,
                                      train_size=0.99,
                                      test_size=0.01,
                                      random_state=123)

# Row counts of the training, validation and test data
n_trains, n_devs, n_tests = len(train_dfs), len(dev_dfs), len(test_df)
for label, count in (('Training :', n_trains),
                     ('Validating :', n_devs),
                     ('Testing :', n_tests)):
    print(label, count, 'examples')

# The unsplit training frame is no longer needed
del train_df
gc.collect()


In [None]:
%%time
# Concatenate the training, validation and test data (in this order)
full_df = pd.concat([train_dfs, dev_dfs, test_df])

def fill_missing_values(df):
    """
    Replace missing text fields with the placeholder 'missing'.

    Parameters:
        df: DataFrame with the columns 'category_name', 'brand_name' and
            'item_description'; it is modified in place

    Returns:
        The same DataFrame object, with missing values of the three columns
        set to 'missing' and the description 'No description yet' turned
        into 'missing' as well.
    """
    # Assign each result back to its column. An in-place fillna()/replace()
    # on a column selected from the frame is chained assignment and does not
    # reach the frame when pandas Copy-on-Write is active.
    for column in ('category_name', 'brand_name', 'item_description'):
        df[column] = df[column].fillna('missing')
    # Treat the 'No description yet' placeholder like a missing description
    df['item_description'] = df['item_description'].replace('No description yet', 'missing')
    return df

# Fill the missing text fields of the combined frame
full_df = fill_missing_values(full_df)


In [None]:
%%time
from sklearn.preprocessing import LabelEncoder

print("Processing categorical data...")

# One LabelEncoder instance is re-fitted for each column in turn
le = LabelEncoder()
# 'category_name' is encoded into the new column 'category'
full_df['category'] = le.fit_transform(full_df['category_name'])
# The remaining columns are replaced by their integer codes
for column in ('brand_name', 'subcat_0', 'subcat_1', 'subcat_2'):
    full_df[column] = le.fit_transform(full_df[column])
del le
gc.collect()


In [None]:
%%time
# 對完成連接的商品敘述、商品名稱進行標籤編碼
from tensorflow.keras.preprocessing.text import Tokenizer

# Item descriptions, item names and category names are joined into one array:
# [description 1, description 2, ..., name 1, name 2, ..., category 1, category 2, ...]

print("Transforming text data to sequences...")
desc_lower = full_df['item_description'].str.lower()  # item descriptions
name_lower = full_df['name'].str.lower()              # item names
raw_text = np.hstack([desc_lower,
                      name_lower,
                      full_df['category_name'].str.lower()])  # categories
print('sequences shape', raw_text.shape)

print(" Fitting tokenizer...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

print(" Transforming text to sequences...")
full_df['seq_item_description'] = tok_raw.texts_to_sequences(desc_lower)
full_df['seq_name'] = tok_raw.texts_to_sequences(name_lower)

del tok_raw, desc_lower, name_lower
gc.collect()


In [None]:
# Sequence lengths used for padding / truncation.
# The "longest" figures are the original author's notes, not checked here.
MAX_NAME_SEQ = 10       # item name (longest noted: 17, cut to 10)
MAX_ITEM_DESC_SEQ = 75  # item description (longest noted: 269, cut to 75)
MAX_CATEGORY_SEQ = 8    # item category (longest noted: 8)


def _max_token_id(sequences):
    """Return the largest token id found in any sequence (0 if all are empty)."""
    return max((max(seq) for seq in sequences if len(seq) > 0), default=0)


# Vocabulary size for the text embeddings: largest token id + 100.
# Series.max() on a column of lists compares the lists lexicographically, so
# it does not give the largest id; every sequence has to be inspected.
MAX_TEXT = max(_max_token_id(full_df.seq_name),
               _max_token_id(full_df.seq_item_description)) + 100
# Number of category codes: largest code + 1
MAX_CATEGORY = full_df.category.max() + 1
# Number of brand codes: largest code + 1
MAX_BRAND = full_df.brand_name.max() + 1
# Number of item-condition values: largest value + 1
MAX_CONDITION = full_df.item_condition_id.max() + 1
# Largest per-row word count of the description + 1
MAX_DESC_LEN = full_df.desc_len.max() + 1
# Largest per-row word count of the name + 1
MAX_NAME_LEN = full_df.name_len.max() + 1
# Number of sub-category codes: largest code + 1
MAX_SUBCAT_0 = full_df.subcat_0.max() + 1
MAX_SUBCAT_1 = full_df.subcat_1.max() + 1
MAX_SUBCAT_2 = full_df.subcat_2.max() + 1


In [None]:
%%time
from tensorflow.keras.preprocessing.sequence import pad_sequences
def get_rnn_data(dataset):
    """
    Pack the model inputs of `dataset` into a dict and return it.

    Parameter:
        dataset: DataFrame slice holding the encoded columns

    Returns:
        dict with one array per key: 'name', 'item_desc', 'brand_name',
        'category', 'item_condition', 'num_vars', 'desc_len', 'name_len',
        'subcat_0', 'subcat_1', 'subcat_2'.
    """
    features = {}

    # Token sequences, zero-padded / truncated to a fixed length
    features['name'] = pad_sequences(dataset.seq_name,
                                     maxlen=MAX_NAME_SEQ)
    features['item_desc'] = pad_sequences(dataset.seq_item_description,
                                          maxlen=MAX_ITEM_DESC_SEQ)

    # Integer codes taken as one-dimensional arrays
    for key, column in (('brand_name', 'brand_name'),
                        ('category', 'category'),
                        ('item_condition', 'item_condition_id')):
        features[key] = np.array(dataset[column])

    # Single-column frames, i.e. two-dimensional arrays with one column:
    # shipping flag, description word count, name word count
    for key, column in (('num_vars', 'shipping'),
                        ('desc_len', 'desc_len'),
                        ('name_len', 'name_len')):
        features[key] = np.array(dataset[[column]])

    # Sub-category codes taken as one-dimensional arrays
    for key in ('subcat_0', 'subcat_1', 'subcat_2'):
        features[key] = np.array(dataset[key])

    return features

# The combined frame is laid out as [training | validation | test] rows
dev_end = n_trains + n_devs
train = full_df.iloc[:n_trains]        # training rows
dev = full_df.iloc[n_trains:dev_end]   # validation rows
test = full_df.iloc[dev_end:]          # test rows

# Input dict and target for training; the 1-D target array (n,) is reshaped
# into a 2-D column (n, 1)
X_train = get_rnn_data(train)
Y_train = train['target'].values.reshape(-1, 1)

# Input dict and target for validation, reshaped in the same way
X_dev = get_rnn_data(dev)
Y_dev = dev['target'].values.reshape(-1, 1)

# Input dict for the test data
X_test = get_rnn_data(test)

del full_df
gc.collect()

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dropout, Dense, Embedding, Flatten
from tensorflow.keras.layers import concatenate, GRU
from tensorflow.keras.optimizers import Adam

# Set the seed of NumPy's global random number generator.
# NOTE(review): only NumPy is seeded here; whether this also makes the Keras
# weight initialisation and shuffling reproducible is not shown — confirm.
np.random.seed(123)

# Root mean square error (RMSE), used to check the quality of predictions.
# When Y_pred (predicted price) and Y (actual price) are already
# log-transformed, the value is in effect the root mean squared logarithmic
# error (RMSLE).
def rmsle(Y, Y_pred):
    """
    Return the root mean square difference between Y and Y_pred.

    Both arrays must have the same shape (checked with an assertion).
    """
    assert Y.shape == Y_pred.shape
    residual = Y_pred - Y
    return np.sqrt((residual ** 2).mean())

def new_rnn_model(lr=0.001, decay=0.0):
    """
    Build and compile the recurrent neural network model.

    Parameters:
        lr: initial learning rate of the Adam optimizer
        decay: learning-rate decay; at optimizer step t the learning rate is
               lr / (1 + decay * t). 0.0 keeps the learning rate constant.

    Returns:
        The compiled Keras Model (loss: mean squared error).

    Note:
        The widths of the 'name', 'item_desc' and 'num_vars' inputs are read
        from the module-level dict X_train, so X_train must hold the RNN
        feature dict when this function is called.
    """
    # Local import so that this cell does not depend on another import cell.
    from tensorflow.keras.optimizers.schedules import InverseTimeDecay

    # ---- Input layers ----
    # Item name, item description, brand name, item condition, shipping flag
    name           = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc      = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand_name     = Input(shape=[1], name="brand_name")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars       = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    # Word counts of the item name and the item description
    name_len       = Input(shape=[1], name="name_len")
    desc_len       = Input(shape=[1], name="desc_len")
    # Item sub-categories
    subcat_0       = Input(shape=[1], name="subcat_0")
    subcat_1       = Input(shape=[1], name="subcat_1")
    subcat_2       = Input(shape=[1], name="subcat_2")

    # ---- Embedding layers ----
    # Item name: vocabulary MAX_TEXT, 20 output dimensions
    emb_name = Embedding(MAX_TEXT, 20)(name)
    # Item description: vocabulary MAX_TEXT, 60 output dimensions
    emb_item_desc = Embedding(MAX_TEXT, 60)(item_desc)
    # Brand name: MAX_BRAND codes, 10 output dimensions
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
    # Item condition: MAX_CONDITION values, 5 output dimensions
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    # Description word count: MAX_DESC_LEN values, 5 output dimensions
    emb_desc_len = Embedding(MAX_DESC_LEN, 5)(desc_len)
    # Name word count: MAX_NAME_LEN values, 5 output dimensions
    emb_name_len = Embedding(MAX_NAME_LEN, 5)(name_len)
    # Sub-categories: MAX_SUBCAT_n codes each, 10 output dimensions
    emb_subcat_0 = Embedding(MAX_SUBCAT_0, 10)(subcat_0)
    emb_subcat_1 = Embedding(MAX_SUBCAT_1, 10)(subcat_1)
    emb_subcat_2 = Embedding(MAX_SUBCAT_2, 10)(subcat_2)

    # ---- Gated recurrent units ----
    rnn_layer1 = GRU(16)(emb_item_desc)  # item description
    rnn_layer2 = GRU(8)(emb_name)        # item name

    # ---- Flatten the embeddings and join every branch ----
    main_l = concatenate([Flatten()(emb_brand_name),      # brand name embedding
                          Flatten()(emb_item_condition),  # item condition embedding
                          Flatten()(emb_desc_len),        # description word-count embedding
                          Flatten()(emb_name_len),        # name word-count embedding
                          Flatten()(emb_subcat_0),        # sub-category 0 embedding
                          Flatten()(emb_subcat_1),        # sub-category 1 embedding
                          Flatten()(emb_subcat_2),        # sub-category 2 embedding
                          rnn_layer1,                     # item description GRU
                          rnn_layer2,                     # item name GRU
                          num_vars])                      # shipping flag (0 or 1)

    # ---- Fully connected layers, each followed by 10% dropout ----
    for units in (512, 256, 128, 64):
        main_l = Dense(units,
                       kernel_initializer='normal',
                       activation='relu')(main_l)
        main_l = Dropout(0.1)(main_l)

    # ---- Output layer: one linear unit ----
    output = Dense(1,
                   activation="linear")(main_l)

    # ---- Assemble the model ----
    # Several parallel input layers are passed as a list
    model = Model(inputs=[name,
                          item_desc,
                          brand_name,
                          item_condition,
                          num_vars,
                          desc_len,
                          name_len,
                          subcat_0,
                          subcat_1,
                          subcat_2],
                  outputs=output)

    # ---- Loss and optimizer ----
    # `lr=` is a deprecated alias of `learning_rate=`, and newer Keras
    # optimizers no longer accept `decay=`. The decay is therefore expressed
    # as a schedule with the same formula: lr / (1 + decay * step).
    if decay:
        learning_rate = InverseTimeDecay(lr, decay_steps=1, decay_rate=decay)
    else:
        learning_rate = lr
    model.compile(loss='mse',
                  optimizer=Adam(learning_rate=learning_rate))

    return model

# Build a model with the default settings and print its layer summary
model = new_rnn_model()
model.summary()

# This instance was only needed for the summary; release it again
del model
gc.collect()


In [None]:
%%time
# Mini-batch size and number of epochs
BATCH_SIZE = 512 * 2
epochs = 3


# Learning-rate decay
def exp_decay(init, fin, steps):
    """Return (init / fin) ** (1 / (steps - 1)) - 1."""
    return (init/fin) ** (1/(steps-1)) - 1


lr_init = 0.005
lr_fin = 0.001
# Number of whole batches per epoch times the number of epochs
steps = int(len(X_train['name']) / BATCH_SIZE) * epochs
lr_decay = exp_decay(lr_init, lr_fin, steps)

# Build the model
rnn_model = new_rnn_model(lr=lr_init, decay=lr_decay)
# Train the model, reporting the validation loss after every epoch
rnn_model.fit(X_train,
              Y_train,
              validation_data=(X_dev, Y_dev),
              batch_size=BATCH_SIZE,
              epochs=epochs,
              verbose=1)


In [None]:
%%time
# Evaluate the trained model on the validation data
print("Evaluating the model on validation data...")
# Predictions of the trained model for the validation inputs
Y_dev_preds_rnn = rnn_model.predict(X_dev, batch_size=BATCH_SIZE)
# Root mean square error between the validation targets and the predictions
dev_error_rnn = rmsle(Y_dev, Y_dev_preds_rnn)
print("RMSLE error:", dev_error_rnn)

In [None]:
# Predict the test data and undo the log transform with expm1
rnn_preds = np.expm1(rnn_model.predict(X_test,
                                       batch_size=BATCH_SIZE,
                                       verbose=1))
# The network is not used after this point
del rnn_model
gc.collect()

In [None]:
# Concatenate the training, validation and test frames again (in this order)
full_df2 = pd.concat([train_dfs, dev_dfs, test_df])

In [None]:
%%time

print("Handling missing values...")

# Columns that may contain missing values: fill them with a placeholder and
# convert to str (category name, brand name, item description)
for column, placeholder in (('category_name', 'missing'),
                            ('brand_name', 'missing'),
                            ('item_description', 'No description yet')):
    full_df2[column] = full_df2[column].fillna(placeholder).astype(str)

# Sub-category labels, shipping flag, item condition and the two word counts
# are converted to str as they are
for column in ('subcat_0', 'subcat_1', 'subcat_2',
               'shipping', 'item_condition_id',
               'desc_len', 'name_len'):
    full_df2[column] = full_df2[column].astype(str)



In [None]:
%%time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

print("Vectorizing data...")
# Preprocessing callable of a default CountVectorizer; build_preprocessor()
# wraps it so that it is applied to one field of a row
default_preprocessor = CountVectorizer().build_preprocessor()

def build_preprocessor(field):
    """
    Build a preprocessor that works on one column of a row.

    Looks up the position of `field` among the columns of full_df2 and
    returns a callable that takes a row, picks the value at that position
    and passes it to default_preprocessor.

    Parameter:
        field: column name of the combined frame full_df2
    """
    position = full_df2.columns.tolist().index(field)

    def preprocess_row(row):
        return default_preprocessor(row[position])

    return preprocess_row

# Convert every column into sparse features; each entry of the union handles
# one column, selected by its own preprocessor.
# The digit patterns are raw strings: '\d' inside a normal string literal is
# an invalid escape sequence and triggers a warning.
vectorizer = FeatureUnion([
    ('name', CountVectorizer(ngram_range=(1, 2),
        max_features=5000,  # keep at most 5000 features
        preprocessor=build_preprocessor('name'))),
    ('subcat_0', CountVectorizer(token_pattern='.+',
        preprocessor=build_preprocessor('subcat_0'))),
    ('subcat_1', CountVectorizer(token_pattern='.+',
        preprocessor=build_preprocessor('subcat_1'))),
    ('subcat_2', CountVectorizer(token_pattern='.+',
        preprocessor=build_preprocessor('subcat_2'))),
    ('brand_name', CountVectorizer(token_pattern='.+',
        preprocessor=build_preprocessor('brand_name'))),
    ('shipping', CountVectorizer(token_pattern=r'\d+',
        preprocessor=build_preprocessor('shipping'))),
    ('item_condition_id', CountVectorizer(token_pattern=r'\d+',
        preprocessor=build_preprocessor('item_condition_id'))),
    ('desc_len', CountVectorizer(token_pattern=r'\d+',
        preprocessor=build_preprocessor('desc_len'))),
    ('name_len', CountVectorizer(token_pattern=r'\d+',
        preprocessor=build_preprocessor('name_len'))),
    ('item_description', TfidfVectorizer(ngram_range=(1, 3),
        max_features=5000,  # keep at most 5000 features
        preprocessor=build_preprocessor('item_description'))),])

# Each row is passed to the vectorizers as an array of its column values
X = vectorizer.fit_transform(full_df2.values)

del vectorizer
gc.collect()

# Rows of X follow the layout [training | validation | test].
# NOTE: X_train, Y_train, X_dev, Y_dev and X_test are re-bound here and no
# longer hold the inputs of the recurrent model.
dev_end = n_trains + n_devs
X_train, X_dev, X_test = X[:n_trains], X[n_trains:dev_end], X[dev_end:]

# Targets of the training and validation data as two-dimensional columns
Y_train = train_dfs['target'].values.reshape(-1, 1)
Y_dev = dev_dfs['target'].values.reshape(-1, 1)

# Report the shape of every piece
for label, part in (('X:', X),
                    ('X_train:', X_train),
                    ('X_dev:', X_dev),
                    ('X_test:', X_test),
                    ('Y_train:', Y_train),
                    ('Y_dev:', Y_dev)):
    print(label, part.shape)


In [None]:
%%time

from sklearn.linear_model import Ridge, RidgeCV

print("Fitting Ridge model on training examples...")
# `normalize=False` is no longer passed: it was the default behaviour, and
# the parameter has been removed from newer scikit-learn releases, where
# passing it raises a TypeError.
ridge_model = Ridge(solver='auto',      # let scikit-learn choose the solver
                    fit_intercept=True, # fit an intercept (bias) term
                    alpha=1.0,          # regularization strength (default)
                    max_iter=200,       # iteration limit for iterative solvers
                    tol=0.01,           # tolerance of the solution
                    random_state=1)     # seed for solvers that shuffle the data

ridge_modelCV = RidgeCV(fit_intercept=True,
                        alphas=[5.0],                     # single candidate alpha
                        cv=2,                             # 2-fold cross-validation
                        scoring='neg_mean_squared_error') # scored by (negated) mean squared error

ridge_model.fit(X_train, Y_train)
ridge_modelCV.fit(X_train, Y_train)


In [None]:
# Validation predictions of the Ridge model as a column, and their error
Y_dev_preds_ridge = ridge_model.predict(X_dev).reshape(-1, 1)
ridge_dev_error = rmsle(Y_dev, Y_dev_preds_ridge)
print('Ridge model RMSE error:', ridge_dev_error)

In [None]:
# Validation predictions of the RidgeCV model as a column, and their error
Y_dev_preds_ridgeCV = ridge_modelCV.predict(X_dev).reshape(-1, 1)
ridgeCV_dev_error = rmsle(Y_dev, Y_dev_preds_ridgeCV)
print('RidgeCV model RMSE error:', ridgeCV_dev_error)

In [None]:
%%time
# Test predictions of both linear models, with the log transform undone
# Ridge model
ridge_preds = np.expm1(ridge_model.predict(X_test))
# RidgeCV model
ridgeCV_preds = np.expm1(ridge_modelCV.predict(X_test))

In [None]:
%%time
def aggregate_predicts3(Y1, Y2, Y3, ratio1, ratio2):
    """
    Combine the predictions of three models into one weighted prediction.

    Parameters:
        Y1: predictions of the recurrent neural network model
        Y2: predictions of the Ridge model
        Y3: predictions of the RidgeCV model
        ratio1: weight of Y1
        ratio2: weight of Y2

        The weight of Y3 is 1.0 - ratio1 - ratio2.

    Returns:
        Y1*ratio1 + Y2*ratio2 + Y3*(1.0 - ratio1 - ratio2)

    Raises:
        AssertionError: if the three arrays do not all have the same shape
    """
    # Y3 is checked as well: a (n,) array combined with (n, 1) arrays would
    # otherwise broadcast silently to an (n, n) result.
    assert Y1.shape == Y2.shape == Y3.shape
    ratio3 = 1.0 - ratio1 - ratio2
    return Y1*ratio1 + Y2*ratio2 + Y3*ratio3


In [None]:
%%time
# Grid search over the blend weights in steps of 0.01
best1 = 0
best2 = 0
# Start from infinity so that the first candidate is always recorded; a
# fixed starting value would silently keep the weights (0, 0) whenever no
# blend scored below it.
lowest = float('inf')
for i in range(100):
    r = i * 0.01
    for j in range(100):
        r2 = j * 0.01
        if r+r2 < 1.0:
            # Blend the validation predictions of the three models with the
            # candidate weights
            Y_dev_preds = aggregate_predicts3(Y_dev_preds_rnn,
                                              Y_dev_preds_ridge,
                                              Y_dev_preds_ridgeCV,
                                              r,
                                              r2)
            # Error of the blended prediction against the true values
            fpred = rmsle(Y_dev, Y_dev_preds)
            # Remember the weights whenever the error improves
            if fpred < lowest:
                best1 = r
                best2 = r2
                lowest = fpred

# Blended validation prediction with the best weights found
Y_dev_preds = aggregate_predicts3(Y_dev_preds_rnn,
                                  Y_dev_preds_ridge,
                                  Y_dev_preds_ridgeCV,
                                  best1,
                                  best2)

print('r1:', best1)
print('r2:', best2)
print('r3:', 1.0 - best1 - best2)
print("(Best) RMSE error for RNN + Ridge + RidgeCV on dev set:\n", rmsle(Y_dev, Y_dev_preds))


In [None]:
# Blend the test predictions of the three models with the best weights
preds = aggregate_predicts3(rnn_preds, ridge_preds, ridgeCV_preds,
                            best1, best2)

# Pair every test id with its flattened predicted price
submission = pd.DataFrame({"test_id": test_df["test_id"],
                           "price": preds.ravel()})

# Write the CSV file
submission.to_csv("./rnn_ridge_submission_best.csv", index=False)


In [None]:
# Total elapsed time: now minus the start time stored in start_real
stop_real = datetime.now()
execution_time_real = stop_real-start_real
print(execution_time_real)