# Pre-process

## Package & Datasets loading

In [None]:
import pandas as pd
import numpy as np
import re
import stanza
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
# Download the NLTK resources this notebook relies on: tokenizer models
# ('punkt'), the stop-word lists, the VADER lexicon and the WordNet data.
nltk.download('punkt')  
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# stanza.download('en')
# nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment', tokenize_no_ssplit=True)

In [None]:
# Load the combined review dataset from CSV and display it.
# NOTE(review): the file name suggests the VADER and Stanza scores are already
# stored as columns — confirm against the CSV.
df = pd.read_csv('../../Final_Datasets/TA_combined_df_City_tourism_type_VADER_final_Stanza.csv')
df

In [None]:
df['Hotel_locID'].nunique()

## Data Overview & Labeling

In [None]:
# VADER
_VADER_ANALYZER = None  # created lazily on first use, then shared by all calls


def calculate_compound_score(review):
    """Return the VADER compound sentiment score of ``review``.

    The analyzer is built once and reused; the original constructed a new
    ``SentimentIntensityAnalyzer`` on every call, which is wasteful when the
    function is applied row by row to a DataFrame.

    Args:
        review: Text to score.

    Returns:
        The ``'compound'`` entry of ``polarity_scores(review)``.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER.polarity_scores(review)['compound']

# Unreliable tag
def calculate_unreliable(row):
    """Flag a review whose text sentiment contradicts its rating.

    Args:
        row: Mapping or Series exposing ``'Compound_Score'`` and
            ``'Review_Rating'``.

    Returns:
        1 when the compound score is below -0.49 while the rating is 3 or
        higher, otherwise 0.
    """
    score, stars = row['Compound_Score'], row['Review_Rating']
    is_mismatch = score < -0.49 and stars >= 3
    return 1 if is_mismatch else 0

# Stanza
# def analyze_sentiment(text):
#   doc = nlp(text)
#   sentiments = [sentence.sentiment for sentence in doc.sentences]
  
#   compound_score = sentiments[0]
#   return compound_score

# Stanza x iterations
def analyze_sentiment(text, num_iterations=11):
    """Return the majority-vote Stanza sentiment label for ``text``.

    The global ``nlp`` pipeline is run ``num_iterations`` times and the most
    frequent label of the first sentence is returned.

    Args:
        text: Review text to classify.
        num_iterations: Number of pipeline runs to vote over. Defaults to 11,
            the value that used to be hard-coded in the body.

    Returns:
        The most common first-sentence sentiment label, or ``None`` when the
        pipeline yields no sentence for ``text`` (the original raised
        ``IndexError`` in that case).

    Note:
        ``nlp`` must exist before this function is called; in the visible part
        of the notebook its construction is commented out.
    """
    votes = Counter()
    for _ in range(num_iterations):
        sentences = nlp(text).sentences
        if not sentences:
            # Nothing to classify (e.g. empty text).
            return None
        votes[sentences[0].sentiment] += 1

    if not votes:
        return None
    return votes.most_common(1)[0][0]

# Stanza Unreliable tag
def calculate_unreliable_stanza(row):
    """Flag a review whose Stanza label contradicts its rating.

    Args:
        row: Mapping or Series exposing ``'Stanza_Score'`` and
            ``'Review_Rating'``.

    Returns:
        1 when the label is 2 with a rating below 3, or the label is 0 with a
        rating of 3 or more; otherwise 0.
        NOTE(review): presumably 0 = negative and 2 = positive — confirm
        against the Stanza sentiment processor.
    """
    label, stars = row['Stanza_Score'], row['Review_Rating']
    if label == 2:
        return 1 if stars < 3 else 0
    if label == 0:
        return 1 if stars >= 3 else 0
    return 0

In [None]:
# Label every row with calculate_unreliable (reads 'Compound_Score' and
# 'Review_Rating') and print how many rows fall into each class.
df['Unreliable'] = df.apply(calculate_unreliable, axis=1)
print(df['Unreliable'].value_counts())

In [None]:
# df['Stanza_Score'] = df['Review'].apply(analyze_sentiment)

In [None]:
# Unreliable reviews example: show the text of the first ten reviews that
# were flagged with Unreliable == 1.
unreliable_reviews = df[df['Unreliable'] == 1]['Review']
unreliable_reviews.head(10)

In [None]:
# Reviews rating distribution: bar chart of how many reviews carry each
# rating value, with the ratings ordered along the x axis.
rating_counts = df['Review_Rating'].value_counts()
rating_counts = rating_counts.sort_index()  # order bars by rating, not by frequency
plt.figure(figsize=(8, 5))  # size
plt.bar(rating_counts.index, rating_counts.values)
plt.xlabel('Review_Rating')  
plt.ylabel('Count')  
plt.title('Distribution of Ratings')  
plt.show()

## Function

In [None]:
# Ordered (pattern, replacement) rules, compiled once. The irregular forms come
# first so the generic suffix rules below them cannot pre-empt them.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"don't", "do not"),
        (r"'ll", " will"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'re", " are"),
        (r"'ve", " have"),
        (r"n't", " not"),
        (r"'bout", "about"),
        (r"'til", "until"),
    )
)


def decontracting(text):
    """Expand common English contractions in ``text``.

    Typographic apostrophes (U+2018 / U+2019) are first normalised to ``'``.
    Previously a contraction written with a curly apostrophe (e.g. ``don’t``)
    matched none of the rules and was left unexpanded.

    Args:
        text: Input string; matching is case-sensitive, so callers are
            expected to lower-case it first.

    Returns:
        The string with contractions expanded. ``'s`` is always expanded to
        " is" and ``'d`` to " would", so possessives and "had" are
        approximated.
    """
    text = text.replace("\u2019", "'").replace("\u2018", "'")
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

stopwords_list = stopwords.words('english')
# Set copy used for O(1) membership tests; `stopwords_list` itself is kept
# unchanged for any other code that reads it.
_stopwords_set = frozenset(stopwords_list)


def remove_stopwords(tokens):
    """Return the tokens that are not English stop words.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with every token found in ``stopwords_list`` removed,
        preserving order.

    NOTE(review): the NLTK English list appears to contain negations such as
    "not" and "no", so the "not" produced by ``decontracting`` is dropped
    here — confirm this is intended for sentiment/rating modelling.
    """
    return [token for token in tokens if token not in _stopwords_set]


def lemmatization(tokens):
  """Lemmatize each token with a freshly created ``WordNetLemmatizer``.

  Args:
      tokens: Iterable of string tokens.

  Returns:
      List of lemmatized tokens, in the original order.
  """
  lemmatize = WordNetLemmatizer().lemmatize
  return list(map(lemmatize, tokens))


def word_preprocess(df, column_name):
    """Normalise and tokenise a text column of ``df`` in place.

    Each non-missing value is lower-cased, de-contracted, stripped of every
    character other than ASCII letters and ``!``, tokenised with
    ``nltk.word_tokenize`` and filtered through ``remove_stopwords``.

    Missing values (``None`` / ``NaN``) are left untouched. The original cast
    them with ``str(x)``, which turned them into the literal token ``'nan'``
    and made any later ``dropna`` on this column a no-op.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        column_name: Name of the column to preprocess.

    Returns:
        The same ``df`` object, with ``column_name`` holding token lists.
    """
    # Kept from the original: make sure the tokenizer models are available.
    nltk.download('punkt')

    def _clean(value):
        # Leave missing entries as they are so callers can drop them later.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return value
        text = decontracting(str(value).lower())
        # Remove tags, punctuation and digits; '!' is deliberately kept.
        text = re.sub('[^a-zA-Z!]', ' ', text)
        return remove_stopwords(nltk.word_tokenize(text))

    df[column_name] = df[column_name].apply(_clean)
    return df

## Text Preprocess

In [None]:
# Tokenise the 'Review' column. word_preprocess modifies and returns the frame
# it receives, so `text_preprocessed_df` and `df` refer to the same object.
text_preprocessed_df = word_preprocess(df,'Review')

In [None]:
text_preprocessed_df['Review']

In [None]:
# Drop every row that has a missing value in any of the columns used below.
text_preprocessed_df = text_preprocessed_df.dropna(subset=['Hotel_star', 'Review_Rating', 'Review', 'Reviewer_Contributions'])

In [None]:
text_preprocessed_df['Hotel_locID'].nunique()

In [None]:
text_preprocessed_df['Review_Rating'].mean()

In [None]:
# Absolute and relative frequency of every rating value after cleaning.
rating_counts = text_preprocessed_df['Review_Rating'].value_counts()
rating_proportions = text_preprocessed_df['Review_Rating'].value_counts(normalize=True)

# Print the counts and the proportions
print("Rating counts:\n", rating_counts)
print("Rating proportions:\n", rating_proportions)

## Split datasets to sub-datasets

In [None]:
text_preprocessed_df

In [None]:
def contributions_range(value):
    """Bucket a reviewer's contribution count.

    Returns:
        1 when ``value`` is at most 5, otherwise 2 (including when the
        comparison is false because ``value`` is NaN).
    """
    return 1 if value <= 5 else 2
    
def hotel_star_range(value):
    """Bucket a hotel star rating into low (1) or high (2).

    Ratings up to and including 3.0 are low, everything above is high. For
    half-star steps this matches the original chain of ranges. The original
    left gaps between its ranges, so a value such as 2.25 fell through to the
    final ``else`` and was classed as high; it is now classed as low.

    Args:
        value: Star rating.

    Returns:
        1 for ``value <= 3.0``, otherwise 2 (NaN also yields 2, as before).
    """
    return 1 if value <= 3.0 else 2

In [None]:
# Create the two bucket columns (values 1 or 2) that define the sub-datasets:
# one from the reviewer's contribution count, one from the hotel's stars.
text_preprocessed_df['Reviewer_Contributions_range'] = text_preprocessed_df['Reviewer_Contributions'].apply(contributions_range)
text_preprocessed_df['Hotel_star_range'] = text_preprocessed_df['Hotel_star'].apply(hotel_star_range)
text_preprocessed_df

In [None]:
# Split into 4 sub-datasets by the two bucket columns. In the names, the first
# pair of letters is the contribution bucket and the second the star bucket:
# LC/HC = Reviewer_Contributions_range 1/2, LS/HS = Hotel_star_range 1/2.
LCLS = text_preprocessed_df[(text_preprocessed_df['Reviewer_Contributions_range'] == 1) & (text_preprocessed_df['Hotel_star_range'] == 1)]
LCHS = text_preprocessed_df[(text_preprocessed_df['Reviewer_Contributions_range'] == 1) & (text_preprocessed_df['Hotel_star_range'] == 2)]
HCLS = text_preprocessed_df[(text_preprocessed_df['Reviewer_Contributions_range'] == 2) & (text_preprocessed_df['Hotel_star_range'] == 1)]
HCHS = text_preprocessed_df[(text_preprocessed_df['Reviewer_Contributions_range'] == 2) & (text_preprocessed_df['Hotel_star_range'] == 2)]


In [None]:
# Rows of each sub-dataset that were flagged as unreliable (Unreliable == 1).
LCLS_unreliable = LCLS[LCLS['Unreliable'] == 1]
LCHS_unreliable = LCHS[LCHS['Unreliable'] == 1]
HCLS_unreliable = HCLS[HCLS['Unreliable'] == 1]
HCHS_unreliable = HCHS[HCHS['Unreliable'] == 1]

In [None]:
# Rows of each sub-dataset that were not flagged (Unreliable == 0); only these
# are used for modelling below.
LCLS_reliable = LCLS[LCLS['Unreliable'] == 0]
LCHS_reliable = LCHS[LCHS['Unreliable'] == 0]
HCLS_reliable = HCLS[HCLS['Unreliable'] == 0]
HCHS_reliable = HCHS[HCHS['Unreliable'] == 0]

In [None]:
# Keep only the tokenised text and the target rating of the reliable reviews,
# renumbering the rows of every sub-dataset from 0.
selected_columns = ['Review', 'Review_Rating']
LCLS_text = LCLS_reliable.loc[:, selected_columns].reset_index(drop=True)
LCHS_text = LCHS_reliable.loc[:, selected_columns].reset_index(drop=True)
HCLS_text = HCLS_reliable.loc[:, selected_columns].reset_index(drop=True)
HCHS_text = HCHS_reliable.loc[:, selected_columns].reset_index(drop=True)

In [None]:
# Join each review's token list back into one space-separated string, the
# input format the vectorizers and embedding functions below iterate over.
LCLS_text['Review'] = [' '.join(text) for text in LCLS_text['Review']]
LCHS_text['Review'] = [' '.join(text) for text in LCHS_text['Review']]
HCLS_text['Review'] = [' '.join(text) for text in HCLS_text['Review']]
HCHS_text['Review'] = [' '.join(text) for text in HCHS_text['Review']]

# Split sub-datasets to X & Y, Training and Testing set

In [None]:
# Features (X) are the cleaned review strings, targets (y) the ratings.
LCLS_X = LCLS_text['Review']
LCLS_y = LCLS_text['Review_Rating']

LCHS_X = LCHS_text['Review']
LCHS_y = LCHS_text['Review_Rating']

HCLS_X = HCLS_text['Review']
HCLS_y = HCLS_text['Review_Rating']

HCHS_X = HCHS_text['Review']
HCHS_y = HCHS_text['Review_Rating']

# 80/20 train/test split per sub-dataset, with a fixed seed for repeatability.
LCLS_X_train, LCLS_X_test, LCLS_y_train, LCLS_y_test = train_test_split(LCLS_X, LCLS_y, test_size=0.2, random_state=88)
LCHS_X_train, LCHS_X_test, LCHS_y_train, LCHS_y_test = train_test_split(LCHS_X, LCHS_y, test_size=0.2, random_state=88)
HCLS_X_train, HCLS_X_test, HCLS_y_train, HCLS_y_test = train_test_split(HCLS_X, HCLS_y, test_size=0.2, random_state=88)
HCHS_X_train, HCHS_X_test, HCHS_y_train, HCHS_y_test = train_test_split(HCHS_X, HCHS_y, test_size=0.2, random_state=88)

In [None]:
LCLS_X_train

In [None]:
LCLS_y_train

# Feature extraction

## Initial variable

In [None]:
# Placeholders for every feature matrix (5 representations x train/test x
# 4 sub-datasets). Pre-binding them to None lets later cells reference all
# names even when only some representations have been computed.

# LCLS
LCLS_X_train_bow_df = LCLS_X_train_tfidf_df = LCLS_X_train_d2v = \
    LCLS_X_train_glove = LCLS_X_train_bert = None
LCLS_X_test_bow_df = LCLS_X_test_tfidf_df = LCLS_X_test_d2v = \
    LCLS_X_test_glove = LCLS_X_test_bert = None

# LCHS
LCHS_X_train_bow_df = LCHS_X_train_tfidf_df = LCHS_X_train_d2v = \
    LCHS_X_train_glove = LCHS_X_train_bert = None
LCHS_X_test_bow_df = LCHS_X_test_tfidf_df = LCHS_X_test_d2v = \
    LCHS_X_test_glove = LCHS_X_test_bert = None

# HCLS
HCLS_X_train_bow_df = HCLS_X_train_tfidf_df = HCLS_X_train_d2v = \
    HCLS_X_train_glove = HCLS_X_train_bert = None
HCLS_X_test_bow_df = HCLS_X_test_tfidf_df = HCLS_X_test_d2v = \
    HCLS_X_test_glove = HCLS_X_test_bert = None

# HCHS
HCHS_X_train_bow_df = HCHS_X_train_tfidf_df = HCHS_X_train_d2v = \
    HCHS_X_train_glove = HCHS_X_train_bert = None
HCHS_X_test_bow_df = HCHS_X_test_tfidf_df = HCHS_X_test_d2v = \
    HCHS_X_test_glove = HCHS_X_test_bert = None


## BOW

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def generate_bow_train(X_train, max_features=1000):
    """Fit a bag-of-words vectorizer on the training texts.

    Args:
        X_train: Iterable of document strings.
        max_features: Vocabulary size limit passed to ``CountVectorizer``.

    Returns:
        ``(bow_df, vectorizer)``: a dense DataFrame of term counts with one
        column per vocabulary term, and the fitted vectorizer.
    """
    vectorizer = CountVectorizer(max_features=max_features)
    counts = vectorizer.fit_transform(X_train).toarray()
    vocabulary = vectorizer.get_feature_names_out()
    return pd.DataFrame(counts, columns=vocabulary), vectorizer

def generate_bow_test(X_test, vectorizer):
    """Encode test texts with an already fitted bag-of-words vectorizer.

    Args:
        X_test: Iterable of document strings.
        vectorizer: Fitted ``CountVectorizer`` (it is not re-fitted here).

    Returns:
        Dense DataFrame of term counts using the vectorizer's vocabulary as
        column names.
    """
    counts = vectorizer.transform(X_test).toarray()
    vocabulary = vectorizer.get_feature_names_out()
    return pd.DataFrame(counts, columns=vocabulary)

In [None]:
# Fit one bag-of-words vectorizer per sub-dataset on its training texts; the
# fitted vectorizers are kept so the test texts can be encoded the same way.
LCLS_X_train_bow_df, LCLS_vectorizer = generate_bow_train(LCLS_X_train)
LCHS_X_train_bow_df, LCHS_vectorizer = generate_bow_train(LCHS_X_train)
HCLS_X_train_bow_df, HCLS_vectorizer = generate_bow_train(HCLS_X_train)
HCHS_X_train_bow_df, HCHS_vectorizer = generate_bow_train(HCHS_X_train)


### Testing set

In [None]:
# Encode each test set with the vectorizer fitted on the matching training set.
LCLS_X_test_bow_df = generate_bow_test(LCLS_X_test, LCLS_vectorizer)
LCHS_X_test_bow_df = generate_bow_test(LCHS_X_test, LCHS_vectorizer)
HCLS_X_test_bow_df = generate_bow_test(HCLS_X_test, HCLS_vectorizer)
HCHS_X_test_bow_df = generate_bow_test(HCHS_X_test, HCHS_vectorizer)

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def generate_tfidf_train(X_train, stop_words='english', max_features=1000, max_df=0.9):
    """Fit a TF-IDF vectorizer on the training texts.

    Args:
        X_train: Iterable of document strings.
        stop_words: Forwarded to ``TfidfVectorizer``.
        max_features: Vocabulary size limit.
        max_df: Forwarded to ``TfidfVectorizer``.

    Returns:
        ``(tfidf_df, tfidf_vectorizer)``: a dense DataFrame of TF-IDF weights
        with one column per vocabulary term, and the fitted vectorizer.
    """
    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features, max_df=max_df)
    weights = tfidf_vectorizer.fit_transform(X_train).toarray()
    vocabulary = tfidf_vectorizer.get_feature_names_out()
    return pd.DataFrame(weights, columns=vocabulary), tfidf_vectorizer

def generate_tfidf_test(X_test, tfidf_vectorizer):
    """Encode test texts with an already fitted TF-IDF vectorizer.

    Args:
        X_test: Iterable of document strings.
        tfidf_vectorizer: Fitted ``TfidfVectorizer`` (it is not re-fitted here).

    Returns:
        Dense DataFrame of TF-IDF weights using the vectorizer's vocabulary as
        column names.
    """
    weights = tfidf_vectorizer.transform(X_test).toarray()
    vocabulary = tfidf_vectorizer.get_feature_names_out()
    return pd.DataFrame(weights, columns=vocabulary)

In [None]:
# Fit one TF-IDF vectorizer per sub-dataset on its training texts.
LCLS_X_train_tfidf_df, tfidf_vectorizer_LCLS = generate_tfidf_train(LCLS_X_train)
LCHS_X_train_tfidf_df, tfidf_vectorizer_LCHS = generate_tfidf_train(LCHS_X_train)
HCLS_X_train_tfidf_df, tfidf_vectorizer_HCLS = generate_tfidf_train(HCLS_X_train)
HCHS_X_train_tfidf_df, tfidf_vectorizer_HCHS = generate_tfidf_train(HCHS_X_train)

### Testing set

In [None]:
# Encode each test set with the TF-IDF vectorizer fitted on its training set.
LCLS_X_test_tfidf_df = generate_tfidf_test(LCLS_X_test, tfidf_vectorizer_LCLS)
LCHS_X_test_tfidf_df = generate_tfidf_test(LCHS_X_test, tfidf_vectorizer_LCHS)
HCLS_X_test_tfidf_df = generate_tfidf_test(HCLS_X_test, tfidf_vectorizer_HCLS)
HCHS_X_test_tfidf_df = generate_tfidf_test(HCHS_X_test, tfidf_vectorizer_HCHS)

## Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def train_doc2vec_model(X_train, min_count=5, workers=8, epochs=40, vector_size=100):
    """Train a Doc2Vec model on space-separated documents.

    Args:
        X_train: Iterable of document strings; each is split on single spaces.
        min_count, workers, epochs, vector_size: Forwarded to ``Doc2Vec``.

    Returns:
        The trained ``Doc2Vec`` model. Every document is tagged with its
        position in ``X_train``.
    """
    corpus = []
    for tag, doc in enumerate(X_train):
        corpus.append(TaggedDocument(doc.split(' '), [tag]))

    d2v = Doc2Vec(min_count=min_count, workers=workers, epochs=epochs, vector_size=vector_size)
    d2v.build_vocab(corpus)
    d2v.train(corpus, total_examples=d2v.corpus_count, epochs=d2v.epochs)
    return d2v

In [None]:
# Train a separate Doc2Vec model for each sub-dataset, on its training texts.
LCLS_model = train_doc2vec_model(LCLS_X_train)
LCHS_model = train_doc2vec_model(LCHS_X_train)
HCLS_model = train_doc2vec_model(HCLS_X_train)
HCHS_model = train_doc2vec_model(HCHS_X_train)

In [None]:
# Training features: infer one vector per training document with the model of
# the same sub-dataset (inferred with infer_vector rather than read back from
# the vectors learned during training).
LCLS_X_train_d2v = np.array([LCLS_model.infer_vector((doc.split(' '))) for doc in LCLS_X_train])
LCHS_X_train_d2v = np.array([LCHS_model.infer_vector((doc.split(' '))) for doc in LCHS_X_train])
HCLS_X_train_d2v = np.array([HCLS_model.infer_vector((doc.split(' '))) for doc in HCLS_X_train])
HCHS_X_train_d2v = np.array([HCHS_model.infer_vector((doc.split(' '))) for doc in HCHS_X_train])

In [None]:
pd.DataFrame(LCLS_X_train_d2v)

In [None]:
pd.DataFrame(LCHS_X_train_d2v)

In [None]:
pd.DataFrame(HCLS_X_train_d2v)

In [None]:
pd.DataFrame(HCHS_X_train_d2v)

### Testing set

In [None]:
# Test features: infer one vector per test document with the model that was
# trained on the matching training set.
LCLS_X_test_d2v = np.array([LCLS_model.infer_vector((doc.split(' '))) for doc in LCLS_X_test])
LCHS_X_test_d2v = np.array([LCHS_model.infer_vector((doc.split(' '))) for doc in LCHS_X_test])
HCLS_X_test_d2v = np.array([HCLS_model.infer_vector((doc.split(' '))) for doc in HCLS_X_test])
HCHS_X_test_d2v = np.array([HCHS_model.infer_vector((doc.split(' '))) for doc in HCHS_X_test])

## GloVe

In [None]:
# Location of the pre-trained GloVe vectors (the file name indicates the
# 100-dimensional 6B set).
glove_path = '../GloVe_wordvec/glove.6B.100d.txt'

# import GloVe word vectors into dictionary: word -> float32 vector
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Each line holds the word followed by its whitespace-separated coefficients.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# define function to create embedding matrix
def glove_embedding(comment, embeddings_index = embeddings_index, dim=100):
    """Average the GloVe vectors of the words in ``comment``.

    Args:
        comment: Whitespace-separated text.
        embeddings_index: Mapping word -> vector; defaults to the dictionary
            that was loaded when this function was defined.
        dim: Length of the vectors; must match those in ``embeddings_index``.

    Returns:
        A length-``dim`` array holding the mean of the vectors of all words
        found in ``embeddings_index``, or all zeros when none is found.
    """
    total = np.zeros(dim)
    hits = 0
    for token in comment.split():
        token_vec = embeddings_index.get(token)  # None for out-of-vocabulary words
        if token_vec is None:
            continue
        total += token_vec
        hits += 1
    return total / hits if hits else total

In [None]:
# train: one averaged GloVe vector per training review
LCLS_X_train_glove = np.array([glove_embedding(comment) for comment in LCLS_X_train])
LCHS_X_train_glove = np.array([glove_embedding(comment) for comment in LCHS_X_train])
HCLS_X_train_glove = np.array([glove_embedding(comment) for comment in HCLS_X_train])
HCHS_X_train_glove = np.array([glove_embedding(comment) for comment in HCHS_X_train])

In [None]:
pd.DataFrame(LCLS_X_train_glove)

In [None]:
pd.DataFrame(LCHS_X_train_glove)

In [None]:
pd.DataFrame(HCLS_X_train_glove)

In [None]:
pd.DataFrame(HCHS_X_train_glove)

In [None]:
# test: one averaged GloVe vector per test review
LCLS_X_test_glove = np.array([glove_embedding(comment) for comment in LCLS_X_test])
LCHS_X_test_glove = np.array([glove_embedding(comment) for comment in LCHS_X_test])
HCLS_X_test_glove = np.array([glove_embedding(comment) for comment in HCLS_X_test])
HCHS_X_test_glove = np.array([glove_embedding(comment) for comment in HCHS_X_test])

## BERT

In [None]:
import transformers
import torch

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# BERT(longformer) model: the embeddings in this section come from the
# pre-trained Longformer checkpoint named below, not from a BERT checkpoint.
# bert_embedding reads the globals `tokenizer` and `model` at call time, so
# rebinding either name later changes what that function uses.
model_name = 'allenai/longformer-base-4096'
tokenizer = transformers.LongformerTokenizer.from_pretrained(model_name)
model = transformers.LongformerModel.from_pretrained(model_name).to(device)

In [None]:
# define function
def bert_embedding(X_train):
    """Embed each text as the first-token hidden state of the global model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. Texts are
    encoded one at a time, without gradient tracking.

    Args:
        X_train: Iterable of text strings (used for both train and test sets).

    Returns:
        A list with one embedding (a list of floats) per input text.
    """
    embeddings = []
    for text in X_train:
        # Convert the text to the model's input format (special tokens added)
        # as PyTorch tensors. truncation=True caps the input at the
        # tokenizer's maximum length so an over-long text cannot exceed the
        # model's maximum sequence length.
        encoded_text = tokenizer.encode_plus(
            text, add_special_tokens=True, truncation=True, return_tensors='pt'
        ).to(device)

        # Run the pre-trained model to obtain the hidden states.
        with torch.no_grad():
            model_output = model(encoded_text['input_ids'], attention_mask=encoded_text['attention_mask'])

        # Take the vector at position 0 (the leading special token, i.e. the
        # [CLS] position) as the representation of the whole text.
        embeddings.append(model_output.last_hidden_state[:, 0, :].squeeze().tolist())
    return embeddings

In [None]:
# Embed the LCLS training reviews and display them as a table.
LCLS_X_train_bert = bert_embedding(LCLS_X_train)
pd.DataFrame(LCLS_X_train_bert)

In [None]:
# Embed the LCHS training reviews and display them as a table.
LCHS_X_train_bert = bert_embedding(LCHS_X_train)
pd.DataFrame(LCHS_X_train_bert)

In [None]:
# Embed the HCLS training reviews and display them as a table.
HCLS_X_train_bert = bert_embedding(HCLS_X_train)
pd.DataFrame(HCLS_X_train_bert)

In [None]:
# Embed the HCHS training reviews and display them as a table.
HCHS_X_train_bert = bert_embedding(HCHS_X_train)
pd.DataFrame(HCHS_X_train_bert)

### Testing set

In [None]:
# Embed the test reviews of every sub-dataset with the same model.
LCLS_X_test_bert = bert_embedding(LCLS_X_test)
LCHS_X_test_bert = bert_embedding(LCHS_X_test)
HCLS_X_test_bert = bert_embedding(HCLS_X_test)
HCHS_X_test_bert = bert_embedding(HCHS_X_test)

# Model

## Package loading & function define

In [39]:
# ML
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# DL
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten, Input
from keras.callbacks import EarlyStopping
from keras.layers import LSTM

# evaluation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer

import warnings

2024-06-30 00:53:37.489589: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-30 00:53:37.489614: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-30 00:53:37.492846: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-30 00:53:37.764554: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# define validation function
def mape(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (not x100).

    Both inputs are flattened to 1-D first. Without that, a column-vector
    prediction of shape (n, 1) against targets of shape (n,) broadcasts to an
    (n, n) matrix and yields a wrong value. A zero target is divided by
    machine epsilon instead of zero, so the result stays finite when the
    prediction is exact.

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predictions with the same number of elements.

    Returns:
        Mean of ``|y_true - y_pred| / max(|y_true|, eps)``.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    denominator = np.maximum(np.abs(y_true), np.finfo(np.float64).eps)
    return np.mean(np.abs(y_true - y_pred) / denominator)

def validation(model_name, X_train, y_train, word_vec_train):
    """Print 10-fold cross-validated MSE, MAE, MAPE and R^2 for an estimator.

    All four metrics are computed in a single cross-validation run, so the
    estimator is fitted 10 times. The original ran four separate
    ``cross_val_score`` calls (40 fits), and for estimators with internal
    randomness each metric came from a different set of fitted models.

    Args:
        model_name: The scikit-learn compatible estimator to evaluate
            (despite the parameter name, this is the estimator object).
        X_train: Training feature matrix.
        y_train: Training targets.
        word_vec_train: Label of the feature representation, used in the
            printed line only.

    Returns:
        None; the averaged metrics are printed.
    """
    # Local import: the notebook's import cell does not provide cross_validate.
    from sklearn.model_selection import cross_validate

    cv = 10
    scoring = {
        'mse': 'neg_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
        # Custom scorer; greater_is_better=False makes sklearn negate it.
        'mape': make_scorer(mape, greater_is_better=False),
        'r2': 'r2',
    }
    results = cross_validate(model_name, X_train, y_train, cv=cv, scoring=scoring)

    # The error scorers are returned negated; flip them back to positive.
    avg_mse = (-results['test_mse']).mean()
    avg_mae = (-results['test_mae']).mean()
    avg_mape = (-results['test_mape']).mean()
    avg_r2 = results['test_r2'].mean()

    print(f"{word_vec_train}'s MSE, MAE, MAPE, R^2: {avg_mse}, {avg_mae}, {avg_mape}, {avg_r2}")    

In [None]:
# define evaluation
def evaluation(y_test, y_pred, word_vec_test):
    """Print MSE, MAE, MAPE and R^2 of predictions against the true targets.

    Args:
        y_test: True target values.
        y_pred: Predicted values.
        word_vec_test: Label of the feature representation, used in the
            printed line only.
    """
    metric_fns = (
        mean_squared_error,
        mean_absolute_error,
        mean_absolute_percentage_error,
        r2_score,
    )
    mse_value, mae_value, mape_value, r2_value = [fn(y_test, y_pred) for fn in metric_fns]

    print(f"{word_vec_test}'s MSE, MAE, MAPE, R^2: {mse_value}, {mae_value}, {mape_value}, {r2_value}")

In [None]:
# define EarlyStopping callback shared by the Keras training calls below
early_stopping = EarlyStopping(
    monitor='val_loss',  # monitor the loss on the validation split
    patience=10,  # stop training if there is no improvement within 10 epochs
    verbose=1,  
    restore_best_weights=True  # restore the weights of the best epoch
)

In [43]:
# initialize word vector
# Lookup table: sub-dataset -> feature representation -> matrix, plus the
# matching targets. The 'y_train' / 'y_test' entries were missing although the
# Keras section reads Data[sub_dataset]['y_train'] and ['y_test'], which
# raised KeyError.
Data = {
    'LCLS': {
        'tf': LCLS_X_train_bow_df,
        'tf-idf': LCLS_X_train_tfidf_df,
        'd2v': LCLS_X_train_d2v,
        'glove': LCLS_X_train_glove,
        'bert': LCLS_X_train_bert,
        'tf_test': LCLS_X_test_bow_df,
        'tf-idf_test': LCLS_X_test_tfidf_df,
        'd2v_test': LCLS_X_test_d2v,
        'glove_test': LCLS_X_test_glove,
        'bert_test': LCLS_X_test_bert,
        'y_train': LCLS_y_train,
        'y_test': LCLS_y_test,
    },
    'LCHS': {
        'tf': LCHS_X_train_bow_df,
        'tf-idf': LCHS_X_train_tfidf_df,
        'd2v': LCHS_X_train_d2v,
        'glove': LCHS_X_train_glove,
        'bert': LCHS_X_train_bert,
        'tf_test': LCHS_X_test_bow_df,
        'tf-idf_test': LCHS_X_test_tfidf_df,
        'd2v_test': LCHS_X_test_d2v,
        'glove_test': LCHS_X_test_glove,
        'bert_test': LCHS_X_test_bert,
        'y_train': LCHS_y_train,
        'y_test': LCHS_y_test,
    },
    'HCLS': {
        'tf': HCLS_X_train_bow_df,
        'tf-idf': HCLS_X_train_tfidf_df,
        'd2v': HCLS_X_train_d2v,
        'glove': HCLS_X_train_glove,
        'bert': HCLS_X_train_bert,
        'tf_test': HCLS_X_test_bow_df,
        'tf-idf_test': HCLS_X_test_tfidf_df,
        'd2v_test': HCLS_X_test_d2v,
        'glove_test': HCLS_X_test_glove,
        'bert_test': HCLS_X_test_bert,
        'y_train': HCLS_y_train,
        'y_test': HCLS_y_test,
    },
    'HCHS': {
        'tf': HCHS_X_train_bow_df,
        'tf-idf': HCHS_X_train_tfidf_df,
        'd2v': HCHS_X_train_d2v,
        'glove': HCHS_X_train_glove,
        'bert': HCHS_X_train_bert,
        'tf_test': HCHS_X_test_bow_df,
        'tf-idf_test': HCHS_X_test_tfidf_df,
        'd2v_test': HCHS_X_test_d2v,
        'glove_test': HCHS_X_test_glove,
        'bert_test': HCHS_X_test_bert,
        'y_train': HCHS_y_train,
        'y_test': HCHS_y_test,
    }
}


## ML model
* SVR
* Random Forest
* XGBoost

### SVR

In [40]:
# Feature representation used by the SVR cells: keys into Data[<sub-dataset>].
# These two names are reassigned again in the Keras section further down.
word_vec_train = 'tf-idf'
word_vec_test = 'tf-idf_test'

In [44]:
# One SVR (RBF kernel, epsilon=0.2) per sub-dataset.
svr_LCLS = SVR(epsilon=0.2, kernel='rbf')
svr_LCHS = SVR(epsilon=0.2, kernel='rbf')
svr_HCLS = SVR(epsilon=0.2, kernel='rbf')
svr_HCHS = SVR(epsilon=0.2, kernel='rbf')

In [45]:
# Fit each SVR on the training features selected by word_vec_train.
svr_LCLS.fit(Data['LCLS'][word_vec_train], LCLS_y_train)
svr_LCHS.fit(Data['LCHS'][word_vec_train], LCHS_y_train)
svr_HCLS.fit(Data['HCLS'][word_vec_train], HCLS_y_train)
svr_HCHS.fit(Data['HCHS'][word_vec_train], HCHS_y_train)

In [None]:
# 10-fold cross-validated metrics of the SVR on the LCLS training data.
validation(svr_LCLS, Data['LCLS'][word_vec_train], LCLS_y_train, word_vec_train)

In [None]:
# 10-fold cross-validated metrics of the SVR on the LCHS training data.
validation(svr_LCHS, Data['LCHS'][word_vec_train], LCHS_y_train, word_vec_train)

In [None]:
# 10-fold cross-validated metrics of the SVR on the HCLS training data.
validation(svr_HCLS, Data['HCLS'][word_vec_train], HCLS_y_train, word_vec_train)

In [None]:
# 10-fold cross-validated metrics of the SVR on the HCHS training data.
validation(svr_HCHS, Data['HCHS'][word_vec_train], HCHS_y_train, word_vec_train)

In [None]:
# prediction: apply each fitted SVR to the test features of its sub-dataset
svr_y_pred_LCLS = svr_LCLS.predict(Data['LCLS'][word_vec_test])
svr_y_pred_LCHS = svr_LCHS.predict(Data['LCHS'][word_vec_test])
svr_y_pred_HCLS = svr_HCLS.predict(Data['HCLS'][word_vec_test])
svr_y_pred_HCHS = svr_HCHS.predict(Data['HCHS'][word_vec_test])

In [None]:
# LCLS: test-set metrics of the SVR
evaluation(LCLS_y_test, svr_y_pred_LCLS, word_vec_test)

In [None]:
# LCHS: test-set metrics of the SVR
evaluation(LCHS_y_test, svr_y_pred_LCHS, word_vec_test)

In [None]:
# HCLS: test-set metrics of the SVR
evaluation(HCLS_y_test, svr_y_pred_HCLS, word_vec_test)

In [None]:
# HCHS: test-set metrics of the SVR
evaluation(HCHS_y_test, svr_y_pred_HCHS, word_vec_test)

### Random forest

In [None]:
# Feature representation used by the random-forest cells.
word_vec_train_rf = 'bert'
word_vec_test_rf = 'bert_test'

In [None]:
# One random-forest regressor per sub-dataset, with library defaults.
# NOTE(review): no random_state is passed, so results presumably differ from
# run to run — consider fixing a seed for reproducibility.
rf_LCLS = RandomForestRegressor()
rf_LCHS = RandomForestRegressor()
rf_HCLS = RandomForestRegressor()
rf_HCHS = RandomForestRegressor()

In [None]:
# Fit each forest on the training features selected by word_vec_train_rf.
rf_LCLS.fit(Data['LCLS'][word_vec_train_rf], LCLS_y_train)
rf_LCHS.fit(Data['LCHS'][word_vec_train_rf], LCHS_y_train)
rf_HCLS.fit(Data['HCLS'][word_vec_train_rf], HCLS_y_train)
rf_HCHS.fit(Data['HCHS'][word_vec_train_rf], HCHS_y_train)

In [None]:
# 10-fold cross-validated metrics of the random forest on the LCLS training data.
validation(rf_LCLS, Data['LCLS'][word_vec_train_rf], LCLS_y_train, word_vec_train_rf)

In [None]:
# 10-fold cross-validated metrics of the random forest on the LCHS training data.
validation(rf_LCHS, Data['LCHS'][word_vec_train_rf], LCHS_y_train, word_vec_train_rf)

In [None]:
# 10-fold cross-validated metrics of the random forest on the HCLS training data.
validation(rf_HCLS, Data['HCLS'][word_vec_train_rf], HCLS_y_train, word_vec_train_rf)

In [None]:
# 10-fold cross-validated metrics of the random forest on the HCHS training data.
validation(rf_HCHS, Data['HCHS'][word_vec_train_rf], HCHS_y_train, word_vec_train_rf)

In [None]:
# prediction: apply each fitted forest to the test features of its sub-dataset
rf_y_pred_LCLS = rf_LCLS.predict(Data['LCLS'][word_vec_test_rf])
rf_y_pred_LCHS = rf_LCHS.predict(Data['LCHS'][word_vec_test_rf])
rf_y_pred_HCLS = rf_HCLS.predict(Data['HCLS'][word_vec_test_rf])
rf_y_pred_HCHS = rf_HCHS.predict(Data['HCHS'][word_vec_test_rf])

In [None]:
# evaluation: test-set metrics of the random forests, printed in the order
# LCLS, LCHS, HCLS, HCHS
evaluation(LCLS_y_test, rf_y_pred_LCLS, word_vec_test_rf)
evaluation(LCHS_y_test, rf_y_pred_LCHS, word_vec_test_rf)
evaluation(HCLS_y_test, rf_y_pred_HCLS, word_vec_test_rf)
evaluation(HCHS_y_test, rf_y_pred_HCHS, word_vec_test_rf)

### XGBOOST

In [None]:
# Feature representation used by the XGBoost cells ('tf' = bag-of-words counts).
word_vec_train_xgb = 'tf'
word_vec_test_xgb = 'tf_test'

In [None]:
# One XGBoost regressor per sub-dataset, with library defaults.
xgb_LCLS = xgb.XGBRegressor()
xgb_LCHS = xgb.XGBRegressor()
xgb_HCLS = xgb.XGBRegressor()
xgb_HCHS = xgb.XGBRegressor()

In [None]:
# Fit each regressor on the training features selected by word_vec_train_xgb.
xgb_LCLS.fit(Data['LCLS'][word_vec_train_xgb], LCLS_y_train)
xgb_LCHS.fit(Data['LCHS'][word_vec_train_xgb], LCHS_y_train)
xgb_HCLS.fit(Data['HCLS'][word_vec_train_xgb], HCLS_y_train)
xgb_HCHS.fit(Data['HCHS'][word_vec_train_xgb], HCHS_y_train)

In [None]:
# 10-fold cross-validated metrics of XGBoost on the LCLS training data.
validation(xgb_LCLS, Data['LCLS'][word_vec_train_xgb], LCLS_y_train, word_vec_train_xgb)

In [None]:
# 10-fold cross-validated metrics of XGBoost on the LCHS training data.
validation(xgb_LCHS, Data['LCHS'][word_vec_train_xgb], LCHS_y_train, word_vec_train_xgb)

In [None]:
# 10-fold cross-validated metrics of XGBoost on the HCLS training data.
validation(xgb_HCLS, Data['HCLS'][word_vec_train_xgb], HCLS_y_train, word_vec_train_xgb)

In [None]:
# 10-fold cross-validated metrics of XGBoost on the HCHS training data.
validation(xgb_HCHS, Data['HCHS'][word_vec_train_xgb], HCHS_y_train, word_vec_train_xgb)

In [None]:
# prediction: apply each fitted regressor to the test features of its sub-dataset
xgb_y_pred_LCLS = xgb_LCLS.predict(Data['LCLS'][word_vec_test_xgb])
xgb_y_pred_LCHS = xgb_LCHS.predict(Data['LCHS'][word_vec_test_xgb])
xgb_y_pred_HCLS = xgb_HCLS.predict(Data['HCLS'][word_vec_test_xgb])
xgb_y_pred_HCHS = xgb_HCHS.predict(Data['HCHS'][word_vec_test_xgb])

In [None]:
# Test-set metrics of XGBoost, printed in the order LCLS, LCHS, HCLS, HCHS
evaluation(LCLS_y_test, xgb_y_pred_LCLS, word_vec_test_xgb)
evaluation(LCHS_y_test, xgb_y_pred_LCHS, word_vec_test_xgb)
evaluation(HCLS_y_test, xgb_y_pred_HCLS, word_vec_test_xgb)
evaluation(HCHS_y_test, xgb_y_pred_HCHS, word_vec_test_xgb)

## DL Model

## MLP

### Pytorch(Error)

In [None]:
LCLS_X_train_tfidf_df

In [None]:
LCLS_y_train

In [None]:
# step1: convert the LCLS TF-IDF DataFrame and the ratings (y) to float32
# PyTorch tensors. `y` is built from a 1-D Series, so it has one dimension.
X = torch.tensor(LCLS_X_train_tfidf_df.values, dtype=torch.float32)
y = torch.tensor(LCLS_y_train.values, dtype=torch.float32)

In [None]:
# step2: construct the PyTorch dataloader (shuffled mini-batches of 128).
# `batch_size` is a notebook-level name that the Keras section rebinds later.
from torch.utils.data import DataLoader, TensorDataset
dataset = TensorDataset(X, y)
batch_size = 128
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# step3: define model
from torch import nn  # `nn` was used below without ever being imported


class MLPModel(nn.Module):
    """Feed-forward regressor: three hidden layers followed by a linear output.

    Layer sizes are ``input_size -> hidden_size -> hidden_size -> 64 ->
    output_size``. Every hidden layer is followed by ReLU; the second and
    third are additionally followed by dropout (p=0.3).

    The original forward pass applied no activation between ``fc1``, ``fc2``
    and ``fc3``, so those layers composed into a single linear map (apart
    from dropout noise during training).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.dropout1 = nn.Dropout(p=0.3)
        self.fc3 = nn.Linear(hidden_size, 64)
        self.dropout2 = nn.Dropout(p=0.3)
        self.relu = nn.ReLU()
        self.fc4 = nn.Linear(64, output_size)

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, output_size)``."""
        x = self.relu(self.fc1(x))
        x = self.dropout1(self.relu(self.fc2(x)))
        x = self.dropout2(self.relu(self.fc3(x)))
        return self.fc4(x)

input_size = LCLS_X_train_tfidf_df.shape[1]  # dimension of the input features
hidden_size = 128  # number of neurons in the hidden layers
output_size = 1  # the predicted rating value

# NOTE: this rebinds the global `model`, which until now referred to the
# Longformer model that bert_embedding reads at call time.
model = MLPModel(input_size, hidden_size, output_size)

In [None]:
model

In [None]:
dataloader

In [None]:
# step 4: training
from torch import nn, optim  # neither name was imported anywhere above

criterion = nn.MSELoss()  # use mean-squared-error loss
optimizer = optim.Adam(model.parameters(), lr=0.01)

num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        # Flatten both sides before comparing: the model emits (batch, 1)
        # while the labels are (batch,), which MSELoss would otherwise
        # broadcast against each other.
        outputs = model(inputs).view(-1)
        loss = criterion(outputs, labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean batch loss of this epoch.
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}')

### Keras

In [None]:
# Selection for the Keras cells: the sub-dataset and the feature representation
# ('tf' = bag-of-words counts). This reassigns word_vec_train / word_vec_test,
# which the SVR cells above also use.
sub_dataset = 'LCLS'
word_vec_train = 'tf'
word_vec_test = 'tf_test'

In [None]:
# define variables: features and targets of the selected sub-dataset.
# Expects Data[sub_dataset] to provide 'y_train' and 'y_test' entries in
# addition to the feature matrices.
subdata_X_train_embedding = Data[sub_dataset][word_vec_train]
subdata_y_train = Data[sub_dataset]['y_train']
subdata_X_test_embedding = Data[sub_dataset][word_vec_test]
subdata_y_test = Data[sub_dataset]['y_test']

In [None]:
# find best parameter list
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from IPython.display import clear_output

def MLP_model_para(X_train_embedding, params):
    """Build and compile a two-hidden-layer MLP from a hyper-parameter dict.

    Args:
        X_train_embedding: 2-D feature matrix; only ``shape[1]`` is used, to
            size the input layer.
        params: Mapping with the keys ``'units'``, ``'units_h'``,
            ``'activation'``, ``'dropout'`` and ``'optimizer'``.

    Returns:
        A compiled Keras ``Sequential`` model with mean-squared-error loss.
    """
    nn_model = Sequential()
    nn_model.add(Dense(units=int(params['units']), input_dim=X_train_embedding.shape[1], activation=params['activation']))
    nn_model.add(Dropout(params['dropout']))
    nn_model.add(Dense(units=int(params['units_h']), activation=params['activation']))
    nn_model.add(Dropout(params['dropout']))
    # Linear output for regression. The original passed params['activation']
    # here despite its own "use linear" comment; with 'sigmoid' the output is
    # confined to (0, 1) and can never reach the rating values being predicted.
    nn_model.add(Dense(1, activation='linear'))

    nn_model.compile(loss='mean_squared_error', optimizer=params['optimizer'])
    return nn_model

def objective(params):
    """Hyperopt objective: train one MLP and report its validation MSE.

    The loss is the last epoch's ``val_loss`` on the 20% validation split
    held out from the training data. The original scored every candidate on
    the test set, which leaks test information into hyper-parameter
    selection and makes the final test metrics optimistic.

    Args:
        params: Sampled hyper-parameters (see ``space``).

    Returns:
        ``{'loss': <validation MSE>, 'status': STATUS_OK}``.
    """
    # np.asarray accepts DataFrames as well as the ndarray/list features.
    X_fit = np.asarray(subdata_X_train_embedding)
    y_fit = np.asarray(subdata_y_train)

    model = MLP_model_para(X_fit, params)
    history = model.fit(
        X_fit,
        y_fit,
        epochs=int(params['epochs']),
        batch_size=int(params['batch_size']),
        validation_split=0.2,
        verbose=0,
    )
    # The model is compiled with mean-squared-error loss, so val_loss is an MSE.
    val_mse = float(history.history['val_loss'][-1])
    clear_output(wait=True)  # clear output
    return {'loss': val_mse, 'status': STATUS_OK}

# Hyperopt search space. quniform(low, high, q) yields floats quantised to
# multiples of q, which is why the consumers cast them with int().
# NOTE(review): 'dropout' may be sampled arbitrarily close to 1, which would
# drop almost every unit — consider a tighter upper bound.
space = {
    'units': hp.quniform('units', 32, 256, 32),
    'units_h': hp.quniform('units_h', 32, 256, 32),
    'activation': hp.choice('activation', ['relu', 'sigmoid']),
    'dropout': hp.uniform('dropout', 0, 1),
    'optimizer': hp.choice('optimizer', ['adam', 'rmsprop', 'sgd']),
    'epochs': hp.quniform('epochs', 10, 100, 10),
    'batch_size': hp.quniform('batch_size', 16, 256, 32)
}

# Run 50 TPE evaluations of `objective` over `space`.
trials = Trials()  # Create a trials object to track the progress
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)
# NOTE(review): for hp.choice parameters `best` presumably holds the index of
# the chosen option rather than the option itself — confirm before reading it.
print("Best hyperparameters:", best)

# Can also access the results and losses from the trials object
losses = [trial['loss'] for trial in trials.results]
best_loss = min(losses)
print("Best MSE:", best_loss)

In [None]:
best

In [None]:
# Model parameter: batch_size and dropout are read from the search result;
# the remaining values are entered by hand.
# NOTE: `optimizer` and `batch_size` reuse names bound earlier in the
# notebook (the PyTorch optimizer and DataLoader batch size).
activation = 'relu'
batch_size = int(best['batch_size'])
dropout = best['dropout']
epochs = 70
optimizer = 'rmsprop'
units = 256
units_h = 32

In [None]:
def MLP_model(X_train_embedding, y_train, units=128, units_h=64, dropout=0.3,
              activation='relu', output_activation='relu', optimizer='adam'):
    """Build and compile the two-hidden-layer Keras MLP regressor.

    The architecture settings used to be hard-coded, so tuned values could
    not be passed in. They are now keyword arguments whose defaults reproduce
    the original network exactly.

    Args:
        X_train_embedding: 2-D feature matrix; only its second dimension is
            used, to size the input layer.
        y_train: Unused; kept for interface compatibility.
        units: Width of the first dense layer.
        units_h: Width of the second dense layer.
        dropout: Dropout rate applied after each of the two dense layers.
        activation: Activation of the two dense layers.
        output_activation: Activation of the single-unit output layer.
        optimizer: Optimizer passed to ``compile``.

    Returns:
        A compiled ``Sequential`` model with mean-squared-error loss.
    """
    n_features = np.shape(X_train_embedding)[1]

    nn_model = Sequential()
    # Input layer
    nn_model.add(Dense(units=units, input_dim=n_features, activation=activation))
    # Hidden layers
    nn_model.add(Dropout(dropout))
    nn_model.add(Dense(units=units_h, activation=activation))
    nn_model.add(Dropout(dropout))
    # Output layer
    nn_model.add(Dense(1, activation=output_activation))

    nn_model.compile(loss='mean_squared_error', optimizer=optimizer)
    return nn_model

In [None]:
def cross_val_metrics(X, y, n_splits=10):
    """K-fold cross-validate the MLP and collect per-fold regression metrics.

    A fresh ``MLP_model`` is trained for every fold using the module-level
    ``epochs``, ``batch_size`` and ``early_stopping`` settings.

    Args:
        X: Feature table exposing ``.to_numpy()`` (e.g. a DataFrame).
        y: Targets exposing ``.to_numpy()`` (e.g. a Series).
        n_splits: Number of shuffled folds (``random_state=42``).

    Returns:
        tuple: ``(mse_scores, mae_scores, mape_scores, r2_scores)``, four
        lists holding one value per fold.
    """
    features = X.to_numpy()
    targets = y.to_numpy()
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    mse_scores, mae_scores, mape_scores, r2_scores = [], [], [], []

    for train_index, test_index in folds.split(features):
        # New, untrained model per fold so no weights carry over.
        model = MLP_model(features, targets)
        model.fit(
            features[train_index],
            targets[train_index],
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            verbose=0,
            callbacks=[early_stopping],
        )

        y_test = targets[test_index]
        y_pred = model.predict(features[test_index]).flatten()

        mse_scores.append(mean_squared_error(y_test, y_pred))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        mape_scores.append(mean_absolute_percentage_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    return mse_scores, mae_scores, mape_scores, r2_scores

In [None]:
# Cross-validate on the training split; UserWarnings are silenced from here on.
warnings.filterwarnings('ignore', category=UserWarning)
mse_scores, mae_scores, mape_scores, r2_scores = cross_val_metrics(
    subdata_X_train_embedding,
    subdata_y_train,
)

# Report the fold-averaged value of each metric.
print(
    f"{sub_dataset}'s MSE, MAE, MAPE, R^2: "
    f"{np.mean(mse_scores)}, {np.mean(mae_scores)}, "
    f"{np.mean(mape_scores)}, {np.mean(r2_scores)}"
)

In [None]:
# Fit the final MLP on the whole training split.
mlp_model = MLP_model(subdata_X_train_embedding, subdata_y_train)
mlp_model.fit(
    subdata_X_train_embedding,
    subdata_y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping],
)

In [None]:
# Predict on the test split and convert the predictions to 1-dim.
mlp_y_pred = mlp_model.predict(subdata_X_test_embedding).ravel()

# Score the predictions against the test targets.
evaluation(subdata_y_test, mlp_y_pred)

## Text-CNN

In [1]:
# Keys used below to pick the Text-CNN inputs out of `Data`:
# the sub-dataset, then its training and test embedding tables.
sub_dataset_cnn = 'HCHS'
word_vec_train_cnn = 'bert'
word_vec_test_cnn = 'bert_test'

In [None]:
# Training features/targets for the selected sub-dataset and embedding.
subdata_X_train_embedding, subdata_y_train = (
    Data[sub_dataset_cnn][word_vec_train_cnn],
    Data[sub_dataset_cnn]['y_train'],
)
# Matching test features/targets.
subdata_X_test_embedding, subdata_y_test = (
    Data[sub_dataset_cnn][word_vec_test_cnn],
    Data[sub_dataset_cnn]['y_test'],
)

In [None]:
# find best parameter list
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from IPython.display import clear_output

# Fill missing values with column means, in place.
# Test *features* are filled with the TRAINING column means so that no
# statistic computed on the test set enters preprocessing.
# NOTE(review): these names were bound straight from `Data` without a copy,
# so the in-place fill presumably changes the objects held in `Data` too.
subdata_X_train_embedding.fillna(subdata_X_train_embedding.mean(), inplace=True)
subdata_y_train.fillna(subdata_y_train.mean(), inplace=True)
subdata_X_test_embedding.fillna(subdata_X_train_embedding.mean(), inplace=True)
# NOTE(review): imputing missing test *targets* invents labels; consider
# dropping those rows instead. Left as it was.
subdata_y_test.fillna(subdata_y_test.mean(), inplace=True)

def CNN_model_para(X_train_shape, params):
    """Build and compile a 1-D CNN regressor from one hyperopt sample.

    Args:
        X_train_shape: Shape of the training matrix; only index 1 is read,
            giving an input shape of ``(X_train_shape[1], 1)``.
        params: Mapping with the keys ``filters``, ``filters_1``,
            ``kernel_size``, ``pool_size``, ``dropout``, ``dense_units`` and
            ``optimizer``.

    Returns:
        A compiled ``Sequential`` model with mean-squared-error loss.
    """
    n_features = X_train_shape[1]
    kernel = int(params['kernel_size'])

    stack = [
        # Two stacked Conv1D layers followed by a single max-pooling layer
        Conv1D(filters=int(params['filters']), kernel_size=kernel, activation='relu', padding='same', input_shape=(n_features, 1)),
        Conv1D(filters=int(params['filters_1']), kernel_size=kernel, activation='relu', padding='same'),
        MaxPooling1D(int(params['pool_size']), padding='same'),
        Flatten(),
        # Fully connected part
        Dropout(params['dropout']),
        Dense(units=int(params['dense_units']), activation='relu'),
        # Single linear unit for regression
        Dense(1, activation='linear'),
    ]

    cnn_model = Sequential(stack)
    cnn_model.compile(optimizer=params['optimizer'], loss='mean_squared_error')
    return cnn_model

def objective(params):
    """Hyperopt objective for the Text-CNN: train one candidate and score it.

    The candidate is scored on a hold-out carved from the *training* data.
    Scoring on ``subdata_X_test_embedding`` (as before) lets the test set
    steer model selection, so the final test metrics stop being an unbiased
    estimate.

    Args:
        params: One sample drawn from ``space``. ``epochs`` and ``batch_size``
            are cast to ``int`` before being handed to Keras.

    Returns:
        dict: ``{'loss': <hold-out MSE>, 'status': STATUS_OK}``.
    """
    # Fixed random_state: every trial is compared on the same hold-out rows.
    X_fit, X_val, y_fit, y_val = train_test_split(
        subdata_X_train_embedding, subdata_y_train, test_size=0.2, random_state=42
    )

    cnn_model = CNN_model_para(subdata_X_train_embedding.shape, params)
    cnn_model.fit(
        X_fit,
        y_fit,
        epochs=int(params['epochs']),
        batch_size=int(params['batch_size']),
        validation_split=0.2,
        verbose=0,
        callbacks=[early_stopping],
    )

    # predict() returns a column vector; flatten it to match y_val.
    y_pred = cnn_model.predict(X_val).ravel()
    mse = mean_squared_error(y_val, y_pred)
    return {'loss': mse, 'status': STATUS_OK}

# Search space for the Text-CNN. The quniform entries are quantised to
# multiples of their last argument; consumers cast them with int().
space = dict(
    filters=hp.quniform('filters', 32, 256, 32),
    filters_1=hp.quniform('filters_1', 32, 256, 32),
    kernel_size=hp.choice('kernel_size', [7, 9, 11, 13]),
    pool_size=hp.choice('pool_size', [2, 3, 5]),
    dropout=hp.uniform('dropout', 0, 1),
    dense_units=hp.quniform('dense_units', 32, 256, 32),
    optimizer=hp.choice('optimizer', ['adam', 'rmsprop', 'sgd']),
    epochs=hp.quniform('epochs', 10, 100, 10),
    batch_size=hp.quniform('batch_size', 16, 256, 32),
)

# Run 30 TPE evaluations; `trials` keeps the per-evaluation results.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=30,
    trials=trials,
)
print("Best hyperparameters:", best)

# The results and losses can also be read from the trials object.
losses = [result['loss'] for result in trials.results]
best_loss = min(losses)
print("Best MSE:", best_loss)

In [None]:
# Display the raw dict returned by fmin.
# NOTE(review): for hp.choice entries ('kernel_size', 'pool_size',
# 'optimizer') fmin appears to report the index of the chosen option rather
# than the option itself — confirm before copying values from this dict.
best

In [None]:
# Output recorded from an earlier search run:
# Best hyperparameters: {'batch_size': 32.0, 'dense_units': 192.0, 'dropout': 0.20990214723448342, 'epochs': 2, 'filters': 224.0, 'filters_1': 96.0, 'kernel_size': 3, 'optimizer': 2, 'pool_size': 1}
# NOTE(review): the hand-set values below do not obviously correspond to that
# record (its 'kernel_size' / 'optimizer' / 'pool_size' look like option
# indices) — confirm which run they come from.

# Read back from the current search result:
batch_size = int(best['batch_size'])
dense_units = int(best['dense_units'])
filters = int(best['filters'])
filters_1 = int(best['filters_1'])
dropout = best['dropout']

# Set by hand:
epochs = 70
kernel_size = 9
pool_size = 5
optimizer = 'rmsprop'

In [None]:
def CNN_model(X_train_embedding, y_train):
    """Build and compile the Text-CNN regressor from the module-level settings.

    Reads ``filters``, ``filters_1``, ``kernel_size``, ``pool_size``,
    ``dropout``, ``dense_units`` and ``optimizer`` from the notebook namespace.

    Args:
        X_train_embedding: Training feature matrix; only ``shape[1]`` is read,
            giving an input shape of ``(shape[1], 1)``.
        y_train: Not used by the builder.

    Returns:
        A compiled ``Sequential`` model with mean-squared-error loss.
    """
    input_shape = (X_train_embedding.shape[1], 1)

    cnn_model = Sequential()
    for layer in (
        # Two Conv1D layers, then one MaxPooling1D layer
        Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', padding='same', input_shape=input_shape),
        Conv1D(filters=filters_1, kernel_size=kernel_size, activation='relu', padding='same'),
        MaxPooling1D(pool_size, padding='same'),
        # 2nd Conv1D + MaxPooling1D layer
        # cnn_model.add(Conv1D(filters=int(filters*2), kernel_size=kernel_size, activation='relu', padding='same'))
        # cnn_model.add(MaxPooling1D(pool_size, padding='same'))
        Flatten(),
        # Fully connected part
        Dropout(dropout),
        Dense(units=dense_units, activation='relu'),
        # Single linear unit for regression
        Dense(1, activation='linear'),
    ):
        cnn_model.add(layer)

    cnn_model.compile(optimizer=optimizer, loss='mean_squared_error')
    return cnn_model

In [None]:
def cross_val_metrics(X, y, n_splits=10):
    """K-fold cross-validate the Text-CNN and collect per-fold metrics.

    A fresh ``CNN_model`` is trained for every fold using the module-level
    ``epochs``, ``batch_size`` and ``early_stopping`` settings.

    Args:
        X: Feature table exposing ``.to_numpy()`` (e.g. a DataFrame).
        y: Targets exposing ``.to_numpy()`` (e.g. a Series).
        n_splits: Number of shuffled folds (``random_state=42``).

    Returns:
        tuple: ``(mse_scores, mae_scores, mape_scores, r2_scores)``, four
        lists holding one value per fold.
    """
    X = X.to_numpy()
    y = y.to_numpy()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # One list per reported metric, in return order.
    metric_fns = (
        mean_squared_error,
        mean_absolute_error,
        mean_absolute_percentage_error,
        r2_score,
    )
    mse_scores, mae_scores, mape_scores, r2_scores = [], [], [], []
    score_lists = (mse_scores, mae_scores, mape_scores, r2_scores)

    for train_index, test_index in kf.split(X):
        # New, untrained model per fold so no weights carry over.
        model = CNN_model(X, y)
        model.fit(
            X[train_index],
            y[train_index],
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            verbose=0,
            callbacks=[early_stopping],
        )

        y_test = y[test_index]
        y_pred = model.predict(X[test_index]).flatten()

        for scores, metric in zip(score_lists, metric_fns):
            scores.append(metric(y_test, y_pred))

    return mse_scores, mae_scores, mape_scores, r2_scores

In [None]:
# Cross-validate on the training split; UserWarnings are silenced from here on.
warnings.filterwarnings('ignore', category=UserWarning)
mse_scores, mae_scores, mape_scores, r2_scores = cross_val_metrics(
    subdata_X_train_embedding,
    subdata_y_train,
)
# Report the fold-averaged value of each metric.
print(
    f"{sub_dataset_cnn}'s MSE, MAE, MAPE, R^2: "
    f"{np.mean(mse_scores)}, {np.mean(mae_scores)}, "
    f"{np.mean(mape_scores)}, {np.mean(r2_scores)}"
)

In [None]:
# Fit the final Text-CNN on the whole training split.
cnn_model = CNN_model(subdata_X_train_embedding, subdata_y_train)
cnn_model.fit(
    subdata_X_train_embedding,
    subdata_y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping],
)

In [None]:
# Predict on the test split and convert the predictions to 1-dim.
cnn_y_pred = cnn_model.predict(subdata_X_test_embedding).ravel()

# Score the predictions against the test targets.
evaluation(subdata_y_test, cnn_y_pred)

## LSTM

In [None]:
# Keys used below to pick the LSTM inputs out of `Data`:
# the sub-dataset, then its training and test embedding tables.
sub_dataset_lstm = 'HCHS'
word_vec_train_lstm = 'bert'
word_vec_test_lstm = 'bert_test'

In [None]:
# Training features/targets for the selected sub-dataset and embedding.
subdata_X_train_embedding, subdata_y_train = (
    Data[sub_dataset_lstm][word_vec_train_lstm],
    Data[sub_dataset_lstm]['y_train'],
)
# Matching test features/targets.
subdata_X_test_embedding, subdata_y_test = (
    Data[sub_dataset_lstm][word_vec_test_lstm],
    Data[sub_dataset_lstm]['y_test'],
)

In [None]:
# Turn each 2-D table (rows, features) into a 3-D array (rows, 1, features)
# by inserting a length-1 middle axis.
subdata_X_train_embedding = subdata_X_train_embedding.to_numpy()[:, np.newaxis, :]
subdata_X_test_embedding = subdata_X_test_embedding.to_numpy()[:, np.newaxis, :]

In [None]:
# find best parameter list
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from IPython.display import clear_output

def lstm_model_para(X_train_embedding, params):
    """Build and compile a single-layer LSTM regressor from one hyperopt sample.

    Args:
        X_train_embedding: 3-D training array; ``shape[1]`` and ``shape[2]``
            become the LSTM input shape.
        params: Mapping with the keys ``units``, ``return_sequences``,
            ``dropout`` and ``optimizer``.

    Returns:
        A compiled ``Sequential`` model with mean-squared-error loss.
    """
    timesteps, n_features = X_train_embedding.shape[1], X_train_embedding.shape[2]

    recurrent = LSTM(
        units=int(params['units']),
        return_sequences=params['return_sequences'],
        input_shape=(timesteps, n_features),
    )
    lstm_model = Sequential([
        recurrent,
        Dropout(params['dropout']),
        Dense(units=1, activation='linear'),
    ])
    lstm_model.compile(optimizer=params['optimizer'], loss='mean_squared_error')
    return lstm_model

def objective(params):
    """Hyperopt objective for the LSTM: train one candidate and score it.

    The candidate is scored on a hold-out carved from the *training* data.
    Scoring on ``subdata_X_test_embedding`` (as before) lets the test set
    steer model selection, so the final test metrics stop being an unbiased
    estimate.

    Args:
        params: One sample drawn from ``space``. ``epochs`` and ``batch_size``
            are cast to ``int`` before being handed to Keras.

    Returns:
        dict: ``{'loss': <hold-out MSE>, 'status': STATUS_OK}``.
    """
    # Fixed random_state: every trial is compared on the same hold-out rows.
    X_fit, X_val, y_fit, y_val = train_test_split(
        subdata_X_train_embedding, subdata_y_train, test_size=0.2, random_state=42
    )

    lstm_model = lstm_model_para(subdata_X_train_embedding, params)
    lstm_model.fit(
        X_fit,
        y_fit,
        epochs=int(params['epochs']),
        batch_size=int(params['batch_size']),
        validation_split=0.2,
        verbose=0,
        callbacks=[early_stopping],
    )

    # Flatten so the prediction is 1-D whatever return_sequences was sampled.
    y_pred = lstm_model.predict(X_val).flatten()
    mse = mean_squared_error(y_val, y_pred)
    clear_output(wait=True)  # keep the notebook output short between trials
    return {'loss': mse, 'status': STATUS_OK}

# Search space for the LSTM. The quniform entries are quantised to multiples
# of their last argument; consumers cast them with int().
space = dict(
    units=hp.quniform('units', 16, 256, 32),
    return_sequences=hp.choice('return_sequences', [True, False]),
    dropout=hp.uniform('dropout', 0, 0.5),
    optimizer=hp.choice('optimizer', ['adam', 'rmsprop', 'sgd']),
    epochs=hp.quniform('epochs', 10, 100, 10),
    batch_size=hp.quniform('batch_size', 16, 256, 32),
)

# Run 50 TPE evaluations; `trials` keeps the per-evaluation results.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

print("Best hyperparameters:", best)

# The smallest loss recorded across all evaluations.
losses = [result['loss'] for result in trials.results]
best_loss = min(losses)
print("Best MSE:", best_loss)

In [None]:
# Display the raw dict returned by fmin.
# NOTE(review): for hp.choice entries ('return_sequences', 'optimizer') fmin
# appears to report the index of the chosen option rather than the option
# itself — confirm before copying values from this dict.
best

In [None]:
# Final LSTM settings.
# Read back from the search result:
batch_size = int(best['batch_size'])
units = int(best['units'])
dropout = best['dropout']

# Set by hand:
epochs = 100
optimizer = 'adam'
return_sequences = True

In [None]:
def LSTM_model(X_train_embedding, y_train):
    """Build and compile the LSTM regressor from the module-level settings.

    Reads ``units``, ``return_sequences``, ``dropout`` and ``optimizer`` from
    the notebook namespace.

    Args:
        X_train_embedding: 3-D training array; ``shape[1]`` and ``shape[2]``
            become the LSTM input shape.
        y_train: Not used by the builder.

    Returns:
        A compiled ``Sequential`` model with mean-squared-error loss.
    """
    input_shape = (X_train_embedding.shape[1], X_train_embedding.shape[2])

    model = Sequential()
    for layer in (
        LSTM(units=units, return_sequences=return_sequences, input_shape=input_shape),
        Dropout(dropout),
        Dense(units=1, activation='linear'),
    ):
        model.add(layer)

    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

In [None]:
def cross_val_metrics(X, y, n_splits=10):
    """K-fold cross-validate the LSTM and collect per-fold regression metrics.

    A fresh ``LSTM_model`` is trained for every fold using the module-level
    ``epochs``, ``batch_size`` and ``early_stopping`` settings.

    Args:
        X: Array indexed along its first axis with integer index arrays
            (e.g. the reshaped 3-D ndarray).
        y: pandas targets; the index is reset so positional ``iloc`` lookups
            line up with the fold indices.
        n_splits: Number of shuffled folds (``random_state=42``).

    Returns:
        tuple: ``(mse_scores, mae_scores, mape_scores, r2_scores)``, four
        lists holding one value per fold.
    """
    y = y.reset_index(drop=True)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = {'mse': [], 'mae': [], 'mape': [], 'r2': []}

    for fit_idx, held_idx in splitter.split(X):
        # New, untrained model per fold so no weights carry over.
        model = LSTM_model(X, y)
        model.fit(
            X[fit_idx],
            y.iloc[fit_idx],
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            verbose=0,
            callbacks=[early_stopping],
        )

        actual = y.iloc[held_idx]
        predicted = model.predict(X[held_idx]).flatten()

        scores['mse'].append(mean_squared_error(actual, predicted))
        scores['mae'].append(mean_absolute_error(actual, predicted))
        scores['mape'].append(mean_absolute_percentage_error(actual, predicted))
        scores['r2'].append(r2_score(actual, predicted))

    return scores['mse'], scores['mae'], scores['mape'], scores['r2']

In [None]:
# Cross-validate on the training split; UserWarnings are silenced from here on.
warnings.filterwarnings('ignore', category=UserWarning)
mse_scores, mae_scores, mape_scores, r2_scores = cross_val_metrics(subdata_X_train_embedding, subdata_y_train)
# The fourth value is the mean R^2 (r2_scores), so the label says R^2; it was
# previously printed as "CC". The possessive "'s " is also restored.
print(f"{sub_dataset_lstm}'s MSE, MAE, MAPE, R^2: {np.mean(mse_scores)}, {np.mean(mae_scores)}, {np.mean(mape_scores)}, {np.mean(r2_scores)}")

In [None]:
# Fit the final LSTM on the whole training split.
lstm_model = LSTM_model(subdata_X_train_embedding, subdata_y_train)
lstm_model.fit(
    subdata_X_train_embedding,
    subdata_y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping],
)

In [None]:
# Predict on the test split and convert the predictions to 1-dim.
lstm_y_pred = lstm_model.predict(subdata_X_test_embedding).ravel()

# Score the predictions against the test targets.
evaluation(subdata_y_test, lstm_y_pred)

In [None]:
# save LSTM model and architecture to single file
# NOTE(review): the file name says "longformer" while word_vec_train_lstm is
# set to 'bert' in this section — confirm the name matches the embedding that
# was actually used before relying on this artifact.
lstm_model.save("Model/HCHS_lstm_model_longformer.h5")