In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel



In [2]:
# Columns whose per-row values are gathered into the single `labels` list.
LABEL_COLUMNS = ['Fantasy', 'Young Adult', 'Classics', 'Romance', 'Historical Fiction']

books_df = pd.read_csv('data/preprocessed-data2.csv')

# One list per book, holding that row's values of LABEL_COLUMNS in order.
books_df['labels'] = books_df[LABEL_COLUMNS].values.tolist()

# Hold out 15% of the rows for testing; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    books_df,
    test_size=0.15,
    random_state=42,
)


In [11]:
# Report the row count of each split; sep="\n" keeps one line per split.
print(
    f"Train data: {len(train_df)} rows",
    f"Test data: {len(test_df)} rows",
    sep="\n",
)


Train data: 8866 rows
Test data: 1565 rows


In [3]:
# BERT tokenizer and model, both loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Encode text into a BERT embedding using attention-masked mean pooling.
def encode_bert(text):
    """Embed ``text`` by averaging BERT's final-layer token vectors.

    The text is tokenized (truncated to at most 512 tokens), run through
    ``bert_model`` without gradient tracking, and the token vectors of
    ``last_hidden_state`` are averaged along the sequence axis (dim 1).

    Only positions flagged by the tokenizer's ``attention_mask`` take part in
    the average. For a single string there is no padding, so the result equals
    a plain mean over all tokens; when the tokenizer pads (e.g. a list of
    texts of different lengths), padding vectors no longer distort the result.

    Args:
        text: Input forwarded unchanged to ``tokenizer``. The notebook passes
            one string per call.

    Returns:
        torch.Tensor: The pooled embedding. A leading batch axis of size 1 is
        dropped, so a single string yields a 1-D tensor; larger batches keep
        one row per input.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the mask over the hidden axis so padded positions contribute zero.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    # Guard against division by zero should a row contain no attended tokens.
    token_count = mask.sum(dim=1).clamp(min=1.0)
    # squeeze(0) drops only a size-1 batch axis, never any other axis.
    return (token_sum / token_count).squeeze(0)

# Encode `book_details` for both the train and the test split.
def _embed_details(details):
    """Return the BERT embedding of one `book_details` value as a NumPy array."""
    return encode_bert(details).numpy()

# Same per-row encoding for each split; results land in a new `bert_embeddings` column.
for split_df in (train_df, test_df):
    split_df['bert_embeddings'] = split_df['book_details'].apply(_embed_details)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [4]:
# Preview title and embedding for the first rows of the training split.
print("BERT Embeddings dari train_df:")
train_preview = train_df[['book_title', 'bert_embeddings']].head()
print(train_preview)

BERT Embeddings dari train_df:
                                             book_title  \
5247                             Days of Blood and Fire   
837   Of Time and the River: A Legend of Man's Hunge...   
9646                                            Vicious   
2287                             The Great Dune Trilogy   
227                                      The Lighthouse   

                                        bert_embeddings  
5247  [-0.05223939, -0.06135322, 0.6332198, -0.05536...  
837   [-0.14010018, 0.21665296, 0.47726083, -0.15766...  
9646  [-0.29220918, 0.030329172, 0.5338684, -0.04132...  
2287  [-0.41899854, 0.16336535, 0.043289322, -0.2021...  
227   [-0.13574862, 0.2181609, 0.6103115, -0.2158134...  


In [5]:
# Preview title and embedding for the first rows of the test split.
print("\nBERT Embeddings dari test_df:")
test_preview = test_df[['book_title', 'bert_embeddings']].head()
print(test_preview)


BERT Embeddings dari test_df:
                                             book_title  \
3133                                   The Mango Season   
7749                                        The Warlock   
3963                                    The Setting Sun   
8071                             The Fault in Our Stars   
5081  The Supernatural Omnibus- Being A Collection o...   

                                        bert_embeddings  
3133  [-0.043614767, -0.010836026, 0.6619129, -0.065...  
7749  [-0.18907814, -0.025165105, 0.5567748, -0.0533...  
3963  [-0.35580024, 0.10432567, 0.40185836, -0.20675...  
8071  [-0.15204097, 0.050045114, 0.52621484, 0.03933...  
5081  [-0.28905517, 0.165146, 0.22898307, -0.2821451...  


In [7]:
# Print the full embedding stored in the first row of the training split.
print("BERT Embedding dari satu contoh book_details di train_df:")
first_train_embedding = train_df['bert_embeddings'].iloc[0]
print(first_train_embedding)


BERT Embedding dari satu contoh book_details di train_df:
[-5.22393882e-02 -6.13532215e-02  6.33219779e-01 -5.53619452e-02
  3.42566311e-01 -4.32385147e-01  2.78458506e-01 -4.17638607e-02
  1.94420815e-02 -3.58723044e-01  1.81484759e-01 -1.54064968e-01
  1.83699474e-01  3.50742906e-01 -1.67942479e-01  4.68606740e-01
 -1.48401991e-01  4.25058119e-02 -1.13426335e-01 -7.69843236e-02
  1.77531511e-01 -9.09341648e-02  1.06079429e-01  3.08689177e-01
  3.62681150e-01  1.16914943e-01 -4.36816216e-02  7.25115910e-02
  1.50448764e-02  1.56351939e-01  3.85726273e-01  8.64406750e-02
  3.69119644e-02 -1.74201578e-01 -1.54775485e-01 -1.96046359e-03
 -5.80944568e-02 -1.93467550e-02 -3.58001180e-02  2.38806039e-01
 -2.73578584e-01 -2.79195845e-01  2.80057769e-02  2.27471054e-01
 -1.65203631e-01 -3.27725530e-01 -3.83984387e-01 -4.92725754e-03
  1.56283498e-01 -1.39275193e-01 -2.12894946e-01  4.12041843e-01
 -5.28463870e-02 -1.33920610e-01  9.74467248e-02  4.92933869e-01
 -1.26744509e-01 -2.29701146e-01

In [9]:
# Persist both splits as CSV (same files and contents as before).
train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

# to_csv stores each ndarray in `bert_embeddings` as its printed text
# (line-wrapped, rounded digits), which cannot be reloaded as the original
# array. Also save the embeddings losslessly as one 2-D array per split;
# row i of each .npy file corresponds to row i of the matching CSV.
np.save('train_embeddings.npy', np.stack(train_df['bert_embeddings'].tolist()))
np.save('test_embeddings.npy', np.stack(test_df['bert_embeddings'].tolist()))

print("Train dan Test dataset telah disimpan ke file CSV.")


Train dan Test dataset telah disimpan ke file CSV.
