In [2]:
!pip install sentence_transformers
!pip install evaluate
# !pip install --upgrade pyarrow



In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, models

# 下載訓練和測試資料集
train_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_data = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

# 將資料集轉換為 DataFrame
train_df = pd.DataFrame({'text': train_data.data, 'target': train_data.target})
test_df = pd.DataFrame({'text': test_data.data, 'target': test_data.target})

# 隨機抽樣每個類別50筆資料
sampled_train_df = train_df.groupby('target', group_keys=False).apply(lambda x: x.sample(50, random_state=30241))
sampled_test_df = test_df.groupby('target', group_keys=False).apply(lambda x: x.sample(50, random_state=30241))

print(sampled_train_df)
print(sampled_test_df)




                                                    text  target
8541    \n \nNo wonder in the light of that you are a...       0
10464                                                          0
2710   \n\nAssuming you are presenting it accurately,...       0
10080  \nIt appears that Walla Walla College will fil...       0
5084   \n: Regardless of people's hidden motivations,...       0
...                                                  ...     ...
9335   \nI suppose these illegal guns have been found...      19
7142   \n\nYou are correct. See today's (4/21) Washin...      19
984    Mr DeCenso, in spite of requiring Scholarly op...      19
8196   \n\n\n\nOmigod, it's an operationalist! Sorry,...      19
2811   \nChristians through ages have had to learn to...      19

[1000 rows x 2 columns]
                                                   text  target
557   \n\n\tMay I ask why they are afraid to do so?\...       0
6475  \n\nI'm sure all the religious types would get...       0
336

In [4]:
# 使用 [cls] 建立embedding

# 定義 BERT 模型和 CLS pooling
model_name = "google-bert/bert-base-uncased"
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


# 提取文本嵌入
train_embeddings = model.encode(sampled_train_df['text'].tolist(), convert_to_tensor=True)
test_embeddings = model.encode(sampled_test_df['text'].tolist(), convert_to_tensor=True)



# # 將嵌入添加到 DataFrame 中
# sample_df['cls_embedding'] = embeddings.tolist()

# # 查看結果
# print(sample_df.head())



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=20)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from torch.utils.data import DataLoader, TensorDataset
import torch

# 轉換成tensor
train_label = torch.tensor(sampled_train_df['target'].to_numpy())
test_label = torch.tensor(sampled_test_df['target'].to_numpy())

train_tensor = TensorDataset(train_embeddings, train_label)
test_tensor = TensorDataset(test_embeddings, test_label)

In [67]:
# 使用 DataLoader 來迭代

train_dataloader = DataLoader(train_tensor, shuffle=True, batch_size=32)
test_dataloader = DataLoader(test_tensor, batch_size=32)

In [68]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5)

In [73]:
from transformers import get_scheduler

num_epochs = 100
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [74]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [75]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()

count = 1
for epoch in range(num_epochs):
    for batch in train_dataloader:
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device).long()
        inputs = inputs.unsqueeze(1)


        outputs = model(inputs_embeds=inputs, labels=labels)
        loss = outputs.loss
        if count % 100 == 0:
            print("Cost after iteration {}: {}".format(count, loss))

        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        count += 1


  0%|          | 0/3200 [00:00<?, ?it/s]

Cost after iteration 100: 2.1695289611816406
Cost after iteration 200: 2.1268699169158936
Cost after iteration 300: 2.282475471496582
Cost after iteration 400: 1.9841885566711426
Cost after iteration 500: 1.9023925065994263
Cost after iteration 600: 1.6839268207550049
Cost after iteration 700: 1.8319628238677979
Cost after iteration 800: 2.2350966930389404
Cost after iteration 900: 1.4706827402114868
Cost after iteration 1000: 1.5895346403121948
Cost after iteration 1100: 1.5123193264007568
Cost after iteration 1200: 1.2528536319732666
Cost after iteration 1300: 1.3267486095428467
Cost after iteration 1400: 1.516972303390503
Cost after iteration 1500: 1.3187373876571655
Cost after iteration 1600: 1.5868322849273682
Cost after iteration 1700: 1.229599118232727
Cost after iteration 1800: 1.2404407262802124
Cost after iteration 1900: 1.054173469543457
Cost after iteration 2000: 1.3853880167007446
Cost after iteration 2100: 1.3497196435928345
Cost after iteration 2200: 1.1507914066314697
C

In [76]:
import evaluate

# 使用 f1 scores
metric = evaluate.load("f1")

model.eval()
for batch in test_dataloader:
    inputs, labels = batch
    inputs = inputs.to(device)
    labels = labels.to(device).long()
    inputs = inputs.unsqueeze(1)

    with torch.no_grad():
        outputs = model(inputs_embeds=inputs, labels=labels)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    metric.add_batch(predictions=predictions, references=labels)

# 計算分數
results = metric.compute(average='weighted')
print(results)

{'f1': 0.3786509868222942}
