# Use BERT to process data

> ## Data preprocessing

### Data preparation

In [2]:
import torch
torch.cuda.is_available()

AttributeError: module 'torch' has no attribute 'cuda'

In [3]:
import pandas as pd
folder_name = 'dm-2024-isa-5810-lab-2-homework'
data_identification = pd.read_csv(folder_name + '/data_identification.csv')
emotion = pd.read_csv(folder_name + '/emotion.csv')
sample_submission = pd.read_csv(folder_name + '/sampleSubmission.csv')

print(data_identification)
print(data_identification.shape)
print(f"{'='*40}")
print(emotion)
print(emotion.shape)
print(f"{'='*40}")
print(sample_submission)
print(f"{'='*40}")

df_twitter = pd.read_json(folder_name + '/tweets_DM.json', lines=True)
train_ids = data_identification[data_identification['identification'] == 'train']['tweet_id'].tolist()
test_ids = data_identification[data_identification['identification'] == 'test']['tweet_id'].tolist()

print("Show ids of train and test\n")
print(len(train_ids))
print(len(test_ids))
print(len(train_ids) + len(test_ids))

df_twitter_expanded = pd.json_normalize(df_twitter['_source'])

print("After expand the tweet_id, tweet_hashtag...\n")
df_twitter['tweet_id'] = df_twitter_expanded['tweet.tweet_id']
df_twitter['text'] = df_twitter_expanded['tweet.text']
df_twitter['hash_tags'] = df_twitter_expanded['tweet.hashtags']

df_twitter_train = df_twitter[df_twitter['tweet_id'].isin(train_ids)]
df_twitter_test = df_twitter[df_twitter['tweet_id'].isin(test_ids)]

print("After saperate train and test:\n")
print(df_twitter_train.shape)
print(df_twitter_test.shape)

df_twitter_train = pd.merge(df_twitter_train, emotion, on='tweet_id', how='left')

         tweet_id identification
0        0x28cc61           test
1        0x29e452          train
2        0x2b3819          train
3        0x2db41f           test
4        0x2a2acc          train
...           ...            ...
1867530  0x227e25          train
1867531  0x293813          train
1867532  0x1e1a7e          train
1867533  0x2156a5          train
1867534  0x2bb9d2          train

[1867535 rows x 2 columns]
(1867535, 2)
         tweet_id       emotion
0        0x3140b1       sadness
1        0x368b73       disgust
2        0x296183  anticipation
3        0x2bd6e1           joy
4        0x2ee1dd  anticipation
...           ...           ...
1455558  0x38dba0           joy
1455559  0x300ea2           joy
1455560  0x360b99          fear
1455561  0x22eecf           joy
1455562  0x2fb282  anticipation

[1455563 rows x 2 columns]
(1455563, 2)
              id   emotion
0       0x2c7743  surprise
1       0x2c1eed  surprise
2       0x2826ea  surprise
3       0x356d9a  surprise
4  

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim

# 初始化 BERT 模型與 Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

# 自定義神經網絡分類器
class SimpleClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# 設置神經網絡參數
input_dim = 768       # BERT-base 的輸出維度
hidden_dim = 128      # 可以根據需求調整
output_dim = 8        # 標籤的數量
classifier = SimpleClassifier(input_dim, hidden_dim, output_dim)

# 載入一些文本並進行 tokenization 和嵌入生成
texts = df_twitter_train['text']
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# 獲取 BERT 的嵌入
with torch.no_grad():
    outputs = bert_model(**inputs)
    cls_embeddings = outputs.last_hidden_state[:, 0, :]  # 提取 [CLS] token 嵌入

# 將 BERT 嵌入輸入到自定義的神經網絡分類器
logits = classifier(cls_embeddings)

# 計算分類損失並進行反向傳播
labels = torch.tensor([0, 1])  # 範例標籤
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=1e-5)

loss = criterion(logits, labels)
loss.backward()
optimizer.step()

print("Loss:", loss.item())


Import BERT libraries

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification

In [5]:
df_twitter_train.head()

Unnamed: 0,_score,_index,_source,_crawldate,_type,tweet_id,text,hash_tags,emotion
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets,0x376b20,"People who post ""add me on #Snapchat"" must be ...",[Snapchat],anticipation
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...","[freepress, TrumpLegacy, CNN]",sadness
2,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,[],fear
3,120,hashtag_tweets,"{'tweet': {'hashtags': ['authentic', 'LaughOut...",2015-06-11 04:44:05,tweets,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,"[authentic, LaughOutLoud]",joy
4,1021,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2c91...",2015-08-18 02:30:07,tweets,0x2c91a8,Still waiting on those supplies Liscus. <LH>,[],anticipation


In [7]:
df_twitter_train['text'].head()

0    People who post "add me on #Snapchat" must be ...
1    @brianklaas As we see, Trump is dangerous to #...
2                  Now ISSA is stalking Tasha 😂😂😂 <LH>
3    @RISKshow @TheKevinAllison Thx for the BEST TI...
4         Still waiting on those supplies Liscus. <LH>
Name: text, dtype: object

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=8)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
texts = list(df_twitter_train['text'])
labels = list(df_twitter_train['emotion'])

labels[:5]

['anticipation', 'sadness', 'fear', 'joy', 'anticipation']

In [13]:
label_map = {emotion: idx for idx, emotion in enumerate(set(labels))}
print(label_map)

{'joy': 0, 'trust': 1, 'anticipation': 2, 'anger': 3, 'fear': 4, 'sadness': 5, 'disgust': 6, 'surprise': 7}


In [17]:
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

In [19]:
labels = [label_map[label] for label in labels]
labels = torch.tensor(labels)

In [21]:
type(inputs)

transformers.tokenization_utils_base.BatchEncoding

In [26]:
inputs[:1]

{'input_ids': tensor([[  101,  2111,  2040,  2695,  1000,  5587,  2033,  2006,  1001, 10245,
           7507,  2102,  1000,  2442,  2022,  2139, 10536,  7265,  3064,  1012,
          12731,  2480,  2158,  1012,  1012,  1012,  1012,  2008,  1005,  1055,
           1026,  1048,  2232,  1028,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

Training

In [27]:
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW

# 準備資料
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# 訓練模型
optimizer = AdamW(model.parameters(), lr=1e-5)

model.train()
for epoch in range(3):  # 設定訓練次數
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch + 1} completed")



KeyboardInterrupt: 

In [None]:
test_texts = list(df_twitter_test['text'])

In [None]:
model.eval()
with torch.no_grad():
    test_inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")
    outputs = model(**test_inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)


In [None]:
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

class EmotionDataset(Dataset):

    def __init__(self, texts, labels, tokenizer, max_len=128):

        self.texts = texts

        self.labels = labels

        self.tokenizer = tokenizer

        self.max_len = max_len



    def __len__(self):

        return len(self.texts)



    def __getitem__(self, index):

        text = self.texts.iloc[index]

        label = self.labels.iloc[index]

        encoding = self.tokenizer(

            text,

            add_special_tokens=True,

            truncation=True,

            max_length=self.max_len,

            return_token_type_ids=False,

            padding='max_length',

            return_attention_mask=True,

            return_tensors='pt'

        )

        return {

            'input_ids': encoding['input_ids'].flatten(),

            'attention_mask': encoding['attention_mask'].flatten(),

            'labels': torch.tensor(label, dtype=torch.long)

        }
    
train_texts, val_texts, train_labels, val_labels = train_test_split(df_twitter_train['text'], df_twitter_train['label'], test_size=0.2)
train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)

val_dataset = EmotionDataset(val_texts, val_labels, tokenizer)
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_encoder.classes_))

training_args = TrainingArguments(

    output_dir='/kaggle/working/result',

    num_train_epochs=1,

    per_device_train_batch_size=16,

    per_device_eval_batch_size=16,

    warmup_steps=500,

    weight_decay=0.01,

    logging_dir='/kaggle/working/log',

    logging_steps=10,

    evaluation_strategy="epoch"      # 每個 epoch 後進行評估

)



# 定義 Trainer 進行訓練

trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=train_dataset,

    eval_dataset=val_dataset

)

trainer.train()
trainer.evaluate()