In [46]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# specify GPU
device = torch.device("cuda")

In [47]:
import torch
torch.cuda.is_available()

True

In [48]:
device

device(type='cuda')

In [49]:
import pandas as pd
folder_name = 'dm-2024-isa-5810-lab-2-homework'
data_identification = pd.read_csv(folder_name + '/data_identification.csv')
emotion = pd.read_csv(folder_name + '/emotion.csv')
sample_submission = pd.read_csv(folder_name + '/sampleSubmission.csv')

print(data_identification)
print(data_identification.shape)
print(f"{'='*40}")
print(emotion)
print(emotion.shape)
print(f"{'='*40}")
print(sample_submission)
print(f"{'='*40}")

df_twitter = pd.read_json(folder_name + '/tweets_DM.json', lines=True)
train_ids = data_identification[data_identification['identification'] == 'train']['tweet_id'].tolist()
test_ids = data_identification[data_identification['identification'] == 'test']['tweet_id'].tolist()

print("Show ids of train and test\n")
print(len(train_ids))
print(len(test_ids))
print(len(train_ids) + len(test_ids))

df_twitter_expanded = pd.json_normalize(df_twitter['_source'])

print("After expand the tweet_id, tweet_hashtag...\n")
df_twitter['tweet_id'] = df_twitter_expanded['tweet.tweet_id']
df_twitter['text'] = df_twitter_expanded['tweet.text']
df_twitter['hash_tags'] = df_twitter_expanded['tweet.hashtags']

df_twitter_train = df_twitter[df_twitter['tweet_id'].isin(train_ids)]
df_twitter_test = df_twitter[df_twitter['tweet_id'].isin(test_ids)]

print("After saperate train and test:\n")
print(df_twitter_train.shape)
print(df_twitter_test.shape)

df_twitter_train = pd.merge(df_twitter_train, emotion, on='tweet_id', how='left')

         tweet_id identification
0        0x28cc61           test
1        0x29e452          train
2        0x2b3819          train
3        0x2db41f           test
4        0x2a2acc          train
...           ...            ...
1867530  0x227e25          train
1867531  0x293813          train
1867532  0x1e1a7e          train
1867533  0x2156a5          train
1867534  0x2bb9d2          train

[1867535 rows x 2 columns]
(1867535, 2)
         tweet_id       emotion
0        0x3140b1       sadness
1        0x368b73       disgust
2        0x296183  anticipation
3        0x2bd6e1           joy
4        0x2ee1dd  anticipation
...           ...           ...
1455558  0x38dba0           joy
1455559  0x300ea2           joy
1455560  0x360b99          fear
1455561  0x22eecf           joy
1455562  0x2fb282  anticipation

[1455563 rows x 2 columns]
(1455563, 2)
              id   emotion
0       0x2c7743  surprise
1       0x2c1eed  surprise
2       0x2826ea  surprise
3       0x356d9a  surprise
4  

In [50]:
df_twitter_train.head()

Unnamed: 0,_score,_index,_source,_crawldate,_type,tweet_id,text,hash_tags,emotion
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets,0x376b20,"People who post ""add me on #Snapchat"" must be ...",[Snapchat],anticipation
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...","[freepress, TrumpLegacy, CNN]",sadness
2,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,[],fear
3,120,hashtag_tweets,"{'tweet': {'hashtags': ['authentic', 'LaughOut...",2015-06-11 04:44:05,tweets,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,"[authentic, LaughOutLoud]",joy
4,1021,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2c91...",2015-08-18 02:30:07,tweets,0x2c91a8,Still waiting on those supplies Liscus. <LH>,[],anticipation


In [51]:
from sklearn.preprocessing import LabelEncoder

emotion_label = df_twitter_train['emotion']
label_encoder = LabelEncoder()
label_encoder.fit(emotion_label)
print('check label: ', label_encoder.classes_)

check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']


In [52]:
emotion_label = label_encoder.fit_transform(df_twitter_train['emotion'])

# 檢查轉換結果
print("Encoded Labels:", emotion_label[:4])
print("Mapping:", dict(zip(label_encoder.classes_, range(len(label_encoder.classes_)))))

Encoded Labels: [1 5 3 4]
Mapping: {'anger': 0, 'anticipation': 1, 'disgust': 2, 'fear': 3, 'joy': 4, 'sadness': 5, 'surprise': 6, 'trust': 7}


In [53]:
df_twitter_train['emotion'][:4]

0    anticipation
1         sadness
2            fear
3             joy
Name: emotion, dtype: object

In [54]:
train_texts = df_twitter_train['text']
train_texts.head()

0    People who post "add me on #Snapchat" must be ...
1    @brianklaas As we see, Trump is dangerous to #...
2                  Now ISSA is stalking Tasha 😂😂😂 <LH>
3    @RISKshow @TheKevinAllison Thx for the BEST TI...
4         Still waiting on those supplies Liscus. <LH>
Name: text, dtype: object

In [9]:
train_texts, val_texts, train_labels, val_labels = train_test_split(df_twitter_train['text'], emotion_label, 
                                                                    random_state=42, 
                                                                    test_size=0.2, 
                                                                    stratify=emotion_label)

In [10]:
test_texts = df_twitter_test['text']
test_texts.head()

2     Confident of your obedience, I write to you, k...
4     "Trust is not the same as faith. A friend is s...
9     When do you have enough ? When are you satisfi...
30    God woke you up, now chase the day #GodsPlan #...
33    In these tough times, who do YOU turn to as yo...
Name: text, dtype: object

In [11]:
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [12]:
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_texts.tolist(),
    max_length = 25,
    pad_to_max_length=True,
    truncation=True
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_texts.tolist(),
    max_length = 25,
    pad_to_max_length=True,
    truncation=True
)



In [13]:
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

# Data loader

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#define a batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

# Model Architecture

In [15]:
for param in bert.parameters():
    param.requires_grad = False

In [16]:
class BERT_Arch(nn.Module):

    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        
        self.bert = bert 
        
        # dropout layer
        self.dropout = nn.Dropout(0.1)
      
        # relu activation function
        self.relu =  nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(768,512)
      
        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512,8)

        #softmax activation function
        self.softmax = nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):
        
        #pass the inputs to the model  
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
      
        x = self.fc1(cls_hs)

        x = self.relu(x)

        x = self.dropout(x)

        # output layer
        x = self.fc2(x)
      
        # apply softmax activation
        x = self.softmax(x)

        return x

In [17]:
# pass the pre-trained BERT to our define architecture
model = BERT_Arch(bert)

# push the model to GPU
model = model.to(device)

In [18]:
# optimizer from hugging face transformers
from transformers import AdamW

# define the optimizer
optimizer = AdamW(model.parameters(),lr = 3e-5)



In [32]:
train_labels[:20]

array([4, 4, 4, 4, 4, 4, 2, 4, 7, 4, 4, 1, 4, 4, 5, 4, 4, 7, 1, 4])

In [20]:
from sklearn.utils.class_weight import compute_class_weight

# 計算 class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)

print("Class Weights:", class_weights)

Class Weights: [4.56375024 0.73089486 1.30800631 2.84295103 0.35259609 0.94058966
 3.73383911 0.8854756 ]


In [21]:
weights= torch.tensor(class_weights,dtype=torch.float)

# push to GPU
weights = weights.to(device)

# define the loss function
cross_entropy  = nn.NLLLoss(weight=weights) 

# number of training epochs
epochs = 10

# Fine tune

In [22]:
def train():
    
    model.train()
    total_loss, total_accuracy = 0, 0
  
    # empty list to save model predictions
    total_preds=[]
  
    # iterate over batches
    for step,batch in enumerate(train_dataloader):
        
        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        
        # push the batch to gpu
        batch = [r.to(device) for r in batch]
 
        sent_id, mask, labels = batch
        
        # clear previously calculated gradients 
        model.zero_grad()        

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # model predictions are stored on GPU. So, push it to CPU
        preds=preds.detach().cpu().numpy()

    # append the model predictions
    total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)
  
      # predictions are in the form of (no. of batches, size of batch, no. of classes).
      # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    #returns the loss and predictions
    return avg_loss, total_preds

In [23]:
def evaluate():
    
    print("\nEvaluating...")
  
    # deactivate dropout layers
    model.eval()

    total_loss, total_accuracy = 0, 0
    t0 = time.time()
    
    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    for step,batch in enumerate(val_dataloader):
        
        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to gpu
        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():
            
            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds,labels)

            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()

            total_preds.append(preds)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader) 

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [24]:
import time

def format_time(elapsed):
    # Convert elapsed time to a string in hh:mm:ss format
    elapsed_rounded = int(round(elapsed))
    return str(time.strftime("%H:%M:%S", time.gmtime(elapsed_rounded)))

In [25]:
# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    #train model
    train_loss, _ = train()
    
    #evaluate model
    valid_loss, _ = evaluate()
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10
  Batch    50  of  36,390.
  Batch   100  of  36,390.
  Batch   150  of  36,390.
  Batch   200  of  36,390.
  Batch   250  of  36,390.
  Batch   300  of  36,390.
  Batch   350  of  36,390.
  Batch   400  of  36,390.
  Batch   450  of  36,390.
  Batch   500  of  36,390.
  Batch   550  of  36,390.
  Batch   600  of  36,390.
  Batch   650  of  36,390.
  Batch   700  of  36,390.
  Batch   750  of  36,390.
  Batch   800  of  36,390.
  Batch   850  of  36,390.
  Batch   900  of  36,390.
  Batch   950  of  36,390.
  Batch 1,000  of  36,390.
  Batch 1,050  of  36,390.
  Batch 1,100  of  36,390.
  Batch 1,150  of  36,390.
  Batch 1,200  of  36,390.
  Batch 1,250  of  36,390.
  Batch 1,300  of  36,390.
  Batch 1,350  of  36,390.
  Batch 1,400  of  36,390.
  Batch 1,450  of  36,390.
  Batch 1,500  of  36,390.
  Batch 1,550  of  36,390.
  Batch 1,600  of  36,390.
  Batch 1,650  of  36,390.
  Batch 1,700  of  36,390.
  Batch 1,750  of  36,390.
  Batch 1,800  of  36,390.
  Batch 1,850

In [26]:
#load weights of best model
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path))

  model.load_state_dict(torch.load(path))


<All keys matched successfully>

# Prediction

In [27]:
tokens_test = tokenizer.batch_encode_plus(
    test_texts.tolist(),
    max_length = 25,
    pad_to_max_length=True,
    truncation=True
)

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])



In [29]:
torch.cuda.empty_cache()

In [37]:
device

device(type='cuda')

In [43]:
# 创建测试集的 DataLoader
test_data = TensorDataset(test_seq, test_mask)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# 设置模型为评估模式
model.eval()
model.to(device)

# 用于保存所有预测结果
all_preds = []

# 禁用梯度计算，节省内存和加速推理
with torch.no_grad():
    for step, batch in enumerate(test_dataloader):
        
        # 将数据加载到 GPU
        batch = [t.to(device) for t in batch]
        sent_id, mask = batch

        # 通过模型进行预测
        preds = model(sent_id, mask)
        
        # 获取每个样本的最高概率类别索引
        preds = torch.argmax(preds, dim=1).cpu().numpy()
        
        # 收集当前批次的预测结果
        all_preds.extend(preds)

# 使用 LabelEncoder 将预测结果转换回情感标签
predicted_emotions = label_encoder.inverse_transform(all_preds)

# 将结果添加到测试 DataFrame 中
df_twitter_test['predicted_emotion'] = predicted_emotions

result = pd.DataFrame({
    "id": test_ids,
    "emotion": predicted_emotions,
    "emotion_label": all_preds
})

# 查看前几条预测结果
print(result.head())

# 保存预测结果到CSV文件
result.to_csv("submission_bert.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_twitter_test['predicted_emotion'] = predicted_emotions


         id       emotion  emotion_label
0  0x28cc61  anticipation              1
1  0x2db41f  anticipation              1
2  0x2466f6         anger              0
3  0x23f9e9  anticipation              1
4  0x1fb4e1         trust              7
