In [1]:
import torch
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device  # bare trailing expression so the notebook displays the chosen device

device(type='cuda')

In [1]:
pip install nbconvert PyPDF2




In [18]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
from transformers import BertweetTokenizer, AutoModelForSequenceClassification, AdamW
from transformers import get_scheduler
from tqdm import tqdm

In [16]:
#===========================================================================

# Data preprocessing

In [1]:
# Select the runtime environment; it decides which folder the competition
# files are read from.
# Environment = 1 # Kaggle
Environment = 2 # Local

import pandas as pd

if Environment == 1:
    folder_name = '/kaggle/input/dm-2024-isa-5810-lab-2-homework'
elif Environment == 2:
    folder_name = 'dm-2024-isa-5810-lab-2-homework'
else:
    # Without this branch folder_name would stay undefined and the first
    # read_csv below would fail with a NameError that hides the real cause.
    raise ValueError(f"Unsupported Environment {Environment!r}: use 1 (Kaggle) or 2 (Local)")

# Load data from local or Kaggle
#===========================================================================
data_identification = pd.read_csv(folder_name + '/data_identification.csv')
emotion = pd.read_csv(folder_name + '/emotion.csv')
sample_submission = pd.read_csv(folder_name + '/sampleSubmission.csv')
# lines=True: the tweet dump is parsed as one JSON record per line.
df_twitter = pd.read_json(folder_name + '/tweets_DM.json', lines=True)

# Separate ids of training and testing dataset
#===========================================================================
# The 'identification' column is only needed for the split, so it is dropped afterwards.
train_ids_df = data_identification[data_identification['identification'] == 'train'].drop(['identification'], axis=1)
test_ids_df = data_identification[data_identification['identification'] == 'test'].drop(['identification'], axis=1)

# Process the json file (tweet), inspired from internet
#===========================================================================
# Flatten the nested '_source' records into dotted columns (e.g. 'tweet.text').
df_twitter_expanded = pd.json_normalize(df_twitter['_source'])

# Replace the nested column with its flattened form. The column-wise concat
# aligns rows by index -- assumes both frames still share the same default
# index at this point; TODO confirm.
df_twitter = df_twitter.drop(['_source'], axis=1)
df_twitter = pd.concat([df_twitter, df_twitter_expanded], axis=1)

# Merge json file and emotion, ids
#===========================================================================
# Inner joins: only ids present on both sides of each merge are kept.
df_training = pd.merge(train_ids_df, emotion, on='tweet_id', how='inner')
df_training = pd.merge(df_training, df_twitter, left_on='tweet_id', right_on='tweet.tweet_id', how='inner')

df_test = pd.merge(test_ids_df, df_twitter, left_on='tweet_id', right_on='tweet.tweet_id', how='inner')

In [2]:
# Preview the first rows of the merged training frame (ids, emotion label and tweet fields).
df_training.head()

Unnamed: 0,tweet_id,emotion,_score,_index,_crawldate,_type,tweet.hashtags,tweet.tweet_id,tweet.text
0,0x29e452,joy,809,hashtag_tweets,2015-01-17 03:07:03,tweets,[],0x29e452,Huge Respect🖒 @JohnnyVegasReal talking about l...
1,0x2b3819,joy,808,hashtag_tweets,2016-07-02 09:34:06,tweets,"[spateradio, app]",0x2b3819,Yoooo we hit all our monthly goals with the ne...
2,0x2a2acc,trust,16,hashtag_tweets,2016-08-15 18:18:39,tweets,[],0x2a2acc,@KIDSNTS @PICU_BCH @uhbcomms @BWCHBoss Well do...
3,0x2a8830,joy,768,hashtag_tweets,2017-02-11 08:49:46,tweets,"[PUBG, GamersUnite, twitch, BeHealthy, StayPos...",0x2a8830,Come join @ambushman27 on #PUBG while he striv...
4,0x20b21d,anticipation,70,hashtag_tweets,2016-11-23 05:37:10,tweets,"[strength, bones, God]",0x20b21d,@fanshixieen2014 Blessings!My #strength little...


In [3]:
# Preview the first rows of the merged test frame (same tweet fields, no emotion column).
df_test.head()

Unnamed: 0,tweet_id,_score,_index,_crawldate,_type,tweet.hashtags,tweet.tweet_id,tweet.text
0,0x28cc61,107,hashtag_tweets,2017-01-17 14:13:32,tweets,[],0x28cc61,@Habbo I've seen two separate colours of the e...
1,0x2db41f,728,hashtag_tweets,2015-10-17 06:46:20,tweets,[],0x2db41f,@FoxNews @KellyannePolls No serious self respe...
2,0x2466f6,491,hashtag_tweets,2016-12-19 03:50:27,tweets,[womendrivers],0x2466f6,"Looking for a new car, and it says 1 lady owne..."
3,0x23f9e9,28,hashtag_tweets,2017-04-09 19:32:19,tweets,[robbingmembers],0x23f9e9,@cineworld “only the brave” just out and fount...
4,0x1fb4e1,925,hashtag_tweets,2016-01-15 11:59:31,tweets,[],0x1fb4e1,Felt like total dog 💩 going into open gym and ...


In [5]:
# NOTE: DataFrame.size is the total number of cells (rows x columns), not the
# number of rows; use len(df_test) or df_test.shape for the row count.
df_test.size

3295776

In [33]:
# Encode labels: learn the emotion vocabulary, then map each emotion name to
# its integer class id.
label_encoder = LabelEncoder().fit(df_training['emotion'])
df_training['emotion_label'] = label_encoder.transform(df_training['emotion'])
# Number of distinct emotions; used to size the classification head.
num_classes = len(label_encoder.classes_)

In [34]:
# BERTweet tokenizer with its tweet-normalization preprocessing switched on.
tokenizer = BertweetTokenizer.from_pretrained(
    "vinai/bertweet-base",
    normalization=True,
)  # base --> max_len = 128

In [35]:
# Hold out 20% of the labelled tweets for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    df_training['tweet.text'],
    df_training['emotion_label'],
    random_state=1111,
    test_size=0.2,
)

In [36]:
class TweetDataset(Dataset):
    """Dataset that tokenizes one labelled tweet per item.

    Args:
        texts: Sequence of tweet strings (pandas Series, list, array, ...).
        labels: Sequence of integer class ids, parallel to ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose result
            exposes ``"input_ids"`` and ``"attention_mask"`` tensors with a
            leading batch dimension of 1.
        max_len: Length every example is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialize both inputs as plain lists so items are always looked up
        # by position, whatever container the caller passed. The previous
        # `.iloc` lookup only worked for pandas objects and raised
        # AttributeError for a plain list of labels.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th tweet and return its tensors plus label.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (batch dimension
            squeezed away) and ``"label"`` as a ``torch.long`` scalar tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            # The tokenizer returns tensors with a leading batch dimension of 1;
            # drop it so the DataLoader can stack items itself.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

In [37]:
# Hyperparameters
max_len = 128  # token length every tweet is padded / truncated to by the datasets
batch_size = 16  # examples per DataLoader batch (train, validation and test)
epochs = 3  # number of full passes over the training split
learning_rate = 2e-5  # initial optimizer learning rate; decayed by the linear scheduler

In [None]:
# Prepare data loaders
train_dataset = TweetDataset(X_train, y_train, tokenizer, max_len)
val_dataset = TweetDataset(X_val, y_val, tokenizer, max_len)

# The test tweets have no labels, so placeholder zeros are supplied. They are
# wrapped in a Series so the labels are the same container type as
# y_train / y_val; a plain Python list cannot be indexed the way the dataset
# indexes its pandas inputs.
test_dataset = TweetDataset(df_test['tweet.text'], pd.Series([0] * len(df_test)), tokenizer, max_len)

# Only the training data is shuffled; validation and test keep their order so
# predictions line up with the source rows.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [39]:
# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Pretrained BERTweet encoder plus a classification head with one output per
# emotion class (the head is newly initialised and trained below).
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base",
    num_labels=num_classes
)

# Optimizer and Scheduler
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of it. eps and weight_decay are pinned to the values the
# transformers implementation defaulted to, so the optimisation settings stay
# as close as possible to the previous behaviour.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# Linear decay of the learning rate to zero over every optimizer step of the
# run (batches per epoch x epochs), with no warm-up.
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)
criterion = nn.CrossEntropyLoss()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# Move the model's parameters to the selected device. `to` returns the module,
# so as the cell's last expression it also prints the architecture.
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [41]:
for epoch in range(epochs):
    # ---- Training pass: one optimizer step and one scheduler step per batch ----
    model.train()
    train_loss, train_correct = 0, 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Running totals for the epoch summary printed below.
        train_correct += logits.argmax(dim=1).eq(labels).sum().item()
        train_loss += loss.item()

    train_accuracy = train_correct / len(train_dataset)

    # ---- Validation pass: no gradients, dropout disabled via eval() ----
    model.eval()
    val_loss, val_correct = 0, 0
    all_preds = []  # predicted class ids for the validation split, in loader order

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

            predicted = logits.argmax(dim=1)
            val_loss += loss.item()
            val_correct += predicted.eq(labels).sum().item()
            all_preds.extend(predicted.cpu().numpy())

    val_accuracy = val_correct / len(val_dataset)

    # Losses are averaged per batch; accuracies are per example.
    print(f"Epoch {epoch+1}:")
    print(f"  Training  - Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_accuracy:.4f}")
    print(f"  Validation - Loss: {val_loss/len(val_loader):.4f}, Accuracy: {val_accuracy:.4f}")

# Persist the weights as they stand after the final epoch.
torch.save(model.state_dict(), "bertweet_emotion_model.pth")
print("Model saved as bertweet_emotion_model.pth")

Training Epoch 1: 100%|██████████| 72779/72779 [2:57:27<00:00,  6.84it/s]  


Epoch 1:
  Training  - Loss: 1.0313, Accuracy: 0.6263
  Validation - Loss: 0.8746, Accuracy: 0.6817


Training Epoch 2: 100%|██████████| 72779/72779 [2:54:59<00:00,  6.93it/s]  


Epoch 2:
  Training  - Loss: 0.7832, Accuracy: 0.7162
  Validation - Loss: 0.8442, Accuracy: 0.6978


Training Epoch 3: 100%|██████████| 72779/72779 [2:58:15<00:00,  6.80it/s]  


Epoch 3:
  Training  - Loss: 0.6575, Accuracy: 0.7619
  Validation - Loss: 0.8712, Accuracy: 0.7004
Model saved as bertweet_emotion_model.pth


In [42]:
# Load the model: rebuild the architecture, then restore the fine-tuned weights
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base",
    num_labels=num_classes
)
# map_location lets a checkpoint written from the GPU be restored on a
# CPU-only machine; weights_only=True restricts unpickling to tensors and plain
# containers, which is all a state_dict contains.
state_dict = torch.load("bertweet_emotion_model.pth", map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)
model.eval()  # switch to evaluation mode
print("Model loaded and ready for prediction.")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("bertweet_emotion_model.pth"))


Model loaded and ready for prediction.


In [None]:
class TestTweetDataset(Dataset):
    """Label-free dataset used at inference time.

    Each item is the tokenized form of one tweet: a dict holding the
    ``"input_ids"`` and ``"attention_mask"`` tensors with the tokenizer's
    leading batch dimension removed.
    """

    def __init__(self, texts, tokenizer, max_len):
        """Store the tweets as a list together with the tokenizer settings."""
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.texts = list(texts)

    def __len__(self):
        """Return the number of tweets."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th tweet, padded / truncated to ``max_len``."""
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Drop the size-1 batch dimension so the DataLoader can stack items.
        return {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }

In [46]:
# Inference-only dataset over the test tweets; no shuffling, so predictions
# stay in the same order as the rows of df_test.
test_dataset = TestTweetDataset(df_test['tweet.text'], tokenizer, max_len)

test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [47]:
# Predict on test set
test_preds = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Highest-scoring class per tweet, moved to the host before collecting.
        batch_preds = logits.argmax(dim=1)
        test_preds.extend(batch_preds.cpu().numpy())

# Convert predictions back to emotions
df_test['emotion'] = label_encoder.inverse_transform(test_preds)

# Keep only the id and predicted emotion, with the id column renamed to 'id'.
submission = df_test[['tweet_id', 'emotion']].rename(columns={'tweet_id': 'id'})

submission.to_csv('submission.csv', index=False)

print("Predictions saved to submission.csv")

Predicting: 100%|██████████| 25749/25749 [20:26<00:00, 20.99it/s]


Predictions saved to submission.csv
