In [42]:
import torch
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [26]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import AutoModel, BertTokenizerFast, BertweetTokenizer, AutoModelForSequenceClassification

# Select the compute device: use the GPU when PyTorch can see one, otherwise
# fall back to the CPU so the notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Where the competition files live. Switch `Environment` to match the machine
# the notebook is running on.
# Environment = 1 # Kaggle
Environment = 2 # Local

import pandas as pd

if Environment == 1:
    folder_name = '/kaggle/input'
elif Environment == 2:
    folder_name = 'dm-2024-isa-5810-lab-2-homework'
else:
    # Fail here with a clear message instead of a NameError on `folder_name` below.
    raise ValueError(f"Unknown Environment {Environment!r}; expected 1 (Kaggle) or 2 (Local)")

# Load the three CSV tables shipped with the dataset.
data_identification = pd.read_csv(folder_name + '/data_identification.csv')
emotion = pd.read_csv(folder_name + '/emotion.csv')
sample_submission = pd.read_csv(folder_name + '/sampleSubmission.csv')

# Quick look at each table; the 40-character rule separates the sections.
print(data_identification, data_identification.shape, sep="\n")
print("=" * 40)
print(emotion, emotion.shape, sep="\n")
print("=" * 40)
print(sample_submission)
print("=" * 40)

# Raw tweets are stored as JSON Lines (one JSON record per line).
df_twitter = pd.read_json(f"{folder_name}/tweets_DM.json", lines=True)

# Split the id table by its 'identification' flag, then keep the bare id lists
# that are used to filter the tweets.
train_ids_df = data_identification.loc[data_identification['identification'] == 'train'].drop(columns=['identification'])
test_ids_df = data_identification.loc[data_identification['identification'] == 'test'].drop(columns=['identification'])
train_ids = train_ids_df['tweet_id'].tolist()
test_ids = test_ids_df['tweet_id'].tolist()

print("Show ids of train and test\n")
print(len(train_ids), len(test_ids), len(train_ids) + len(test_ids), sep="\n")

# Flatten the nested `_source` records into dotted columns such as 'tweet.tweet_id'.
df_twitter_expanded = pd.json_normalize(df_twitter['_source'])

print("After expand the tweet_id, tweet_hashtag...\n")
# Copy the fields used downstream onto the main frame.
df_twitter['tweet_id'] = df_twitter_expanded['tweet.tweet_id']
df_twitter['text'] = df_twitter_expanded['tweet.text']
df_twitter['hash_tags'] = df_twitter_expanded['tweet.hashtags']

# Take explicit copies of the filtered rows: later cells add columns to these
# frames, and writing into a boolean-filtered slice of `df_twitter` triggers
# pandas' SettingWithCopyWarning (and is not guaranteed to stick).
df_twitter_train = df_twitter[df_twitter['tweet_id'].isin(train_ids)].copy()
df_twitter_test = df_twitter[df_twitter['tweet_id'].isin(test_ids)].copy()

print("After saperate train and test:\n")
print(df_twitter_train.shape)
print(df_twitter_test.shape)

# Attach the emotion label to every training tweet (left join keeps all train rows).
df_twitter_train = pd.merge(df_twitter_train, emotion, on='tweet_id', how='left')

         tweet_id identification
0        0x28cc61           test
1        0x29e452          train
2        0x2b3819          train
3        0x2db41f           test
4        0x2a2acc          train
...           ...            ...
1867530  0x227e25          train
1867531  0x293813          train
1867532  0x1e1a7e          train
1867533  0x2156a5          train
1867534  0x2bb9d2          train

[1867535 rows x 2 columns]
(1867535, 2)
         tweet_id       emotion
0        0x3140b1       sadness
1        0x368b73       disgust
2        0x296183  anticipation
3        0x2bd6e1           joy
4        0x2ee1dd  anticipation
...           ...           ...
1455558  0x38dba0           joy
1455559  0x300ea2           joy
1455560  0x360b99          fear
1455561  0x22eecf           joy
1455562  0x2fb282  anticipation

[1455563 rows x 2 columns]
(1455563, 2)
              id   emotion
0       0x2c7743  surprise
1       0x2c1eed  surprise
2       0x2826ea  surprise
3       0x356d9a  surprise
4  

In [27]:
from sklearn.preprocessing import LabelEncoder
# Learn the emotion-name -> integer-id mapping, then store the ids next to the names.
label_encoder = LabelEncoder().fit(df_twitter_train["emotion"])
df_twitter_train["emotion_label"] = label_encoder.transform(df_twitter_train["emotion"])
num_classes = label_encoder.classes_.shape[0]

print(num_classes)
print(df_twitter_train["emotion_label"].unique())
df_twitter_train.head()

8
[1 5 3 4 0 7 2 6]


Unnamed: 0,_score,_index,_source,_crawldate,_type,tweet_id,text,hash_tags,emotion,emotion_label
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets,0x376b20,"People who post ""add me on #Snapchat"" must be ...",[Snapchat],anticipation,1
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...","[freepress, TrumpLegacy, CNN]",sadness,5
2,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,[],fear,3
3,120,hashtag_tweets,"{'tweet': {'hashtags': ['authentic', 'LaughOut...",2015-06-11 04:44:05,tweets,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,"[authentic, LaughOutLoud]",joy,4
4,1021,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2c91...",2015-08-18 02:30:07,tweets,0x2c91a8,Still waiting on those supplies Liscus. <LH>,[],anticipation,1


In [28]:
from transformers import BertweetTokenizer
# Tokenizer matching the BERTweet checkpoint, with its `normalization` option switched on.
tokenizer = BertweetTokenizer.from_pretrained(
    "vinai/bertweet-base",
    normalization=True,
)

In [29]:
# Features are the raw tweet texts; targets are the integer emotion ids.
X, y = df_twitter_train['text'], df_twitter_train['emotion_label']

# Hold out 20% of the labelled tweets for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1111,
)

In [30]:
def encode_data(X_train, X_val, y_train, y_val, tokenizer, max_length=128):
    """Tokenize both text splits and integer-encode their labels.

    Args:
        X_train: Iterable of training texts.
        X_val: Iterable of validation texts.
        y_train: Training labels; the label encoder is fitted on these.
        y_val: Validation labels; encoded with the encoder fitted on `y_train`.
        tokenizer: Callable tokenizer accepting a list of texts plus the
            `truncation`, `padding`, `max_length` and `return_tensors` options.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        Tuple ``(train_encodings, val_encodings, train_labels, val_labels,
        label_encoder)``.
    """
    # Same tokenizer settings for both splits.
    tokenizer_options = dict(truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    train_encodings = tokenizer(list(X_train), **tokenizer_options)
    val_encodings = tokenizer(list(X_val), **tokenizer_options)

    # Fit on the training labels only, then apply the same mapping to both splits.
    label_encoder = LabelEncoder().fit(y_train)
    train_labels = label_encoder.transform(y_train)
    val_labels = label_encoder.transform(y_val)

    return train_encodings, val_encodings, train_labels, val_labels, label_encoder

In [31]:
# Dataset class for loading text data from DataFrame
class TweetDataset(Dataset):
    """Map-style dataset pairing pre-tokenized encodings with their labels.

    `encodings` is a mapping from field name to an indexable container (for
    example the tensors returned by a tokenizer called with
    ``return_tensors="pt"``); `labels` is converted to a tensor on construction.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at `idx`, then add the matching label.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample

In [32]:
from sklearn.metrics import classification_report

def train_model(model, optimizer, scheduler, train_dataloader, val_dataloader, device, num_epochs, label_encoder, save_dir="./models"):
    """Fine-tune `model`, validating and checkpointing after every epoch.

    Each epoch makes one pass over `train_dataloader` (gradients clipped to a
    norm of 1.0), steps `scheduler` once, evaluates on `val_dataloader`, prints
    the losses and a classification report, and saves the model to
    ``{save_dir}/ep_{epoch}``.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=..., labels=...)``;
            its output must expose ``.loss`` and ``.logits``, and the model
            must provide ``save_pretrained``.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        train_dataloader: Yields dict batches with 'input_ids',
            'attention_mask' and 'labels'.
        val_dataloader: Same batch layout as `train_dataloader`.
        device: Device every batch tensor is moved to.
        num_epochs: Number of passes over the training data.
        label_encoder: Fitted encoder; ``classes_[i]`` is used as the display
            name of class id ``i`` in the report.
        save_dir: Directory under which the per-epoch checkpoints are written.

    Returns:
        None. Progress is printed and checkpoints are written to disk.
    """
    # Report on the integer ids with an explicit label list. This keeps every
    # class in the report even when it is missing from the validation data, and
    # stringifying the names avoids a TypeError when the encoder was fitted on
    # integer ids rather than on class-name strings.
    class_ids = list(range(len(label_encoder.classes_)))
    class_names = [str(name) for name in label_encoder.classes_]

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        model.train()
        total_train_loss = 0

        # Training phase
        for batch in tqdm(train_dataloader, desc="Training"):
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)

            optimizer.zero_grad()

            # Passing `labels` makes the model return the loss alongside the logits.
            outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()

            # Clip the gradient norm to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

        # The scheduler is stepped per epoch, not per batch.
        scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss}")

        # Validation phase
        model.eval()
        total_val_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc="Validating"):
                b_input_ids = batch['input_ids'].to(device)
                b_attention_mask = batch['attention_mask'].to(device)
                b_labels = batch['labels'].to(device)

                outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
                total_val_loss += outputs.loss.item()

                # The predicted class is the index of the largest logit.
                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(b_labels.cpu().tolist())

        avg_val_loss = total_val_loss / len(val_dataloader)

        class_report = classification_report(
            all_labels,
            all_preds,
            labels=class_ids,
            target_names=class_names,
            digits=4,
            zero_division=0,
        )

        print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss}")
        print("\nClassification Report:")
        print(class_report)

        # Save model checkpoint
        save_path = f"{save_dir}/ep_{epoch + 1}"
        model.save_pretrained(save_path)
        print(f"Model checkpoint saved to {save_path}")

In [44]:
model_name = "vinai/bertweet-base"
tokenizer = BertweetTokenizer.from_pretrained(model_name, normalization=True)
# Size the classification head from the labels actually present in the data
# (`num_classes` is derived from the fitted label encoder) rather than a
# hard-coded 8 that can drift out of sync with the dataset.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Tokenize both splits (tweets truncated to at most 48 tokens).
# encode_data also returns an encoder fitted on the already-integer labels.
# Keep it under its own name so the notebook-level `label_encoder` (fitted on
# the emotion names) is not overwritten and can still decode ids into names.
X_train_encoding, X_val_encoding, y_train_label, y_val_label, label_id_encoder = encode_data(
    X_train, X_val, y_train, y_val, tokenizer, max_length=48
)

# Initialize datasets
train_dataset = TweetDataset(X_train_encoding, y_train_label)
val_dataset = TweetDataset(X_val_encoding, y_val_label)

# Shuffle only the training data; validation order does not matter.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=256)

In [39]:
# Show which torch device is currently selected.
device

device(type='cuda')

In [45]:
# Move the model onto `device`; as the last expression of the cell, the
# returned module is echoed, which prints its layer structure.
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5)
# Multiply the learning rate by 0.7 every 2 scheduler steps.
scheduler = StepLR(optimizer, step_size=2, gamma=0.7)

train_model(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    device=device,
    num_epochs=3,
    label_encoder=label_encoder,
    save_dir="./Data",
)


Epoch 1/3


Training:   0%|          | 8/9098 [02:20<44:15:12, 17.53s/it]


KeyboardInterrupt: 

: 

In [None]:
X_test = df_twitter_test['text']

# Tokenize the test tweets with the same tokenizer settings used for training
# (truncate to at most 48 tokens, return PyTorch tensors).
model_name = "vinai/bertweet-base"
tokenizer = BertweetTokenizer.from_pretrained(model_name, normalization=True)
test_encodings = tokenizer(
    X_test.tolist(),
    truncation=True,
    padding=True,
    max_length=48,
    return_tensors="pt",
)

In [None]:
# Batch the test encodings; each batch is an (input_ids, attention_mask) pair.
test_dataset = torch.utils.data.TensorDataset(test_encodings["input_ids"], test_encodings["attention_mask"])
test_dataloader = DataLoader(test_dataset, batch_size=512)

# Load the fine-tuned checkpoint used for inference.
# NOTE(review): the training cell in this notebook saves to "./Data/ep_{n}" for
# 3 epochs, so this checkpoint presumably comes from a different run - confirm
# the path before executing.
model_path = "./models/Bertweet_v3/ep_4"
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

In [None]:
# Set the model to evaluation mode (disables dropout) before predicting.
model.eval()

# Predict class ids batch by batch without tracking gradients.
predictions = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # The predicted class is the index of the largest logit.
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().tolist())

# Map class ids back to emotion names. The encoder must be fitted on the
# emotion strings (the column that produced `emotion_label`); fitting it on
# `y_train` would only learn the integer ids and write those ids, not emotion
# names, into the submission.
label_encoder = LabelEncoder()
label_encoder.fit(df_twitter_train["emotion"])
predicted_labels = label_encoder.inverse_transform(predictions)

# Attach the predictions on a new frame instead of assigning into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
df_twitter_test = df_twitter_test.assign(emotion=predicted_labels)

# Keep only the columns of the submission format: 'id' and 'emotion'.
submission = df_twitter_test[['tweet_id', 'emotion']].rename(columns={'tweet_id': 'id'})

# Save the predictions
submission.to_csv("submission.csv", index=False)