In [1]:
%%capture
%pip install --upgrade kaggle

In [2]:
!kaggle -v

Kaggle API 1.8.2


In [3]:
# kaggle setting
from google.colab import userdata
kaggle_token = userdata.get('KAGGLE_API_TOKEN')
import os

os.environ['KAGGLE_USERNAME'] = "minjae0321"
os.environ["KAGGLE_API_TOKEN"] = kaggle_token


In [4]:
# download dataset
!kaggle competitions download -c twitter-sentiment-classification-challenge
!unzip -n twitter-sentiment-classification-challenge.zip -d data/

Downloading twitter-sentiment-classification-challenge.zip to /content
  0% 0.00/4.38M [00:00<?, ?B/s]
100% 4.38M/4.38M [00:00<00:00, 52.6MB/s]
Archive:  twitter-sentiment-classification-challenge.zip
  inflating: data/HW4_test_no-label.csv  
  inflating: data/HW4_train.csv      


# Task
Load the training dataset "HW4_train.csv" and the test dataset "HW4_test_no-label.csv" into pandas DataFrames.

## 데이터셋 로드

### Subtask:
Load the training dataset "HW4_train.csv" and the test dataset "HW4_test_no-label.csv" into pandas DataFrames.


In [5]:
# loading dataset
import pandas as pd

train_df = pd.read_csv('data/HW4_train.csv', encoding='latin1')
test_df = pd.read_csv('data/HW4_test_no-label.csv', encoding='latin1')

print("Train DataFrame head:")
print(train_df.head())
print("\nTest DataFrame head:")
print(test_df.head())

Train DataFrame head:
   UserName  ScreenName   Location     TweetAt  \
0      3799       48751     London  16-03-2020   
1      3800       48752         UK  16-03-2020   
2      3801       48753  Vagabonds  16-03-2020   
3      3802       48754        NaN  16-03-2020   
4      3803       48755        NaN  16-03-2020   

                                       OriginalTweet           Sentiment  
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             Neutral  
1  advice Talk to your neighbours family to excha...            Positive  
2  Coronavirus Australia: Woolworths to give elde...            Positive  
3  My food stock is not the only one which is emp...            Positive  
4  Me, ready to go at supermarket during the #COV...  Extremely Negative  

Test DataFrame head:
   UserName  ScreenName             Location     TweetAt  \
0         1       44953                  NYC  02-03-2020   
1         2       44954          Seattle, WA  02-03-2020   
2         3       44955  

In [6]:
# preprocessing - missing value
print("Missing values in Train DataFrame:")
print(train_df.isnull().sum())
print("\nMissing values in Test DataFrame:")
print(test_df.isnull().sum())

Missing values in Train DataFrame:
UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

Missing values in Test DataFrame:
UserName           0
ScreenName         0
Location         834
TweetAt            0
OriginalTweet      0
dtype: int64


In [7]:
import re

def preprocess_text(text):
    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)
    # Remove URLs (http/https)
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove hashtags (#example)
    text = re.sub(r'#\w+', '', text)
    # Remove special characters, numbers, and punctuation, keep only alphabetic characters and spaces, convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    # Replace multiple spaces with a single space and remove leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply the preprocessing function to both DataFrames
train_df['cleaned_tweet'] = train_df['OriginalTweet'].apply(preprocess_text)
test_df['cleaned_tweet'] = test_df['OriginalTweet'].apply(preprocess_text)

# Display the first few rows to verify
print("Train DataFrame with cleaned tweets:")
print(train_df[['OriginalTweet', 'cleaned_tweet']].head())
print("\nTest DataFrame with cleaned tweets:")
print(test_df[['OriginalTweet', 'cleaned_tweet']].head())

Train DataFrame with cleaned tweets:
                                       OriginalTweet  \
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...   
1  advice Talk to your neighbours family to excha...   
2  Coronavirus Australia: Woolworths to give elde...   
3  My food stock is not the only one which is emp...   
4  Me, ready to go at supermarket during the #COV...   

                                       cleaned_tweet  
0                                            and and  
1  advice talk to your neighbours family to excha...  
2  coronavirus australia woolworths to give elder...  
3  my food stock is not the only one which is emp...  
4  me ready to go at supermarket during the outbr...  

Test DataFrame with cleaned tweets:
                                       OriginalTweet  \
0  TRENDING: New Yorkers encounter empty supermar...   
1  When I couldn't find hand sanitizer at Fred Me...   
2  Find out how you can protect yourself and love...   
3  #Panic buying hits #NewYork City

In [8]:
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# 1. Split train_df into training and validation sets
X = train_df['cleaned_tweet']
y = train_df['Sentiment']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")

# 2. Load pre-trained BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
print("BERT Tokenizer loaded.")

# 3. Define text tokenization function
def tokenize_text(texts, tokenizer, max_length=128):
    return tokenizer(texts.tolist(),
                     max_length=max_length,
                     truncation=True,
                     padding='max_length',
                     return_tensors='pt')

# 4. Create numerical mapping for sentiment labels
label_to_id = {label: i for i, label in enumerate(y.unique())}
id_to_label = {i: label for label, i in label_to_id.items()}

print(f"Sentiment label mapping: {label_to_id}")

# Display a small sample of the split data
print("\nSample X_train head:")
print(X_train.head())
print("\nSample y_train head:")
print(y_train.head())

Train set size: 32925
Validation set size: 8232


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BERT Tokenizer loaded.
Sentiment label mapping: {'Neutral': 0, 'Positive': 1, 'Extremely Negative': 2, 'Negative': 3, 'Extremely Positive': 4}

Sample X_train head:
3730     to everyone hoarding rice who until now doesnt...
35121    if your going to eat they have complementary w...
9893     watch this if you are one of those idiots who ...
34429    we need to have a risk management system more ...
29290    markets plunge puts pension freedoms to the te...
Name: cleaned_tweet, dtype: object

Sample y_train head:
3730     Extremely Negative
35121    Extremely Positive
9893     Extremely Negative
34429              Positive
29290              Positive
Name: Sentiment, dtype: object


In [9]:
from torch.utils.data import Dataset, DataLoader

# 5. Apply text tokenization function
train_encodings = tokenize_text(X_train, tokenizer)
val_encodings = tokenize_text(X_val, tokenizer)
test_encodings = tokenize_text(test_df['cleaned_tweet'], tokenizer) # Assuming test_df is preprocessed

print("Train encodings keys:", train_encodings.keys())
print("Validation encodings keys:", val_encodings.keys())
print("Test encodings keys:", test_encodings.keys())

# Convert sentiment labels to numerical IDs
train_labels = torch.tensor([label_to_id[label] for label in y_train.tolist()], dtype=torch.long)
val_labels = torch.tensor([label_to_id[label] for label in y_val.tolist()], dtype=torch.long)

# 6. Define custom TweetDataset class
class TweetDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

# 7. Create TweetDataset instances
train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)
test_dataset = TweetDataset(test_encodings) # Test set does not have labels

print(f"Train Dataset size: {len(train_dataset)}")
print(f"Validation Dataset size: {len(val_dataset)}")
print(f"Test Dataset size: {len(test_dataset)}")

# 8. Create DataLoader objects
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Train DataLoader created with batch size: {batch_size}")
print(f"Validation DataLoader created with batch size: {batch_size}")
print(f"Test DataLoader created with batch size: {batch_size}")

# Verify a batch from train_loader
for batch in train_loader:
    print("\nSample batch from train_loader:")
    print(f"Input IDs shape: {batch['input_ids'].shape}")
    print(f"Attention Mask shape: {batch['attention_mask'].shape}")
    print(f"Labels shape: {batch['labels'].shape}")
    break

Train encodings keys: KeysView({'input_ids': tensor([[ 101, 2000, 3071,  ...,    0,    0,    0],
        [ 101, 2065, 2115,  ...,    0,    0,    0],
        [ 101, 3422, 2023,  ...,    0,    0,    0],
        ...,
        [ 101, 9942, 2000,  ...,    0,    0,    0],
        [ 101, 1996, 2089,  ...,    0,    0,    0],
        [ 101, 2478, 1998,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})
Validation encodings keys: KeysView({'input_ids': tensor([[  101,  5564,  1037,  ...,     0,     0,     0],
        [  101,  2149, 22886,  ...,     0,   

In [10]:
from transformers import BertForSequenceClassification

# 1. Define the number of sentiment classes
num_labels = len(label_to_id)
print(f"Number of sentiment classes: {num_labels}")

# 2. Load pre-trained BERT model with a classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
print("BERT-based sentiment classification model loaded.")

# 3. Move model to GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"Model moved to: {device}")

# 4. Print the model structure
print("\nModel Structure:")
print(model)


Number of sentiment classes: 5


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT-based sentiment classification model loaded.
Model moved to: cuda

Model Structure:
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_featu

In [11]:
import torch
from transformers import get_scheduler

num_epochs = 5 # Example number of epochs
learning_rate = 1e-5

# 1. Define Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) # Common learning rate for BERT fine-tuning
print("Optimizer (AdamW) initialized.")

# 2. Define Learning Rate Scheduler
num_training_steps = num_epochs * len(train_loader)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
print("Learning rate scheduler (linear) initialized.")

Optimizer (AdamW) initialized.
Learning rate scheduler (linear) initialized.


In [12]:
from tqdm.auto import tqdm

def train_epoch(model, dataloader, optimizer, lr_scheduler, device):
    model.train()  # 1. Set model to training mode
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        # 2. Move inputs and labels to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()  # 3. Initialize optimizer gradients to zero

        # 4. Perform forward pass to get logits and loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()  # 5. Perform backpropagation

        optimizer.step()  # 6. Call optimizer's step() to update model parameters
        lr_scheduler.step()  # 7. Call learning rate scheduler's step() to adjust learning rate

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss # 8. Record batch loss and return average loss for the epoch

print("train_epoch function defined.")

train_epoch function defined.


In [13]:
from sklearn.metrics import accuracy_score

def evaluate_model(model, dataloader, device, id_to_label=None):
    model.eval()  # Set model to evaluation mode
    predictions = []
    true_labels = []
    total_eval_loss = 0

    for batch in tqdm(dataloader, desc="Evaluating"):
        # Move inputs and labels to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch.get('labels')
        if labels is not None:
            labels = labels.to(device) # Move labels to the same device as inputs

        with torch.no_grad():  # Disable gradient calculations during evaluation
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1).cpu().numpy()  # Get predicted classes
        predictions.extend(preds)

        if labels is not None:
            true_labels.extend(labels.cpu().numpy())
            total_eval_loss += outputs.loss.item()

    avg_eval_loss = total_eval_loss / len(dataloader) if dataloader else 0

    if true_labels: # If labels were provided, calculate accuracy
        accuracy = accuracy_score(true_labels, predictions)
        print(f"  Loss: {avg_eval_loss:.4f}, Accuracy: {accuracy:.4f}")
    else:
        accuracy = None # No labels to calculate accuracy
        print(f"  Loss: {avg_eval_loss:.4f}") # Still print loss if available

    if id_to_label: # Convert numerical predictions to sentiment strings if mapping is provided
        sentiment_predictions = [id_to_label[p] for p in predictions]
        return avg_eval_loss, accuracy, sentiment_predictions, true_labels
    else:
        return avg_eval_loss, accuracy, predictions, true_labels

print("evaluate_model function updated to handle device placement for labels.")

evaluate_model function updated to handle device placement for labels.


In [14]:
import os
import torch

CHECKPOINT_PATH = 'best_checkpoint.pt'

def train(model, train_loader, val_loader, optimizer, lr_scheduler, device, num_epochs, monitor='val_loss', ckpt_path=CHECKPOINT_PATH):
    best_val_loss = float('inf')
    history = {'epoch': [], 'train_loss': [], 'val_loss': [], 'val_accuracy': []}
    for epoch in range(num_epochs):
        # run one epoch of training
        train_loss = train_epoch(model, train_loader, optimizer, lr_scheduler, device)
        # evaluate on validation set
        val_loss, val_acc, _, _ = evaluate_model(model, val_loader, device)
        history['epoch'].append(epoch + 1)
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_acc)
        is_best = val_loss < best_val_loss
        if is_best:
            best_val_loss = val_loss
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': lr_scheduler.state_dict(),
                'best_val_loss': best_val_loss
            }, ckpt_path)
        print(f"Epoch {epoch+1}/{num_epochs} - train_loss: {train_loss:.4f}  val_loss: {val_loss:.4f}  val_acc: {val_acc:.4f}  best: {is_best}")
    return history, ckpt_path

def load_checkpoint(path, model, optimizer=None, lr_scheduler=None, device='cpu'):
    if not os.path.exists(path):
        print(f'No checkpoint found at {path}')
        return model, optimizer, lr_scheduler, 0, None
    ckpt = torch.load(path, map_location=device)
    model.load_state_dict(ckpt['model_state_dict'])
    start_epoch = ckpt.get('epoch', 0)
    best_val_loss = ckpt.get('best_val_loss')
    if optimizer is not None and 'optimizer_state_dict' in ckpt:
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
    if lr_scheduler is not None and 'scheduler_state_dict' in ckpt:
        lr_scheduler.load_state_dict(ckpt['scheduler_state_dict'])
    return model, optimizer, lr_scheduler, start_epoch, best_val_loss

def predict(model, dataloader, device):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy().tolist()
            all_preds += preds
    return all_preds

# Train using the new function
history, best_ckpt_path = train(model, train_loader, val_loader, optimizer, lr_scheduler, device, num_epochs=num_epochs)

# Load the best checkpoint and perform inference on the test set
model, _, _, _, _ = load_checkpoint(best_ckpt_path, model, optimizer=None, lr_scheduler=None, device=device)

# Predict sentiments on the test set
preds_numeric = predict(model, test_loader, device)
test_sentiment_labels_new = [id_to_label[p] for p in preds_numeric]

# Create and save inference.csv
inference_df = test_df[['UserName']].copy()
inference_df['Sentiment'] = test_sentiment_labels_new
inference_df.to_csv('inference.csv', index=False)

# Print summary information
final_val_loss = history['val_loss'][-1] if history['val_loss'] else None
final_val_acc = history['val_accuracy'][-1] if history['val_accuracy'] else None
print('Best checkpoint path:', best_ckpt_path)
print('Final validation loss:', final_val_loss)
print('Final validation accuracy:', final_val_acc)
print('Head of inference.csv:')
print(inference_df.head())

Training:   0%|          | 0/1029 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/258 [00:00<?, ?it/s]

  Loss: 0.6689, Accuracy: 0.7425
Epoch 1/5 - train_loss: 0.9658  val_loss: 0.6689  val_acc: 0.7425  best: True


Training:   0%|          | 0/1029 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/258 [00:00<?, ?it/s]

  Loss: 0.5138, Accuracy: 0.8166
Epoch 2/5 - train_loss: 0.5566  val_loss: 0.5138  val_acc: 0.8166  best: True


Training:   0%|          | 0/1029 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/258 [00:00<?, ?it/s]

  Loss: 0.4833, Accuracy: 0.8271
Epoch 3/5 - train_loss: 0.4162  val_loss: 0.4833  val_acc: 0.8271  best: True


Training:   0%|          | 0/1029 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/258 [00:00<?, ?it/s]

  Loss: 0.4564, Accuracy: 0.8411
Epoch 4/5 - train_loss: 0.3429  val_loss: 0.4564  val_acc: 0.8411  best: True


Training:   0%|          | 0/1029 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/258 [00:00<?, ?it/s]

  Loss: 0.4700, Accuracy: 0.8403
Epoch 5/5 - train_loss: 0.2913  val_loss: 0.4700  val_acc: 0.8403  best: False
Best checkpoint path: best_checkpoint.pt
Final validation loss: 0.4700070312607658
Final validation accuracy: 0.8402575315840622
Head of inference.csv:
   UserName           Sentiment
0         1  Extremely Negative
1         2            Positive
2         3  Extremely Positive
3         4            Negative
4         5             Neutral


In [15]:
print("Making predictions on the test dataset with the newly trained model...")
# Make predictions on the test dataset using the retrained model
# The evaluate_model function returns (avg_eval_loss, accuracy, sentiment_predictions, true_labels)
# For the test set, accuracy and true_labels will be None.
_, _, test_predictions_numerical_new, _ = evaluate_model(model, test_loader, device, id_to_label=None)

# Convert numerical predictions back to sentiment labels using id_to_label map
test_sentiment_labels_new = [id_to_label[pred] for pred in test_predictions_numerical_new]

# Display the first few predicted results
print("\nFirst 10 test predictions from the new model:")
print(test_sentiment_labels_new[:10])


Making predictions on the test dataset with the newly trained model...


Evaluating:   0%|          | 0/119 [00:00<?, ?it/s]

  Loss: 0.0000

First 10 test predictions from the new model:
['Extremely Negative', 'Positive', 'Extremely Positive', 'Negative', 'Neutral', 'Neutral', 'Positive', 'Negative', 'Extremely Negative', 'Extremely Positive']


In [16]:
import pandas as pd

# Add the predicted sentiment labels to the test_df with a new name to avoid conflict with previous submission
test_df['Sentiment_new_model'] = test_sentiment_labels_new

# Create the inference DataFrame with 'UserName' and the new 'Sentiment' column
inference_df = test_df[['UserName', 'Sentiment_new_model']].rename(columns={'Sentiment_new_model': 'Sentiment'})

# Save the inference DataFrame to a CSV file
inference_df.to_csv('inference.csv', index=False)

print("inference.csv created successfully.")

# Display the head of the generated inference.csv to verify its format
print("\nHead of inference.csv:")
print(pd.read_csv('inference.csv').head())

inference.csv created successfully.

Head of inference.csv:
   UserName           Sentiment
0         1  Extremely Negative
1         2            Positive
2         3  Extremely Positive
3         4            Negative
4         5             Neutral
