In [3]:
import os
import pandas as pd
import torch
import numpy as np
import boto3
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import DistilBertForSequenceClassification, AdamW
from tqdm import tqdm
import argparse

In [6]:
# scpefiy the bucket name and region
bucket_name = 'hugging-face-multiclass-textclassification-bucket-custombucket'
region = 'eu-central-1'   

# create an s3 resource session
s3 = boto3.resource('s3', region_name=region)

# create the bucket
s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': region})
print(f"Bucket {bucket_name} created successfully!")

Bucket hugging-face-multiclass-textclassification-bucket-custombucket created successfully!


In [11]:
s3 = boto3.client('s3')

# Specify the local filr path and s3 path
local_file = r"newsCorpora.csv"
s3_path = 'training_data/newsCorpora.csv'

# Upload the file to the s3 bucket
s3.upload_file(local_file, bucket_name, s3_path)
print(f"File {local_file} uploaded to S3 at {bucket_name}/{s3_path}")

File newsCorpora.csv uploaded to S3 at hugging-face-multiclass-textclassification-bucket-custombucket/training_data/newsCorpora.csv


In [15]:
s3_path = 's3://hugging-face-multiclass-textclassification-bucket-custombucket/training_data/newsCorpora.csv'
df = pd.read_csv(s3_path, sep='\t',names=['ID','TITLE','URL','PUBLISHER','CATEGORY','STORY','HOSTNAME','TIMESTAMP'])

df = df[['TITLE','CATEGORY']]

my_dict = {
    'e':'Entertainment',  # 0
    'b':'Business',       # 1
    't':'Science',        # 2
    'm':'Health'          # 3
}

def update_cat(x):
    return my_dict[x]

df['CATEGORY'] = df['CATEGORY'].apply(lambda x:update_cat(x))

print(df)

                                                    TITLE  CATEGORY
0       Fed official says weak data caused by weather,...  Business
1       Fed's Charles Plosser sees high bar for change...  Business
2       US open: Stocks fall after Fed official hints ...  Business
3       Fed risks falling 'behind the curve', Charles ...  Business
4       Fed's Plosser: Nasty Weather Has Curbed Job Gr...  Business
...                                                   ...       ...
422414  Surgeons to remove 4-year-old's rib to rebuild...    Health
422415  Boy to have surgery on esophagus after battery...    Health
422416  Child who swallowed battery to have reconstruc...    Health
422417  Phoenix boy undergoes surgery to repair throat...    Health
422418  Phoenix boy undergoes surgery to repair throat...    Health

[422419 rows x 2 columns]


In [16]:
# This is just a tip
df = df.sample(frac=0.05,random_state=1)

df = df.reset_index(drop=True)
# Tip ends

print(df)

                                                   TITLE       CATEGORY
0                 Murdoch's bid for Time Warner rejected       Business
1      Rescuers close in on 3 trapped Honduran miners...       Business
2      Johnny Depp - Johnny Depp Served With Legal Pa...  Entertainment
3      Apple prepping move into "smart home" connecti...        Science
4      Ripped First Look: Dwayne Johnson as Brett Rat...  Entertainment
...                                                  ...            ...
21116  Fed Beige Book: Activity, labor markets improv...       Business
21117                           National Agriculture Day       Business
21118  Placenta Home to Diverse Bacteria That May Aff...         Health
21119  US TV network Fox to air live 'Grease' musical...  Entertainment
21120    Elton John Is Getting Married! Details Revealed  Entertainment

[21121 rows x 2 columns]


In [17]:
df

Unnamed: 0,TITLE,CATEGORY
0,Murdoch's bid for Time Warner rejected,Business
1,Rescuers close in on 3 trapped Honduran miners...,Business
2,Johnny Depp - Johnny Depp Served With Legal Pa...,Entertainment
3,"Apple prepping move into ""smart home"" connecti...",Science
4,Ripped First Look: Dwayne Johnson as Brett Rat...,Entertainment
...,...,...
21116,"Fed Beige Book: Activity, labor markets improv...",Business
21117,National Agriculture Day,Business
21118,Placenta Home to Diverse Bacteria That May Aff...,Health
21119,US TV network Fox to air live 'Grease' musical...,Entertainment


In [18]:
encode_dict = {}

def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

In [19]:
df['ENCODE_CAT'] = df['CATEGORY'].apply(lambda x:encode_cat(x))

In [20]:
df

Unnamed: 0,TITLE,CATEGORY,ENCODE_CAT
0,Murdoch's bid for Time Warner rejected,Business,0
1,Rescuers close in on 3 trapped Honduran miners...,Business,0
2,Johnny Depp - Johnny Depp Served With Legal Pa...,Entertainment,1
3,"Apple prepping move into ""smart home"" connecti...",Science,2
4,Ripped First Look: Dwayne Johnson as Brett Rat...,Entertainment,1
...,...,...,...
21116,"Fed Beige Book: Activity, labor markets improv...",Business,0
21117,National Agriculture Day,Business,0
21118,Placenta Home to Diverse Bacteria That May Aff...,Health,3
21119,US TV network Fox to air live 'Grease' musical...,Entertainment,1


In [22]:
print("Rows:", df.shape[0])
print("Columns:", df.shape[1])

print(f"DataFrame has {df.shape[0]} rows and {df.shape[1]} columns.")

Rows: 21121
Columns: 3
DataFrame has 21121 rows and 3 columns.


In [23]:
# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [25]:
# Prepare Dataset for PyTorch
class NewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=20):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize the input text and return token ids, attention mask, and label
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [26]:
# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['TITLE'].tolist(),
    df['ENCODE_CAT'].tolist(),
    test_size=0.1, random_state=42
)

# Create Dataset objects
train_dataset = NewDataset(train_texts, train_labels, tokenizer)
val_dataset = NewDataset(val_texts, val_labels, tokenizer)

In [27]:
import random

# Test if train_dataset and val_datasets are built correctly

# Get a random sample from the train and validation datasets
random_train_sample = random.choice(train_dataset)
random_val_sample = random.choice(val_dataset)

# Print the random samples for inspection
print("Random Sample from train_dataset:")
print("Input IDs:", random_train_sample['input_ids'])
print("Attention Mask:", random_train_sample['attention_mask'])
print("Label:", random_train_sample['labels'])

print("\nRandom Sample from val_dataset:")
print("Input IDs:", random_val_sample['input_ids'])
print("Attention Mask:", random_val_sample['attention_mask'])
print("Label:", random_val_sample['labels'])

Random Sample from train_dataset:
Input IDs: tensor([  101,  6865,  2378,  1005,  7823,  2007, 25930,  4140,  2497,  4507,
         2265,   102,     0,     0,     0,     0,     0,     0,     0,     0])
Attention Mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])
Label: tensor(1)

Random Sample from val_dataset:
Input IDs: tensor([  101, 25935, 12992,  2015,  3036, 22496,  2007, 10819,  2271, 22345,
         7654,   102,     0,     0,     0,     0,     0,     0,     0,     0])
Attention Mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])
Label: tensor(2)


In [28]:
# Check the size (number of samples) in each dataset
train_size = len(train_dataset)
val_size = len(val_dataset)

print(f"Size of train_dataset: {train_size}")
print(f"Size of val_dataset: {val_size}")

Size of train_dataset: 19008
Size of val_dataset: 2113


In [29]:
# DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

In [30]:
# Load Model and optimizer

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(encode_dict))

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)


# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [31]:
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0
    correct_preds = 0
    total_preds = 0

    for batch in tqdm(data_loader):
        optimizer.zero_grad()

        # Move batch to GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Get predictions
        _, preds = torch.max(logits, dim=1)
        correct_preds += torch.sum(preds == labels)
        total_preds += labels.size(0)

    avg_loss = total_loss / len(data_loader)
    accuracy = correct_preds.double() / total_preds

    return avg_loss, accuracy

In [32]:
# Training loop
epochs = 2
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, device)
    print(f"Training loss: {train_loss:.4f}, accuracy: {train_accuracy:.4f}")

Epoch 1/2


  8%|▊         | 383/4752 [05:24<1:01:42,  1.18it/s]


KeyboardInterrupt: 

In [33]:
def eval_model(model, data_loader, device):
    model.eval()
    correct_preds = 0
    total_preds = 0
    
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            # Get predictions
            _, preds = torch.max(logits, dim=1)
            correct_preds += torch.sum(preds == labels)
            total_preds += labels.size(0)
    
    accuracy = correct_preds.double() / total_preds
    return accuracy

val_accuracy = eval_model(model, val_loader, device)
print(f"Validation accuracy: {val_accuracy:.4f}")

  9%|▉         | 97/1057 [00:07<01:09, 13.76it/s]


KeyboardInterrupt: 

In [None]:
# Save model to S3
model.save_pretrained('/opt/ml/model')
tokenizer.save_pretrained('/opt/ml/model')

print("Model saved to the S3 bucket")

In [15]:
from transformers import DistilBertTokenizer, DistilBertModel


tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

inputs = tokenizer.encode_plus(
    "I love soccer and mixed martial arts",
    "I love chess",
    add_special_tokens=True,
    max_length=20,
    padding='max_length',
    truncation=True,
    return_token_type_ids=True,
    return_attention_mask=True
)

print("Inputs IDs:",inputs['input_ids'])
print("Attention Mask:", inputs['attention_mask'])
print("Token type IDs:", inputs['token_type_ids'])

Inputs IDs: [101, 1045, 2293, 4715, 1998, 3816, 7761, 2840, 102, 1045, 2293, 7433, 102, 0, 0, 0, 0, 0, 0, 0]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
Token type IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
