In [1]:
!pip install transformers



In [2]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm
import argparse
import os

2025-06-05 15:24:42.377630: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Tab-separated news corpus read straight from S3; the column names are
# supplied explicitly through `names`.
s3_path = 's3://ism-multiclass-textclassification-bucket/training-data/newsCorpora.csv'
df = pd.read_csv(
    s3_path,
    sep='\t',
    names=[
        'ID', 'TITLE', 'URL', 'PUBLISHER',
        'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP',
    ],
)


In [4]:
# Keep only the headline text and its category code. The explicit .copy()
# yields an independent frame, so assigning a column to it afterwards cannot
# trigger pandas' SettingWithCopyWarning.
df = df[['TITLE', 'CATEGORY']].copy()

# One-letter category codes (as stored in the CATEGORY column) -> readable labels.
my_dict = dict(
    e='Entertainment',
    b='Business',
    t='Science',
    m='Health',
)


def update_cat(x):
    """Return the readable label for the one-letter category code ``x``.

    Raises:
        KeyError: if ``x`` is not a key of ``my_dict``.
    """
    label = my_dict[x]
    return label

# Replace every category code with its readable label (update_cat is passed
# directly; no lambda wrapper is needed), then print the frame as plain text.
df['CATEGORY'] = df['CATEGORY'].map(update_cat)

print(df)

                                                    TITLE  CATEGORY
0       Fed official says weak data caused by weather,...  Business
1       Fed's Charles Plosser sees high bar for change...  Business
2       US open: Stocks fall after Fed official hints ...  Business
3       Fed risks falling 'behind the curve', Charles ...  Business
4       Fed's Plosser: Nasty Weather Has Curbed Job Gr...  Business
...                                                   ...       ...
422414  Surgeons to remove 4-year-old's rib to rebuild...    Health
422415  Boy to have surgery on esophagus after battery...    Health
422416  Child who swallowed battery to have reconstruc...    Health
422417  Phoenix boy undergoes surgery to repair throat...    Health
422418  Phoenix boy undergoes surgery to repair throat...    Health

[422419 rows x 2 columns]


In [5]:
# Work on a 5% random sample of the rows (seeded with random_state=1 so the
# draw is repeatable) and renumber the index from 0.
df = df.sample(frac=0.05, random_state=1).reset_index(drop=True)

In [6]:
# Display the current frame (rich notebook output of the cell's last expression).
df

Unnamed: 0,TITLE,CATEGORY
0,Murdoch's bid for Time Warner rejected,Business
1,Rescuers close in on 3 trapped Honduran miners...,Business
2,Johnny Depp - Johnny Depp Served With Legal Pa...,Entertainment
3,"Apple prepping move into ""smart home"" connecti...",Science
4,Ripped First Look: Dwayne Johnson as Brett Rat...,Entertainment
...,...,...
21116,"Fed Beige Book: Activity, labor markets improv...",Business
21117,National Agriculture Day,Business
21118,Placenta Home to Diverse Bacteria That May Aff...,Health
21119,US TV network Fox to air live 'Grease' musical...,Entertainment


In [9]:
# Category label -> integer id mapping; starts empty and is filled in by encode_cat().
encode_dict = {}

In [10]:
def encode_cat(x):
    """Return the integer id of category ``x``, registering it on first sight.

    A label that is not yet in the module-level ``encode_dict`` is stored
    there with the current size of the dict as its id, so ids follow the
    order in which labels are first passed in. Known labels return their
    existing id unchanged.
    """
    # len() is evaluated before setdefault inserts, so a new label gets the
    # next free id; an existing label ignores the default entirely.
    return encode_dict.setdefault(x, len(encode_dict))

In [11]:
# Add an integer label column by passing every CATEGORY value through encode_cat().
df['ENCODE_CAT'] = df['CATEGORY'].map(encode_cat)

In [13]:
# Show the category labels registered in encode_dict so far (dicts keep insertion order).
encode_dict.keys()

dict_keys(['Business', 'Entertainment', 'Science', 'Health'])

In [14]:
# Display the frame again, now including the ENCODE_CAT column.
df

Unnamed: 0,TITLE,CATEGORY,ENCODE_CAT
0,Murdoch's bid for Time Warner rejected,Business,0
1,Rescuers close in on 3 trapped Honduran miners...,Business,0
2,Johnny Depp - Johnny Depp Served With Legal Pa...,Entertainment,1
3,"Apple prepping move into ""smart home"" connecti...",Science,2
4,Ripped First Look: Dwayne Johnson as Brett Rat...,Entertainment,1
...,...,...,...
21116,"Fed Beige Book: Activity, labor markets improv...",Business,0
21117,National Agriculture Day,Business,0
21118,Placenta Home to Diverse Bacteria That May Aff...,Health,3
21119,US TV network Fox to air live 'Grease' musical...,Entertainment,1


In [4]:
from transformers import DistilBertTokenizer

# Load the pretrained tokenizer for the uncased DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize one example sentence by calling the tokenizer object directly:
# `encode_plus` is deprecated in favour of `__call__`, which accepts the same
# keyword arguments used here.
inputs = tokenizer(
    'I love Manchester United football team',
    add_special_tokens=True,
    max_length=20,               # fixed output length for this demo
    padding='max_length',        # pad shorter inputs up to max_length
    truncation=True,             # cut longer inputs down to max_length
    return_token_type_ids=True,
    return_attention_mask=True,
)

# Print the three encoded sequences produced for the sentence.
print('Inputs IDs:', inputs['input_ids'])
print('Attention Mask:', inputs['attention_mask'])
print('Token type IDs:', inputs['token_type_ids'])

Inputs IDs: [101, 1045, 2293, 5087, 2142, 2374, 2136, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Token type IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
