In [1]:
import io
import os
import requests
from zipfile import ZipFile
from pathlib import Path

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

In [4]:
import torch

from torch.utils.data import DataLoader

In [5]:
from transformers import DistilBertTokenizer

In [6]:
from logger import logger

from dataset import NewsDataset

from model import NewsClsDistilBERT, CLS_NAMES

from train_helper import train_1epoch, eval_1epoch

In [7]:
url_zipfile = 'https://archive.ics.uci.edu/static/public/359/news+aggregator.zip'
output_csv = './output/'

In [8]:
response = requests.get(url_zipfile)

In [9]:
response.raise_for_status()  #@TODO let's handle this, if it throws error

In [10]:
# creates `news_aggregator_dataset` directory if not existed:
out_dir = Path('news_aggregator_dataset')
csv_path = out_dir / 'newsCorpora.csv'

with ZipFile(io.BytesIO(response.content)) as zip_file:
    zip_file.extractall(out_dir)

In [11]:
col_names = ['id', 'title', 'url', 'publisher', 'category',
             'story', 'hostname', 'timestamp']

df = pd.read_csv(csv_path, sep='\t', names=col_names)
df = df[['title', 'category']]  # only interested in these two columns
df.shape

(422419, 2)

In [12]:
df.sample(5)

Unnamed: 0,title,category
16021,Affordable Health Care Sign-Up Deadline Near,b
318460,Aereo loses to broadcasters in Supreme Court d...,t
168703,"'Game of Thrones': Joffrey's Killer Revealed, ...",e
146545,Make me a match,b
165359,Corn planting in full swing; soybean planting ...,b


In [13]:
df['word_cnt'] = df['title'].apply(lambda txt: len(txt.split()))
df.sample(5)

Unnamed: 0,title,category,word_cnt
74742,Gwyneth Paltrow - Gwyneth Paltrow and Chris Ma...,e,13
183878,First MERS Case Reported in the United States,m,8
4264,Study Finds Parents Distracted by Devices,m,6
164153,Nike Fuelband's Fall From Grace,t,5
407901,Mike Tyson Claims Jamie Foxx Is In Talks To Pl...,e,13


In [14]:
df['word_cnt'].value_counts().head(10).sort_index()

Unnamed: 0_level_0,count
word_cnt,Unnamed: 1_level_1
5,19805
6,34865
7,49180
8,57890
9,58184
10,52096
11,44528
12,36386
13,26363
14,16638


In [15]:
min_words = 5
max_words = 14
df = df.query(f'{min_words} <= word_cnt <= {max_words}')
df.shape

(395935, 3)

In [16]:
df['category'].value_counts(normalize=True).round(2)

Unnamed: 0_level_0,proportion
category,Unnamed: 1_level_1
e,0.36
b,0.28
t,0.26
m,0.11


In [17]:
category_cnt = df['category'].value_counts(normalize=True).round(2)

inverse_weights = {k: int(1/v) for k, v in category_cnt.items()}
inverse_weights

{'e': 2, 'b': 3, 't': 3, 'm': 9}

In [18]:
total = sum(inverse_weights.values())
normalized_weights = {k: round(v/total, 2) for k, v in inverse_weights.items()}
normalized_weights

{'e': 0.12, 'b': 0.18, 't': 0.18, 'm': 0.53}

In [19]:
df['weights'] = df['category'].map(normalized_weights)
df.sample(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weights'] = df['category'].map(normalized_weights)


Unnamed: 0,title,category,word_cnt,weights
233318,What happens when Pat Sajak calls libs 'unpatr...,e,9,0.12
61191,SC unemployment falls to 5.7 percent in Februa...,b,13,0.18
110218,"Twitter has 44% Inactive accounts, and it's se...",t,8,0.18
172650,Why Selena Gomez Had an Unfollow Frenzy on Ins...,e,10,0.12
42191,HTC One M8 hands on preview: Making the best A...,t,13,0.18
26491,President Obama Appears Live on ELLEN Today,e,7,0.12
422303,"Ebola death toll tops 1, 550",m,6,0.53
81579,Samsung puts 'blame' on Google in Apple patent...,t,9,0.18
387740,Apple agrees to $400 million settlement over e...,t,9,0.18
213614,Indiana Verizon customers now have access to 9...,t,10,0.18


In [20]:
# Get a rather balanced sample of data of size n_samples
n_samples = 100000
df = df.sample(n_samples, weights='weights', random_state=19)
df.shape

(100000, 4)

In [21]:
df.sample(3)

Unnamed: 0,title,category,word_cnt,weights
43552,Canada rules out suspected case of Ebola Canad...,m,13,0.53
412027,Euro bond yields dive as Draghi comments feed ...,b,10,0.18
218858,"AT&T to Acquire DirecTV, for a Whopping $48.5 ...",b,9,0.18


In [22]:
# encode the category column:
le = LabelEncoder()
df['label'] = le.fit_transform(df['category'])

# Remove multiple spaces to just one & trim spaces from lhs & rhs:
df['title'] = df['title'].str.replace(r'\s+', ' ', regex=True).str.strip()


In [23]:
df.sample(5)

Unnamed: 0,title,category,word_cnt,weights,label
367012,US grains mixed; corn trades near 4-year low o...,b,12,0.18,0
192529,Bang & Olufsen's latest high-end TV turns to f...,t,11,0.18,3
343701,"China share-indexes end higher, lifted by banks",b,7,0.18,0
192307,Emirates airlines says profit up 43% in 2013 t...,b,11,0.18,0
222019,“Godzilla” Roars To $93.2 Million,e,5,0.12,1


In [24]:
# output_csv = Path(output_csv)
# os.makedirs(output_csv, exist_ok=True)

# df.to_csv(output_csv / 'dataset.csv', index=False, encoding='utf-8')

In [25]:
df_trn, df_tst = train_test_split(df, test_size=.2, random_state=19)
df_trn.shape[0], df_tst.shape[0]

(80000, 20000)

In [26]:
df_trn.head()

Unnamed: 0,title,category,word_cnt,weights,label
13106,China's urbanization level to reach 60 pct by ...,b,9,0.18,0
317273,GoPro prices shares at $24 for Thursday IPO,b,8,0.18,0
50301,Low Back Pain Leading Cause of Disability Worl...,m,8,0.53,2
46892,"USA. Coast Guard, EPA respond to oil discharge...",b,14,0.18,0
1776,"Apple Rolls Out iOS 7.1, New Features Include ...",t,13,0.18,3


In [27]:
dx = df_trn.head()
dx

Unnamed: 0,title,category,word_cnt,weights,label
13106,China's urbanization level to reach 60 pct by ...,b,9,0.18,0
317273,GoPro prices shares at $24 for Thursday IPO,b,8,0.18,0
50301,Low Back Pain Leading Cause of Disability Worl...,m,8,0.53,2
46892,"USA. Coast Guard, EPA respond to oil discharge...",b,14,0.18,0
1776,"Apple Rolls Out iOS 7.1, New Features Include ...",t,13,0.18,3


In [28]:
# Reset the index
df_trn = df_trn.reset_index(drop=True)
df_tst = df_tst.reset_index(drop=True)

In [29]:
df_trn.head()

Unnamed: 0,title,category,word_cnt,weights,label
0,China's urbanization level to reach 60 pct by ...,b,9,0.18,0
1,GoPro prices shares at $24 for Thursday IPO,b,8,0.18,0
2,Low Back Pain Leading Cause of Disability Worl...,m,8,0.53,2
3,"USA. Coast Guard, EPA respond to oil discharge...",b,14,0.18,0
4,"Apple Rolls Out iOS 7.1, New Features Include ...",t,13,0.18,3


In [30]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

In [31]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [32]:
data_trn = NewsDataset(df_trn, tokenizer, max_len=128)
data_tst = NewsDataset(df_tst, tokenizer, max_len=128)

In [33]:
loader_trn = DataLoader(data_trn, batch_size=64, shuffle=True)
loader_tst = DataLoader(data_tst, batch_size=64)

In [34]:
len(loader_trn.dataset)

80000

In [35]:
model = NewsClsDistilBERT().to(DEVICE)

In [36]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

In [37]:
EPOCHS = 1
for epoch in range(1, EPOCHS+1):
    logger.info(f'[INFO] Epoch [{epoch}/{EPOCHS}] =====================>')
    train_1epoch(model, loader_trn, optimizer, criterion, epoch, DEVICE)
    eval_1epoch(model, loader_tst, criterion, epoch, DEVICE)


INFO:pipeline_logger:	After 26624 examples >>> Train Loss: 0.5482 --- Train Acc : 82.74
INFO:pipeline_logger:	After 53248 examples >>> Train Loss: 0.4295 --- Train Acc : 86.38
INFO:pipeline_logger:	After 79872 examples >>> Train Loss: 0.3719 --- Train Acc : 88.20
INFO:pipeline_logger:EPOCH 1 >>> Train Loss: 0.3716 | Train Acc: 88.21
INFO:pipeline_logger:EPOCH 1 >>> Eval Loss: 0.2569 | Eval Acc: 91.53



In [38]:
import torch

def classify_sentence(sentence, model, tokenizer, max_length):
    # Tokenize the input sentence
    inputs = tokenizer.encode_plus(
        sentence,
        None,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',  # Pad to the maximum length
        truncation=True,
        return_tensors='pt'    # Return tensors
    )

    # Prepare the tensors
    ids = inputs['input_ids'].to('cpu')  # Shape: (1, max_length)
    mask = inputs['attention_mask'].to('cpu')  # Shape: (1, max_length)

    # Ensure model is in evaluation mode
    model.eval()
    model.to('cpu')

    # Perform prediction
    with torch.no_grad():
        # model output be like: tensor([[ 4.1735, -1.5431, -1.3251, -0.5129]])
        outputs = model(ids, mask)  # Forward pass through the model

    # Get the predicted label
    predicted_label = torch.argmax(outputs.squeeze()).item()

    return predicted_label


In [39]:
sentences = [
    "Breaking news: The stock market hits record highs!",
    "Streaming Service Secures Exclusive Rights to Cult-Favorite Comedy Series",
    "FDA Approves Revolutionary Gene Therapy for Rare Genetic Disorder",
    "mall Businesses See Surge in Sales Ahead of Holiday Season"
]


for sentence in sentences:
    predicted_idx = classify_sentence(sentence, model, tokenizer, max_length=128)
    print(f"Predicted Label: {CLS_NAMES[predicted_idx]}")


Predicted Label: business
Predicted Label: entertainment
Predicted Label: medicine
Predicted Label: business
