# Install requirements

In [None]:
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


# Crawl Data

In [None]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
def crawl(url_to_crawl, timeout=120):
  """Fetch a page through the Decodo scraper API.

  Args:
    url_to_crawl: Address of the page the scraping service should fetch.
    timeout: Seconds to wait for the scraping service before giving up.
      Without a timeout a stalled connection would block the crawl forever.

  Returns:
    The `requests.Response` of the POST request to the scraper API. Callers in
    this notebook parse its body as JSON and read `results[0]`.

  Raises:
    requests.exceptions.RequestException: On connection problems or when the
      timeout expires.
  """
  import os  # local import keeps this cell self-contained

  url = "https://scraper-api.decodo.com/v2/scrape"

  payload = {
        "url": url_to_crawl
  }

  # The Basic-auth credential is taken from the DECODO_AUTH_TOKEN environment
  # variable so it never has to be pasted into the notebook. When the variable
  # is unset the header is "Basic ", exactly as before.
  auth_token = os.environ.get("DECODO_AUTH_TOKEN", "")

  headers = {
      "accept": "application/json",
      "content-type": "application/json",
      "authorization": f"Basic {auth_token}"
  }

  response = requests.post(url, json=payload, headers=headers, timeout=timeout)

  return response


In [None]:
def get_article_text(article_url):
    """Download one article page and return the text of its story body.

    Args:
        article_url: URL of the article page, fetched through `crawl`.

    Returns:
        The text found inside the `<div id="storytext">` element, with each
        fragment stripped and joined by newlines. Returns None when the fetch
        or parsing fails, when the scraper reports a status code other than
        200, or when the page has no such element.
    """
    try:
      # Crawl article
      crawled_article = crawl(article_url)
      crawled_article_json = json.loads(crawled_article.text)

      # If article is not crawled correctly return None
      page_result = crawled_article_json['results'][0]
      if page_result['status_code']!=200:
        return None

      # Create BeautifulSoup object from HTML
      soup = BeautifulSoup(page_result['content'],'html.parser')

      # Get Article Text
      story_div = soup.find('div',id='storytext')
      if story_div is None:
        return None

      return story_div.get_text(strip=True,separator='\n')
    except Exception:
      # Best effort: any fetch/parse failure just skips this article.
      # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
      # SystemExit propagate, so a long-running crawl can still be interrupted.
      return None

In [None]:
def get_next_article(category_url,batch_size=10):
  """Yield article texts from a paginated category index, one page at a time.

  Index pages are requested as `{category_url}?start=<n>&count=<batch_size>`,
  with `start` beginning at 1 and growing by `batch_size` per page. Every
  `<article>` element's first link is fetched with `get_article_text`.

  Args:
    category_url: Base URL of the category index endpoint.
    batch_size: Number of articles requested per index page.

  Yields:
    The text of each article that could be fetched and parsed. Articles
    without a link, or whose text could not be extracted, are skipped.

  The generator ends when the scraper reports a status code other than 200
  for an index page, or when an index page contains no `<article>` elements.
  """
  start_index=1
  while True:
    # Crawl Index Page
    crawled_page = crawl(f"{category_url}?start={start_index}&count={batch_size}")
    crawled_page_json = json.loads(crawled_page.text)

    # Break out if pages are finished
    page_result = crawled_page_json['results'][0]
    if page_result['status_code']!=200:
      break

    # Create BeautifulSoup objects for Index page
    soup = BeautifulSoup(page_result['content'],'html.parser')
    articles = soup.find_all('article')

    # A page without any article means the listing is exhausted. Without this
    # check, an endpoint that keeps answering 200 with an empty body would make
    # the loop request pages forever.
    if not articles:
      break

    # Loop over each article in Index Page
    for article in articles:
      anchor_tag = article.find('a')
      if anchor_tag is None:
        continue

      # Anchors without an href cannot be followed
      article_url = anchor_tag.get('href')
      if not article_url:
        continue

      article_text = get_article_text(article_url)

      # Skip Articles that had issues
      if article_text is None:
        continue

      yield article_text

    start_index+=batch_size

In [None]:
# Category name -> NPR listing endpoint for that category. The endpoint is
# paginated through a query string of the form ?start=11&count=20.
urls_to_crawl = {
    category: f"https://www.npr.org/get/{topic_id}/render/partial/next"
    for category, topic_id in [
        ("politics", 1014),
        ("business", 1006),
        ("health", 1128),
        ("science", 1007),
        ("climate", 1167),
    ]
}

In [None]:
# Collect up to 1000 articles per category into one shared list of records.
data=[]
for news_category,category_url in urls_to_crawl.items():
  print(f"Crawling {news_category}")
  articles_crawled=0

  for articles_crawled,article_text in enumerate(get_next_article(category_url),start=1):
    data.append(dict(news_category=news_category,article=article_text))

    # Progress report every 100 articles
    if not articles_crawled%100:
      print(f"Crawled {articles_crawled} articles")

    # Cap each category at 1000 articles
    if articles_crawled>=1000:
      break

  # Checkpoint after every category. `data` is cumulative, so each file holds
  # all categories crawled so far, not only the one named in the file.
  df = pd.DataFrame(data)
  df.to_csv(f"news_articles_dataset_{news_category}.csv",index=False)

In [None]:
# Build the full dataset frame from every crawled record (one row per article).
df = pd.DataFrame(data)

In [None]:
# Write the combined dataset to disk without the pandas index column.
df.to_csv("news_articles_dataset.csv",index=False)

### Steps to Fine tune an LLM
### 1. Define Parameters
### 2. Clean Dataset
### 3. Wrangle Dataset
> ### Label encoder
> ### Train/Test Split
> ### Convert Into Huggingface Dataset
> ### Tokenizer
### 4. Initialize Model
### 5. Train model
### 6. Evaluate Model
### 7. Model Inference

# Parameters and reading Data

In [None]:
import pandas as pd
import huggingface_hub

In [None]:
import os

# Dataset Parameters
dataset_csv_path='news_articles_dataset.csv'  # CSV with one article per row
text_column_name = 'article'                  # column holding the raw article text
label_column_name = 'news_category'           # column holding the category name
test_size=0.2                                 # fraction of rows held out for testing
num_labels = 2 # Default is 2, it's going to be overwritten after reading data

# Model Parameters
model_name='meta-llama/Llama-3.2-1B'
# Read the Hugging Face token from the HF_TOKEN environment variable so the
# secret is never stored in the notebook; falls back to '' when it is unset.
hf_token=os.environ.get('HF_TOKEN','')

In [None]:
# Load the dataset and derive the number of classes from the label column.
# The configured path and column name are used instead of repeating the
# literals, so changing the parameters actually takes effect.
df = pd.read_csv(dataset_csv_path)
num_labels = df[label_column_name].nunique()

In [None]:
# Authenticate with the Hugging Face Hub using the configured token.
huggingface_hub.login(hf_token)

# 2. Clean Data

In [None]:
from bs4 import BeautifulSoup
import re

In [None]:
class Cleaner():
  """Strip HTML markup and collapse repeated spaces in article text."""

  def remove_html_tags(self,text):
    """Return `text` with markup removed, parsed by BeautifulSoup with 'lxml'."""
    return BeautifulSoup(text,'lxml').text

  # Backward-compatible alias for the original, misspelled method name.
  remove_hml_tags = remove_html_tags

  def remove_double_spaces(self,text):
    """Replace every run of consecutive spaces with a single space."""
    return re.sub(' +',' ',text)

  def clean(self,text):
    """Remove HTML tags first, then collapse repeated spaces."""
    without_tags = self.remove_html_tags(text)
    return self.remove_double_spaces(without_tags)

In [None]:
# Clean every article and keep the result in a new column next to the raw text.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

# 3. Wrangle Data

## Label Encoder

In [None]:
from sklearn import preprocessing

In [None]:
# Map the category names to integer class ids. The fitted encoder is kept in
# `le` because its `classes_` are needed later to name the model's labels.
category_names = df[label_column_name].tolist()
le = preprocessing.LabelEncoder()
le.fit(category_names)
df['label'] = le.transform(category_names)

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# A fixed seed makes the split, and therefore every metric reported further
# down, reproducible across kernel restarts.
df_train,df_test = train_test_split(df,test_size=test_size,random_state=42)

In [None]:
# Sanity check: (rows, columns) of the train and test frames.
df_train.shape,df_test.shape

((4000, 4), (1000, 4))

In [None]:
# Keep only what the model consumes: the cleaned text and its integer label.
df_train,df_test = (
    split[['text_cleaned','label']] for split in (df_train,df_test)
)

## Convert to Huggingface Dataset

In [None]:
from datasets import Dataset

In [None]:
# The frames carry a shuffled pandas index after the split. Without
# preserve_index=False it would be stored as an extra `__index_level_0__`
# column in each dataset.
train_dataset = Dataset.from_pandas(df_train,preserve_index=False)
test_dataset = Dataset.from_pandas(df_test,preserve_index=False)

## Tokenizer

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer that belongs to the checkpoint named by `model_name`.
tokenizer= AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [None]:
# Reuse the end-of-sequence token as the padding token so batches can be padded.
# NOTE(review): presumably the checkpoint ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
def preprocess_function(examples):
  """Tokenize the `text_cleaned` field of a batch with truncation enabled."""
  texts = examples['text_cleaned']
  encoded = tokenizer(texts,truncation=True)
  return encoded

In [None]:
# Tokenize both splits in batches with the same preprocessing function.
tokenized_train,tokenized_test = (
    split.map(preprocess_function,batched=True)
    for split in (train_dataset,test_dataset)
)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

# 4. Initialize Model

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load the checkpoint with a sequence-classification head sized for `num_labels` classes.
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=num_labels)

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the end-of-sequence id as the model's padding id, mirroring the tokenizer setup.
model.config.pad_token_id = model.config.eos_token_id

In [None]:
# Count the parameter tensors of the base model (reported below as "layers").
number_of_layers = sum(1 for _ in model.base_model.parameters())
print(f"Number of layers: {number_of_layers}")

Number of layers: 146


In [None]:
# Freeze every parameter tensor of the base model except the last 25, so only
# that tail (plus anything outside `base_model`) is updated during training.
#
# The previous loop never advanced `layer_number` (it incremented
# `number_of_layers` instead), so its break condition was never met and the
# whole base model ended up frozen.
trainable_tail = 25
base_params = list(model.base_model.parameters())
frozen_count = max(len(base_params) - trainable_tail, 0)
for param in base_params[:frozen_count]:
  param.requires_grad=False

# 5. Train Model

In [None]:
from transformers import TrainingArguments,Trainer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

In [None]:
# Collator that pads the tokenized examples of each batch using `tokenizer`.
data_collator=  DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Accuracy metric shared by every evaluation call.
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
  """Compute accuracy from a `(logits, labels)` pair.

  The predicted class of each example is the index of its largest logit
  along the last axis.
  """
  logits,labels = eval_pred
  return metric.compute(
      predictions=np.argmax(logits,axis=-1),
      references=labels,
  )

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Training configuration.
training_args = TrainingArguments(
    # Output and checkpointing
    output_dir='./results',
    save_steps=2000,
    report_to="none",

    # Schedule and batch sizes
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,

    # Optimisation
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=True,
)

# NOTE(review): newer transformers releases may warn that `tokenizer=` is
# deprecated in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)



  trainer = Trainer(


In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

Step,Training Loss
500,1.6921
1000,1.1233
1500,1.1375
2000,1.1608
2500,1.0161
3000,1.0087
3500,1.0332
4000,1.0913
4500,1.0179
5000,0.8658


TrainOutput(global_step=10000, training_loss=0.9781315765380859, metrics={'train_runtime': 5829.4638, 'train_samples_per_second': 3.431, 'train_steps_per_second': 1.715, 'total_flos': 1.923672143618212e+17, 'train_loss': 0.9781315765380859, 'epoch': 5.0})

In [None]:
# Store the id <-> category-name mappings in the model config, using the class
# order of the fitted label encoder.
id_to_label = dict(enumerate(le.classes_))
model.config.id2label = id_to_label
model.config.label2id = {label: class_id for class_id, label in id_to_label.items()}

In [None]:
# Persist the fine-tuned model and the tokenizer side by side in one local directory.
trainer.save_model('./news_classifier_model')
tokenizer.save_pretrained('./news_classifier_model')

('./news_classifier_model/tokenizer_config.json',
 './news_classifier_model/special_tokens_map.json',
 './news_classifier_model/tokenizer.json')

In [None]:
# Save in HuggingFace hub -- Make sure to have your token to have Write access
hub_repo_name = "news-classifier-model"
model.push_to_hub(hub_repo_name)

# Trainer.push_to_hub() takes a commit message as its first argument, not a
# repository name. Its target repository comes from `args.hub_model_id`, or
# from the output directory name when that is unset. Set it explicitly so the
# training artifacts land in the same repository as the model and tokenizer.
trainer.args.hub_model_id = hub_repo_name
trainer.push_to_hub(commit_message="Upload training artifacts")

tokenizer.push_to_hub(hub_repo_name)

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AbdullahTarek/news-classifier-model/commit/fd3665f5a036a5afd948efbb5e85ac9745d141af', commit_message='Upload tokenizer', commit_description='', oid='fd3665f5a036a5afd948efbb5e85ac9745d141af', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AbdullahTarek/news-classifier-model', endpoint='https://huggingface.co', repo_type='model', repo_id='AbdullahTarek/news-classifier-model'), pr_revision=None, pr_num=None)

# 6. Evaluate Model

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Classification report on the training split: predicted class = arg-max logit.
prediction_output = trainer.predict(tokenized_train)
preds = np.argmax(prediction_output.predictions,axis=-1)
GT = df_train['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.80      0.73      0.76       792
           1       0.77      0.84      0.80       803
           2       0.73      0.79      0.76       794
           3       0.73      0.78      0.76       807
           4       0.79      0.66      0.72       804

    accuracy                           0.76      4000
   macro avg       0.76      0.76      0.76      4000
weighted avg       0.76      0.76      0.76      4000



In [None]:
# Classification report on the held-out test split: predicted class = arg-max logit.
prediction_output = trainer.predict(tokenized_test)
preds = np.argmax(prediction_output.predictions,axis=-1)
GT = df_test['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.74      0.70      0.72       208
           1       0.73      0.76      0.74       197
           2       0.76      0.80      0.78       206
           3       0.65      0.73      0.69       193
           4       0.76      0.65      0.70       196

    accuracy                           0.73      1000
   macro avg       0.73      0.73      0.73      1000
weighted avg       0.73      0.73      0.73      1000



# 7. Model Inference

In [None]:
from transformers import pipeline

In [None]:
# Build a text-classification pipeline from the model and tokenizer published
# on the Hugging Face Hub.
clf = pipeline("text-classification",
               model="AbdullahTarek/news-classifier-model",
               tokenizer="AbdullahTarek/news-classifier-model",
               )

config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Sample article text used to try out the classifier.
exmple_article = """
Naughty or nice? That's often how I think about foods packed with carbohydrates. Whole grains, like brown rice and whole wheat, fall squarely into the nice category, while white pasta and rice, well they're more naughty.

"They're naughty, in a sense, because we digest them rapidly and that creates a fast rise in blood sugar," says nutritionist Mindy Patterson, at Texas Woman's University in Houston. They're also low in fiber and protein, compared to their whole grain cousins.

Over time, all those quick surges in blood sugar can hurt your health, Patterson says. They can contribute to insulin resistance and just leave you feeling tired.
"""

In [None]:
# Classify the sample article and print the predicted label with its score.
result= clf(exmple_article)
print(result)

[{'label': 'health', 'score': 0.9228864908218384}]
