# Imports

In [1]:
# Imports 

import requests # HTTP requests 
from bs4 import BeautifulSoup # Extract HTML content

import numpy as np



# Title scraper

In [2]:
def scrape_tech_news():

    url = 'https://www.ft.com/technology'

    # HTTP request to scrap page information
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)

    # Check response status
    if response.status_code != 200:
        print(f"Échec du scraping : {response.status_code}")
        return
    
    # HTML content analysis
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all the article listed on the page
    articles = soup.find_all('div', class_='o-teaser__content')
    
    news = []
    
    for article in articles:
        # Title and link extraction
        article_heading = article.find('a', class_='js-teaser-heading-link')
        title = article_heading.get_text(strip=True) if article_heading else "No title"
        link = "https://www.ft.com" + article_heading['href'] if article_heading else "No link"
        
        # Extract article tag to define categories
        article_tag = article.find('a', class_='o-teaser__tag')
        tag = article_tag['aria-label'] if article_tag else 'No cat'
        tag = tag.replace('Category: ', '')
        
        news.append({
            'title': title,
            'link': link,
            'tag': tag,
        })
    
    return news

ft_news_scraped = scrape_tech_news()

# Print
for article in ft_news_scraped[0:2]:
    print(f"Title: {article['title']}")
    print(f"Link: {article['link']}")
    print(f"Category: {article['tag']}")
    print("-" * 40)


Title: OpenAI pushes ahead with for-profit plans and talks to give Altman a stake
Link: https://www.ft.com/content/78b7e7a7-7428-4c5e-bfa2-0921c9d6cd25
Category: OpenAI
----------------------------------------
Title: Ubisoft shares fall 17% after it delays launch of new ‘Assassin’s Creed’ game
Link: https://www.ft.com/content/293b3384-1326-4bb5-9ebf-29b2a6e1a218
Category: Ubisoft
----------------------------------------


# Text generation

In [3]:
# Pre-trained model to generate text 

from openai import OpenAI # I won't use openai cause there is a limit for the requests. 
from transformers import T5ForConditionalGeneration, T5Tokenizer # Prompt issue
from transformers import BartForConditionalGeneration, BartTokenizer


client = OpenAI(
    api_key = "xxx"
)

# # T5 model
# model_name = 't5-base'
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name)

# Bart model
model_name = "facebook/bart-large" # already trained
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Generator function

def generate_content_from_title(title):
    prompt = f"Write a tech article on this subject : {title}."

    inputs = tokenizer(prompt, return_tensors="pt", max_length=2000, truncation=True)

    outputs = model.generate(inputs.input_ids, max_length=2000, num_beams=4)

    # Decrypt generated text
    article = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return article


for article in ft_news_scraped[0:2]:
    title = article['title']
    # tag = article['tag']
    article['content'] = generate_content_from_title(title)
    article['content'] = article['content'].replace('. ', '. \n') # Output ergonomy

# Print
for article in ft_news_scraped[0:2]:
    print(f"Title: {article['title']}")
    print(f"Link: {article['link']}")
    print(f"Category: {article['tag']}")
    print(f"Content: {article['content']}")
    print('\n', "=" * 100, '\n')

Title: OpenAI pushes ahead with for-profit plans and talks to give Altman a stake
Link: https://www.ft.com/content/78b7e7a7-7428-4c5e-bfa2-0921c9d6cd25
Category: OpenAI
Content: Write a tech article on this subject : OpenAI pushes ahead with for-profit plans and talks to give Altman a stake.


Title: Ubisoft shares fall 17% after it delays launch of new ‘Assassin’s Creed’ game
Link: https://www.ft.com/content/293b3384-1326-4bb5-9ebf-29b2a6e1a218
Category: Ubisoft
Content: Write a tech article on this subject : Ubisoft shares fall 17% after it delays launch of new ‘Assassin’s Creed’ game.




In [5]:
# The Bart model is not self-sufficient. 
# I will be working on a fine-tuning solution to get better text generations.

# Fine Tuning

Imports

In [6]:
import pandas as pd
from datasets import Dataset

df = pd.read_csv('../articles/articles_cleaned.csv')
ds = Dataset.from_pandas(df)

In [7]:
# Format the articles' content for the training

def preprocess_function(examples):
    inputs = examples['title']
    targets = examples['full_content']
    
    # Tokenize (This function litterally does the neetcode problem with the tweets and amazon review.)
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    
    # Tokenize the contents as labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_dataset = ds.map(preprocess_function, batched=True)

Map: 100%|██████████| 17/17 [00:00<00:00, 239.86 examples/s]


Check

In [8]:
# Output to check that everything works
sample = tokenized_dataset[0]

# InputIDs to readable tokens
tokens_input = tokenizer.convert_ids_to_tokens(sample['input_ids'])

# Print them
print("Tokens (input_ids):")
print(tokens_input)

# Decrypted text 
decoded_input_text = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
print("\nDecoded Input Text:")
print(decoded_input_text)

print('\n', '=' * 100, '\n')

# Same for the actual content
tokens_label = tokenizer.convert_ids_to_tokens(sample['labels'])
print("Tokens (labels):")
print(tokens_label)
decoded_target_text = tokenizer.decode(sample['labels'], skip_special_tokens=True)
print("\nDecoded Target Text:")
print(decoded_target_text)

Tokens (input_ids):
['<s>', 'Reddit', 'Ġis', 'Ġbringing', 'ĠAI', '-', 'powered', ',', 'Ġautomatic', 'Ġtranslation', 'Ġto', 'Ġdozens', 'Ġof', 'Ġnew', 'Ġcountries', 'Ġ-', 'ĠTech', 'Crunch', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<p

Training

In [9]:
# Datasets building

train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [10]:
from transformers import Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",         # Compute at each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,                  # Save no more than 2 checkpoints
    save_steps=500,                      # Save at 500 steps
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()
trainer.evaluate()
model.save_pretrained('../models/fine_tuned_bart_1')
tokenizer.save_pretrained('../models/fine_tuned_bart_1')

  0%|          | 0/12 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 8.99 GB, other allocations: 79.78 MB, max allowed: 9.07 GB). Tried to allocate 4.00 KB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).