In [13]:
# Imports 

import os # Environment variables (API keys)
from urllib.parse import urljoin # Resolve relative article links

import numpy as np
import requests # HTTP requests
from bs4 import BeautifulSoup # Extract HTML content

In [14]:
# Simple scraper

def scrape_tech_news(url='https://www.ft.com/technology', timeout=10):
    """Scrape article teasers (title, link, category) from an FT section page.

    Args:
        url: Page to scrape. Defaults to the FT technology section. It is also
            the base against which relative article links are resolved.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook.

    Returns:
        A list of dicts with the keys 'title', 'link' and 'tag', one per
        teaser block found on the page. The list is empty when the request
        fails or when no teaser is found, so callers can always iterate
        over the result.
    """
    # HTTP request to scrape the page; a browser-like User-Agent is sent.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (DNS, timeout, refused connection) are
        # reported the same way as a bad status code instead of raising.
        print(f"Échec du scraping : {exc}")
        return []

    # Check response status
    if response.status_code != 200:
        print(f"Échec du scraping : {response.status_code}")
        return []

    # HTML content analysis
    soup = BeautifulSoup(response.content, 'html.parser')

    news = []

    # Each teaser block on the page is one listed article.
    for article in soup.find_all('div', class_='o-teaser__content'):
        # Title and link extraction
        article_heading = article.find('a', class_='js-teaser-heading-link')
        title = article_heading.get_text(strip=True) if article_heading else "No title"
        # .get() avoids a KeyError when the anchor carries no href attribute.
        href = article_heading.get('href') if article_heading else None
        # urljoin handles both site-relative and already-absolute hrefs.
        link = urljoin(url, href) if href else "No link"

        # Extract the article tag, used as the category
        article_tag = article.find('a', class_='o-teaser__tag')
        label = article_tag.get('aria-label') if article_tag else None
        tag = (label or 'No cat').replace('Category: ', '')

        news.append({
            'title': title,
            'link': link,
            'tag': tag,
        })

    return news

# Normalize a missing result (None) to an empty list so that iterating over
# `ft_news_scraped` here and in later cells never raises a TypeError.
ft_news_scraped = scrape_tech_news() or []

if not ft_news_scraped:
    print("No article scraped.")

# Print
for article in ft_news_scraped:
    print(f"Title: {article['title']}")
    print(f"Link: {article['link']}")
    print(f"Category: {article['tag']}")
    print("-" * 100)


Title: FBI probes whether Silicon Valley venture firm passed secrets to China
Link: https://www.ft.com/content/d94a5467-ebf9-4992-af13-3e71061707a4
Category: Industrial espionage
----------------------------------------------------------------------------------------------------
Title: Google files Brussels complaint against Microsoft cloud business
Link: https://www.ft.com/content/65567a16-434c-4865-9098-2cc8a0c76f68
Category: EU tech regulation
----------------------------------------------------------------------------------------------------
Title: Uber’s next act: taking on Amazon
Link: https://www.ft.com/content/7b503c54-ee5b-4413-ad06-580cdab531e5
Category: The Big Read
----------------------------------------------------------------------------------------------------
Title: US antitrust lawsuit accuses Visa of using dominance to shut down rivals
Link: https://www.ft.com/content/59479f79-7f0b-43c2-973c-1578e2999c3c
Category: Visa Inc
--------------------------------------------

In [15]:
# Pre-trained model to generate text 

from openai import OpenAI # I won't use openai cause there is a limit for the requests. 
from transformers import T5ForConditionalGeneration, T5Tokenizer # Prompt issue
from transformers import BartForConditionalGeneration, BartTokenizer


# The API key is read from the OPENAI_API_KEY environment variable so that no
# real credential has to be written into the notebook; the original "xxx"
# placeholder is kept as the fallback value.
# NOTE(review): `client` is not used by the cells visible here — confirm
# whether it is still needed before removing it.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "xxx")
)

# # T5 model (disabled alternative)

# model_name = 't5-base'
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name)

# Bart model
model_name = "facebook/bart-large-cnn" # already trained
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)





In [16]:
def generate_content_from_title(title):
    """Generate text for a headline with the notebook's `tokenizer` and `model`.

    The headline is embedded in a French instruction prompt, tokenized
    (truncated to 512 tokens) and passed to `model.generate` with a 4-beam
    search capped at a length of 500. The first returned sequence is decoded
    back to text.

    Args:
        title: Headline inserted into the prompt.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    prompt = f"Rédige un article d'analyse en français basé sur ce titre : {title}."
    encoded = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    generation_options = {"max_length": 500, "num_beams": 4, "early_stopping": True}
    generated = model.generate(encoded.input_ids, **generation_options)

    # Decode the generated text (first sequence of the returned batch).
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Attach a generated text to every scraped article, stored under 'content'.
for article in ft_news_scraped:
    title = article['title']
    article.update(content=generate_content_from_title(title))

# Print each article followed by a 40-dash separator line.
for article in ft_news_scraped:
    print(
        f"Title: {article['title']}",
        f"Link: {article['link']}",
        f"Category: {article['tag']}",
        f"Content: {article['content']}",
        "-" * 40,
        sep="\n",
    )

Title: FBI probes whether Silicon Valley venture firm passed secrets to China
Link: https://www.ft.com/content/d94a5467-ebf9-4992-af13-3e71061707a4
Category: Industrial espionage
Content: Rédige un article d'analyse en français basé sur ce titre. FBI probes whether Silicon Valley venture firm passed secrets to China. FBI: Silicon Valley firm passed secret to China, according to reports. C'est la première version d'un article dans laquelle l'on analyse.
----------------------------------------
Title: Google files Brussels complaint against Microsoft cloud business
Link: https://www.ft.com/content/65567a16-434c-4865-9098-2cc8a0c76f68
Category: EU tech regulation
Content: Rédige un article d'analyse en français basé sur ce titre : Google files Brussels complaint against Microsoft cloud business. Google files complaints against Microsoft over cloud business in Brussels. Google says it has no plans to pull out of the cloud market.
----------------------------------------
Title: Uber’s next 