In [68]:
# Imports 

import requests # HTTP requests 
from bs4 import BeautifulSoup # Extract HTML content

import numpy as np

In [69]:
# Simple scraper

def scrape_tech_news(url='https://www.ft.com/technology', timeout=10):
    """Scrape the article teasers listed on an FT section page.

    Parameters
    ----------
    url : str, optional
        Page to scrape. Defaults to the FT technology section.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the notebook.

    Returns
    -------
    list of dict
        One ``{'title', 'link', 'tag'}`` dict per teaser found on the page.
        The list is empty when the server answers with a non-200 status, so
        callers can always slice or iterate over the result.

    Raises
    ------
    requests.RequestException
        Network errors and timeouts are propagated unchanged.
    """
    from urllib.parse import urljoin  # stdlib; local so the cell stands alone

    # A browser-like User-Agent is sent with the request.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=timeout)

    # Report the failure and hand back an empty list instead of None.
    if response.status_code != 200:
        print(f"Échec du scraping : {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    news = []
    # Each teaser on the page is wrapped in a <div class="o-teaser__content">.
    for article in soup.find_all('div', class_='o-teaser__content'):
        # Title and link come from the same heading anchor.
        heading = article.find('a', class_='js-teaser-heading-link')
        title = "No title"
        link = "No link"
        if heading is not None:
            title = heading.get_text(strip=True)
            href = heading.get('href')
            if href:
                # urljoin resolves site-relative paths against the page URL
                # and leaves already-absolute links untouched.
                link = urljoin(url, href)

        # The category is exposed through the tag anchor's aria-label.
        tag_anchor = article.find('a', class_='o-teaser__tag')
        tag = 'No cat'
        if tag_anchor is not None:
            tag = tag_anchor.get('aria-label', 'No cat')
        tag = tag.replace('Category: ', '')

        news.append({
            'title': title,
            'link': link,
            'tag': tag,
        })

    return news

# Run the scraper once. Fall back to an empty list so the preview below
# never tries to slice None when the request fails.
ft_news_scraped = scrape_tech_news() or []

# Preview the first two scraped articles
for article in ft_news_scraped[0:2]:
    print(f"Title: {article['title']}")
    print(f"Link: {article['link']}")
    print(f"Category: {article['tag']}")
    print("-" * 40)


Title: FBI probes whether Silicon Valley venture firm passed secrets to China
Link: https://www.ft.com/content/d94a5467-ebf9-4992-af13-3e71061707a4
Category: Industrial espionage
----------------------------------------
Title: Google files Brussels complaint against Microsoft cloud business
Link: https://www.ft.com/content/65567a16-434c-4865-9098-2cc8a0c76f68
Category: EU tech regulation
----------------------------------------


In [70]:
# Pre-trained model to generate text 

from openai import OpenAI # I won't use openai cause there is a limit for the requests. 
from transformers import T5ForConditionalGeneration, T5Tokenizer # Prompt issue
from transformers import BartForConditionalGeneration, BartTokenizer


import os  # stdlib; used to read the API key from the environment

# OpenAI client. The import comment above says OpenAI is not used because of
# request limits, so the client is optional: it is only built when the
# OPENAI_API_KEY environment variable is set, and the key is never written
# into the notebook itself.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=_openai_api_key) if _openai_api_key else None

# # T5 model
# model_name = 't5-base'
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name)

# Bart model: `tokenizer` and `model` are module-level names read by the
# generation function defined in a later cell.
model_name = "facebook/bart-large-cnn" # already trained
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)



In [71]:
# Generation function

def generate_content_from_title(title, max_length=2000, num_beams=4):
    """Generate text about ``title`` with the module-level ``model``/``tokenizer``.

    Parameters
    ----------
    title : str
        Article title inserted into the prompt.
    max_length : int, optional
        Upper bound, in tokens, for both the encoded prompt and the generated
        sequence. It is capped to what the loaded model and tokenizer report
        they support, so the default of 2000 can no longer ask for more
        positions than the model has.
    num_beams : int, optional
        Beam-search width passed to ``model.generate``.

    Returns
    -------
    str
        The decoded first generated sequence, with special tokens removed.
    """
    # NOTE(review): the captured outputs show the model restating the title
    # rather than following the instruction - confirm this prompt is useful
    # with the checkpoint that is loaded.
    prompt = f"Write a tech article on this subject : {title}."

    # Sequences longer than the model's position-embedding table cannot be
    # encoded or generated, so clamp the requested length to the real limits.
    position_limit = getattr(model.config, "max_position_embeddings", max_length)
    input_limit = min(max_length, tokenizer.model_max_length, position_limit)
    output_limit = min(max_length, position_limit)

    inputs = tokenizer(prompt, return_tensors="pt", max_length=input_limit, truncation=True)

    # Pass the attention mask explicitly so generation does not have to infer it.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=output_limit,
        num_beams=num_beams,
    )

    # Decode the generated token ids back into text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


import re  # stdlib; used for the sentence-aware line breaks below

# Matches a sentence-ending ". " but not a period that follows a single
# letter, so abbreviations such as "U.S. " or "e.g. " are not split across
# lines the way a plain str.replace('. ', ...) splits them.
_SENTENCE_END = re.compile(r'(?<!\b[A-Za-z])\. ')

# Generate content for the first two scraped articles and store it on each dict
for article in ft_news_scraped[0:2]:
    title = article['title']
    generated = generate_content_from_title(title)
    article['content'] = _SENTENCE_END.sub('. \n', generated)  # one sentence per line

# Print
for article in ft_news_scraped[0:2]:
    print(f"Title: {article['title']}")
    print(f"Link: {article['link']}")
    print(f"Category: {article['tag']}")
    print(f"Content: {article['content']}")
    print('\n', "=" * 100, '\n')

Title: FBI probes whether Silicon Valley venture firm passed secrets to China
Link: https://www.ft.com/content/d94a5467-ebf9-4992-af13-3e71061707a4
Category: Industrial espionage
Content: FBI probes whether Silicon Valley venture firm passed secrets to China. 
FBI probing whether venture firm pass secrets to Chinese government. 
FBI also probing whether firm helped Chinese government spy on U.S. 
citizens. 
FBI investigating whether firm assisted Chinese government in spying on American citizens in Silicon Valley.


Title: Google files Brussels complaint against Microsoft cloud business
Link: https://www.ft.com/content/65567a16-434c-4865-9098-2cc8a0c76f68
Category: EU tech regulation
Content: Google files Brussels complaint against Microsoft cloud business. 
Google says Microsoft's cloud business is a threat to its business model. 
Microsoft says it has no plans to change its cloud business model in the near future. 
The company says it will continue to provide cloud services to custom