# Imports

In [1]:
import requests
import json
import pandas as pd
import sys
from newspaper import Article

# Display options
def _pretty_displayhook(value):
    """Echo an expression result: dicts as indented JSON, anything else via ``repr``.

    ``None`` is skipped, as the default ``sys.displayhook`` does, so expressions
    that evaluate to ``None`` do not print a stray "None".
    """
    if value is None:
        return
    if isinstance(value, dict):
        # default=str keeps dicts holding non-JSON values (dates, sets, ...) printable
        # instead of raising TypeError.
        print(json.dumps(value, indent=2, default=str))
    else:
        print(repr(value))


sys.displayhook = _pretty_displayhook

# This notebook builds the dataset used to fine-tune my BART model, which generates article content from titles.



# API keys management

In [2]:
# Load the credentials sheet; the lookup below relies on an 'API' column
# (service name) and a 'Key' column (the secret for that service).
api_keys = pd.read_excel('../API_Keys.xlsx')

news_api_keys = api_keys.loc[api_keys['API'] == 'News API', 'Key']
if news_api_keys.empty:
    # Fail with an actionable message instead of a bare IndexError on an empty selection.
    raise KeyError("No 'News API' row found in ../API_Keys.xlsx")

# First matching row wins if the sheet lists the service more than once.
key = news_api_keys.iloc[0]

# Data scraping

In [3]:
# Scraping function

def get_tech_news(category='technology', language='en', timeout=10):
    """Fetch the current top headlines from News API.

    The API key is read from the module-level ``key`` variable and sent in the
    ``X-Api-Key`` header rather than the query string, so it cannot end up in
    a printed URL or exception message.

    Args:
        category: News API category to query (default ``'technology'``).
        language: Language code of the headlines (default ``'en'``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value of the ``articles`` field of the JSON response on success
        (``None`` if the field is absent), or ``False`` when the request
        fails or the API answers with a non-200 status.
    """
    try:
        response = requests.get(
            'https://newsapi.org/v2/top-headlines',
            params={'category': category, 'language': language},
            headers={'X-Api-Key': key},
            # Without a timeout, requests can block indefinitely on a stalled connection.
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Same best-effort contract as the non-200 branch: report and return False.
        print(f'Error: Unable to fetch articles: {e}')
        return False

    if response.status_code != 200:
        print(f'Error: Unable to fetch articles, status code: {response.status_code}')
        return False

    # Parse the answer as JSON and keep only the list of articles.
    return response.json().get('articles')

In [4]:
# The News API does not return the complete content of an article, so
# I use newspaper3k to scrape each URL without having to know the exact HTML structure of the page.

def scrape_article_content(url):
    """Download and parse the page at ``url`` and return its extracted text.

    Args:
        url: Address of the article to scrape.

    Returns:
        The ``text`` attribute of the parsed ``Article``, or ``False`` if any
        step raised an exception (the error is printed, not re-raised).
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        # Full text of the article as extracted by the parser.
        body = page.text
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip this URL.
        print(f"Error scraping {url}: {e}")
        return False
    else:
        return body

In [5]:
articles = get_tech_news()  # Fetch article metadata from News API (falsy on failure)

# Keep only the articles whose full text could actually be scraped.
articles_cleaned = []

for article in articles or []:
    url = article.get('url')
    if not url:
        # Nothing to scrape without a link.
        continue

    full_content = scrape_article_content(url)
    if not full_content:
        # The scraper returned False (error) or an empty text: skip this article.
        continue

    articles_cleaned.append(
        {
            'title': article.get('title'),
            'url': url,
            'author': article.get('author'),
            'full_content': full_content,
        }
    )

# Report once, after the whole batch has been processed.
if articles_cleaned:
    print('Articles retrieved succesfully :', len(articles_cleaned))
else:
    print('No article retrieved')

Articles retrieved succesfully : 17


# Export the articles

In [6]:
df = pd.DataFrame(articles_cleaned)

if df.empty:
    # Nothing was scraped in this run: keep any previously exported CSV
    # instead of overwriting it with an empty file.
    print('No articles to export; existing CSV left unchanged')
else:
    # to_csv defaults are kept, so the row index is written as the first column.
    df.to_csv('../articles/articles_cleaned.csv')