In [None]:
import os

import bs4 as bs
import pandas as pd
import requests
from tqdm import tqdm

In [None]:
#
# FETCHING SETUP
#
# api params and url - key needed from azure
# https://www.microsoft.com/en-us/bing/apis/bing-web-search-api

subscription_key = "<key goes here>"

search_url = "https://api.bing.microsoft.com/v7.0/news/search"
headers = {"Ocp-Apim-Subscription-Key" : subscription_key}
params  = {"q": 'placeholder', "textDecorations": True, "textFormat": "HTML", 'count':100, 'mkt': 'fr-FR'}

def search(search_term):
    """Query the news search endpoint and return the articles as a DataFrame.

    Parameters
    ----------
    search_term : str
        Text sent as the ``q`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the ``value`` list of the JSON response; an
        empty frame when the response carries no ``value`` entries.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    requests.Timeout
        If the server does not answer within 30 seconds.
    """
    # Build the parameters for this request only, so the shared module-level
    # defaults are never modified by a call.
    query_params = {**params, "q": search_term}
    # Without a timeout a stalled connection would block the fetching loop forever.
    response = requests.get(search_url, headers=headers, params=query_params, timeout=30)
    response.raise_for_status()
    # The articles are the list stored under the top-level "value" key.
    return pd.DataFrame(response.json().get("value", []))

In [None]:
#
# DATA FETCHING
#
# Loop over all companies; every company's results are also dumped to their
# own CSV file so that fetched data survives a failure later in the loop.

companies = pd.read_csv('data/external/top_companies.txt', sep='\t')

# to_csv does not create missing directories, so make sure the dump folder exists.
os.makedirs('dump', exist_ok=True)

all_data = []
for company in tqdm(companies['Entreprise']):
    df = search(company)
    df['company'] = company
    all_data.append(df)
    # A path separator inside a company name would otherwise be interpreted
    # as a sub-directory of the dump folder and make the write fail.
    dump_name = str(company).replace('/', '_').replace('\\', '_')
    df.to_csv('dump/%s.csv' % dump_name)

# Merge all articles, then attach each company's activity label.
all_articles = pd.concat(all_data)
all_articles = all_articles.merge(
    companies[['Entreprise', 'Activité']],
    left_on='company', right_on='Entreprise'
    ).drop('Entreprise', axis=1)

In [None]:
#
# DATA CLEANUP AND SAVE
#

# remove html characters
def clean_html(x):
    """Return the text content of an HTML fragment with the markup removed.

    Parameters
    ----------
    x : str
        HTML fragment to parse with BeautifulSoup's ``html.parser``.

    Returns
    -------
    str
        The text extracted from ``x``. Missing values (``None`` or a float
        NaN, as pandas uses for absent fields) are returned unchanged
        instead of raising.
    """
    # `x != x` is only true for NaN; missing cells have no markup to strip.
    if x is None or (isinstance(x, float) and x != x):
        return x
    soup = bs.BeautifulSoup(x, 'html.parser')
    return soup.get_text()

# Plain-text versions of the article title and description.
all_articles['name_clean'] = all_articles['name'].apply(clean_html)
all_articles['clean_description'] = all_articles['description'].apply(clean_html)

# The activity label ends with a code in parentheses: keep what follows the
# last "(" up to the next ")".
all_articles['naf_clean'] = all_articles['Activité'].apply(
    lambda activity: activity.rpartition('(')[2].partition(')')[0]
)

# Lookup table from the id_5 code (used as index) to its id_2 code.
mapping = (
    pd.read_csv('https://www.data.gouv.fr/fr/datasets/r/7bb2184b-88cb-4c6c-a408-5a0081816dcd', sep=',')
    .loc[:, ['id_2', 'id_5']]
    .set_index('id_5')
)
all_articles = pd.merge(all_articles, mapping, left_on='naf_clean', right_index=True)

# Keep only the columns needed for labelling, under shorter names, and save.
clean_articles = all_articles[['company', 'name_clean', 'clean_description', 'id_2']].rename(
    columns={'name_clean': 'title', 'clean_description': 'text', 'id_2': 'naf'}
)
clean_articles.to_csv('data/external/labelled articles cleaned.csv')