In [166]:
import requests
import json
import pandas as pd
from tqdm import tqdm
import config


# News search endpoint and the sample query used for the first, exploratory request.
search_url = "https://api.bing.microsoft.com/v7.0/news/search"
search_term = "EIFFAGE GENIE CIVIL"
# The API key is read from the local `config` module rather than written in the notebook.
subscription_key = config.bing_key

In [118]:
# Header carrying the subscription key sent with every request.
headers = {"Ocp-Apim-Subscription-Key": subscription_key}

# Query-string parameters shared by all requests; "q" holds the search text.
# Titles and descriptions are requested with HTML text formatting, so they
# may contain markup.
params = {
    "q": search_term,
    "textDecorations": True,
    "textFormat": "HTML",
    "count": 100,
    "mkt": "fr-FR",
}

In [119]:
# One-off sample request to check the endpoint and credentials.
# `requests` has no default timeout, so without one a stalled connection
# would block this cell forever; 30 s is a generous upper bound.
response = requests.get(search_url, headers=headers, params=params, timeout=30)
# Turn HTTP error statuses (4xx / 5xx) into an exception instead of parsing an error body.
response.raise_for_status()
# Keep the decoded payload re-serialised as a JSON string.
search_results = json.dumps(response.json())

In [120]:
def search(search_term):
    """Query the news search endpoint for ``search_term`` and return the hits.

    The request uses the module-level ``search_url`` and ``headers``, and a
    copy of the module-level ``params`` with ``"q"`` replaced by
    ``search_term``; the shared ``params`` dict itself is left untouched.

    Parameters
    ----------
    search_term : str
        Text to search for.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``"value"`` list. The frame is
        empty when the list is empty or the key is absent.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    requests.Timeout
        If the service does not answer within 30 seconds.
    """
    # Copy instead of mutating the shared dict so one call cannot leak its
    # query into later requests; existing keys keep their position.
    query = dict(params, q=search_term)
    # `requests` never times out by default; bound the wait explicitly.
    response = requests.get(search_url, headers=headers, params=query, timeout=30)
    response.raise_for_status()
    # Build the frame straight from the list of article records; a payload
    # without "value" yields an empty frame rather than a KeyError.
    return pd.DataFrame(response.json().get('value', []))

In [126]:
# Load the tab-separated company list. The separator is written as the
# escape sequence '\t' rather than a raw tab character, which is invisible
# and easily converted to spaces by editors.
companies = pd.read_csv('data/external/top_companies.txt', sep='\t')

In [131]:
# Query the news API once per company, tag each result set with the company
# name, keep it in memory and dump it to its own CSV file.
all_data = []
for company in tqdm(companies['Entreprise']):
    df = search(company)
    df['company'] = company
    all_data.append(df)
    # A company name containing a path separator (e.g. "A/B") would make
    # to_csv point into a non-existent sub-directory and abort the crawl
    # part-way through; neutralise separators in the file name only.
    safe_name = str(company).replace('/', '_').replace('\\', '_')
    df.to_csv('data/dump/%s.csv' % safe_name)

100%|██████████| 1000/1000 [07:16<00:00,  2.29it/s]


In [140]:
# Stack the per-company result frames row-wise into a single table.
all_articles = pd.concat(all_data, axis=0)

In [141]:
# Attach each company's 'Activité' value to its articles by matching the
# article's 'company' against 'Entreprise', then drop the duplicated key column.
all_articles = pd.merge(
    all_articles,
    companies[['Entreprise', 'Activité']],
    left_on='company',
    right_on='Entreprise',
).drop(columns='Entreprise')

In [143]:
# Persist the articles together with their activity label (index included).
all_articles.to_csv(path_or_buf='data/external/labelled articles.csv')

In [156]:
import bs4 as bs

In [160]:
# Enable the `progress_apply` method that later cells call on pandas Series.
tqdm.pandas()

def clean_html(x):
    """Strip HTML markup from ``x`` and return the remaining plain text.

    Parameters
    ----------
    x : str
        HTML fragment to clean. ``None`` and NaN (what pandas stores for a
        missing field) are accepted as well.

    Returns
    -------
    str
        The text content of ``x`` as produced by BeautifulSoup's
        ``get_text()``. ``None`` and NaN are returned unchanged so missing
        values stay missing instead of raising inside the parser.
    """
    # `x != x` is true only for NaN; checked without importing math/numpy.
    if x is None or (isinstance(x, float) and x != x):
        return x
    soup = bs.BeautifulSoup(x, 'html.parser')
    return soup.get_text()

In [179]:
# Plain-text versions of the article title and description.
all_articles['name_clean'] = all_articles['name'].progress_apply(clean_html)
all_articles['clean_description'] = all_articles['description'].progress_apply(clean_html)

# Keep what sits between the last '(' and the first ')' that follows it in 'Activité'.
all_articles['naf_clean'] = all_articles['Activité'].apply(
    lambda label: label.rsplit('(', 1)[-1].split(')', 1)[0]
)

100%|██████████| 50116/50116 [00:01<00:00, 28135.28it/s]
100%|██████████| 50116/50116 [00:02<00:00, 22158.62it/s]


In [188]:
# Lookup table downloaded from data.gouv.fr: 'id_2' values indexed by 'id_5'.
mapping = (
    pd.read_csv(
        'https://www.data.gouv.fr/fr/datasets/r/7bb2184b-88cb-4c6c-a408-5a0081816dcd',
        sep=',',
    )
    .loc[:, ['id_2', 'id_5']]
    .set_index('id_5')
)

# Add 'id_2' to every article whose 'naf_clean' matches an 'id_5' in the
# lookup (default inner join, so unmatched rows are dropped).
all_articles = pd.merge(all_articles, mapping, left_on='naf_clean', right_index=True)

In [192]:
# Keep only the columns needed for the labelled dataset.
clean_articles = all_articles.loc[
    :, ['company', 'name_clean', 'clean_description', 'id_2']
]

In [193]:
# Give the kept columns their final names.
clean_articles = clean_articles.rename(
    columns={
        'name_clean': 'title',
        'clean_description': 'text',
        'id_2': 'naf',
    }
)

In [195]:
# Write the cleaned, labelled articles to disk (index included).
clean_articles.to_csv(path_or_buf='data/external/labelled articles cleaned.csv')