In [1]:
# standard library imports
import sys
import time
sys.path.append('..')
# third party imports
import pandas as pd
from tqdm import tqdm
import numpy as np
# local imports
from bbc_monitoring.api_wrapper import BBCMonitoringApi

## Using the BBC-Monitoring API to build a dataset of location/article pairs
We'll start by building a smaller test set to see how the current model fares on out-of-domain problems.

In [4]:
# Create the BBC Monitoring API client from the locally stored key file
api = BBCMonitoringApi(api_key_path='data/api_key.json')
# Read the semicolon-separated world cities table, then keep only the cities
# with a population of at least 100,000. The previous row labels are retained
# as an `index` column by reset_index().
all_cities_df = pd.read_csv('data/geonames_world_cities.csv', sep=';')
is_large_city = all_cities_df['Population'] >= 100000
cities_df = all_cities_df.loc[is_large_city].reset_index()

In [3]:
# Add the ISO3 code used by BBC-Monitoring to the city dataframe
iso_lookup = pd.read_csv('data/country_iso_lookup.csv', encoding='latin')
# NOTE(review): pandas parses the literal string "NA" as a missing value by default,
# so an ISO2 code of "NA" (Namibia) may be lost in this and the cities CSV - confirm.

# Build an ISO2 -> ISO3 mapping. Rows without an ISO2 code are dropped and only the
# first row per ISO2 code is kept, which matches the earlier "first match wins" lookup.
iso2_to_iso3 = (
    iso_lookup
    .dropna(subset=['ISO2'])
    .drop_duplicates(subset='ISO2', keep='first')
    .set_index('ISO2')['ISO3']
)

# A vectorised map replaces the row-by-row loop and its bare `except:`. Country codes
# with no match become NaN, while genuine problems (e.g. a missing column) now raise.
cities_df = cities_df.assign(iso3=cities_df['Country Code'].map(iso2_to_iso3))
# Discard the cities whose country could not be resolved to an ISO3 code
cities_df = cities_df[cities_df.iso3.notna()]

In [None]:
# build a list of (city, country) pairs
cities = list(zip(cities_df['ASCII Name'].values, cities_df.iso3.values))
# search headlines for those places. We'll also check the country is correct.
article_info = []
# (city, error) pairs for searches that failed, so failures are visible afterwards
failed_cities = []
for city, country in tqdm(cities, total=len(cities)):
    try:
        articles = api.search_headlines(city)['products']
    except Exception as err:
        # Catch Exception rather than using a bare except so Ctrl-C still stops the loop
        failed_cities.append((city, repr(err)))
        articles = []
    # Sleep to avoid hitting rate limit (60 articles/min). This runs after failed
    # requests too, so a run of errors does not hammer the API.
    time.sleep(1)
    for article in articles:
        # check that the article country is correct
        if country in article['subjectCountryIds']:
            article_info.append({'id': article['id'], 'city': city, 'country': country})

print(f'{len(failed_cities)} of {len(cities)} city searches failed')

In [20]:
# Turn the collected {'id', 'city', 'country'} records into a DataFrame and
# write them to disk (without the row index)
article_info_df = pd.DataFrame(article_info)
article_info_df.to_csv('data/article_info.csv', index=False)

In [2]:
# Reload the saved (id, city, country) records from disk.
# Note: this rebinds `article_info` from the list built during the search to a DataFrame.
article_info = pd.read_csv('data/article_info.csv')

In [None]:
# Download and parse the full text of every article found by the headline search
article_data = []
# (article id, error) pairs for articles that could not be fetched or parsed
failed_article_ids = []
for i, info in tqdm(article_info.iterrows(), total=len(article_info)):
    try:
        content = api.search_by_id(info['id'])
        text = api.parse_article(content)
    except Exception as err:
        # One bad article should not abort the whole (long-running) download;
        # record it and carry on. KeyboardInterrupt is not caught here.
        failed_article_ids.append((info['id'], repr(err)))
    else:
        # Work on a copy rather than mutating the row yielded by iterrows()
        record = info.copy()
        record['text'] = text
        article_data.append(record)
    # Pause between requests, as in the headline search loop
    time.sleep(1)

print(f'{len(failed_article_ids)} of {len(article_info)} articles could not be retrieved')
    

In [12]:
# Assemble the per-article records (id, city, country, text) into one DataFrame
article_df = pd.DataFrame(article_data)
# Preview the first rows
article_df.head(20)

Unnamed: 0,id,city,country,text
0,c204jinn,Omsk,RUS,Russia's Federal Security Service (FSB) has re...
1,c204jidk,Omsk,RUS,Vladimir Putin had a working meeting with Acti...
2,c2046uay,Omsk,RUS,President Vladimir Putin has appointed the hea...
3,c2013td1,Omsk,RUS,Russian police have searched the home of a vol...
4,c200a87d,Omsk,RUS,A riot broke out at a high-security penal colo...
5,m1cogd38,Omsk,RUS,An experimental design bureau in the Omsk Stat...
6,m1c1r60x,Omsk,RUS,"Moscow, 19 August: Russia's [armoured vehicle ..."
7,00019947,Omsk,RUS,"Omsk, 9 April: Viktor Nazarov was approved for..."
8,m1aby505,Omsk,RUS,"Moscow, 3 April: Russian President Dmitriy Med..."
9,70175789,Omsk,RUS,[Report by Anastasiya Mitkovskaya and Viktor K...


In [13]:
# Save the article texts to CSV. Note that this path is relative to the working
# directory, not the data/ folder used for the other intermediate files.
article_df.to_csv('articles_text.csv', index=False)

In [14]:
# No further API requests are made below; presumably this closes the wrapper's
# underlying session - confirm against BBCMonitoringApi
api.end_session()

To make these into workable inputs into the neural-net model we will need to do a couple of things. First, let's remove any capital cities. This will avoid articles which use the capital city as a synecdoche for a country's government, for example in the headline "Beijing steps up military pressure on Taiwan". We'll use a list of capital cities from https://github.com/icyrockcom/country-capitals/blob/master/data/country-list.csv as a lookup for capitals. We'll also use fuzzy-string matching to account for missing accents and differences in spelling.

In [16]:
from thefuzz import fuzz

In [43]:
# Reference list of capital city names
capitals_df = pd.read_csv('data/country-capital-list.csv')
capitals = capitals_df.capital.values
# Fuzzy ratio (0-100) above which a city name is treated as matching a capital
similarity_threshold = 80

# Compare every distinct article city against every capital. A city is added once
# per capital it matches, so the list can contain repeats.
cities_to_remove = [
    city
    for city in article_df.city.unique()
    for capital in capitals
    if fuzz.ratio(city, capital) > similarity_threshold
]

n_cities = len(article_df.city.unique())
n_flagged = len(set(cities_to_remove))
print(f'Total cities : {n_cities}')
print(f'Captials removed : {n_flagged}')

Total cities : 1733
Captials removed : 137


In [47]:
# get rid of the identified capitals
article_df = article_df[~article_df.city.isin(cities_to_remove)]
print(len(article_df))

13180


We now need to split the articles into full sentences (or sets of sentences) containing up to 60 words. The model requires the maximum sentence length to be 80 tokens, but we need to leave headroom for words that are split across multiple tokens and for punctuation.

In [49]:
import nltk
# Fetch the Punkt tokenizer data before nltk.sent_tokenize is used below
# (nothing is re-downloaded when the package is already up to date).
# NOTE(review): recent NLTK releases look for the 'punkt_tab' resource instead -
# confirm against the installed version.
nltk.download('punkt')

def split_into_sentences(text, max_words=60):
    """Split ``text`` into chunks of whole sentences of at most ``max_words`` words.

    Consecutive sentences (as found by ``nltk.sent_tokenize``) are packed greedily
    into a chunk for as long as the chunk's whitespace-separated word count stays
    within ``max_words``.

    Parameters
    ----------
    text : str
        The article text. Anything that is not a non-blank string (``None``, NaN,
        ``""``) yields an empty list instead of raising.
    max_words : int, optional
        Maximum number of words per chunk (default 60).

    Returns
    -------
    list of str
        The chunks in document order. A single sentence longer than ``max_words``
        is never cut; it is returned as its own, over-length chunk.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    chunks = []
    current = []      # sentences collected for the chunk being built
    current_len = 0   # number of words in `current`

    for sentence in nltk.sent_tokenize(text):
        n_words = len(sentence.split())
        # Only flush a non-empty chunk: previously an over-long first sentence
        # caused an empty string to be emitted as the first chunk.
        if current and current_len + n_words > max_words:
            chunks.append(" ".join(current).strip())
            current, current_len = [], 0
        current.append(sentence)
        current_len += n_words

    # Add the last chunk
    if current:
        chunks.append(" ".join(current).strip())

    return chunks



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jws10y\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [50]:
# Display the capital-filtered articles (pandas shows a truncated view)
article_df

Unnamed: 0,id,city,country,text
0,c204jinn,Omsk,RUS,Russia's Federal Security Service (FSB) has re...
1,c204jidk,Omsk,RUS,Vladimir Putin had a working meeting with Acti...
2,c2046uay,Omsk,RUS,President Vladimir Putin has appointed the hea...
3,c2013td1,Omsk,RUS,Russian police have searched the home of a vol...
4,c200a87d,Omsk,RUS,A riot broke out at a high-security penal colo...
...,...,...,...,...
15426,m12nz4io,Jacobabad,PAK,A suspected Al-Qa'idah member was arrested for...
15427,m12is7i6,Jacobabad,PAK,Jacobabad: Army foiled a terrorists act to des...
15428,m12isath,Jacobabad,PAK,Jacobabad: Army foiled a terrorists act to des...
15429,m12iq95c,Jacobabad,PAK,Islamabad: An unmanned US spy plane has crashe...


In [51]:
# One output record per sentence chunk, carrying the article's id/city/country
article_sentences = []

for i, row in tqdm(article_df.iterrows(), total=len(article_df)):
    for sentence in split_into_sentences(row['text']):
        # Build a fresh dict for every chunk. Re-using one dict per article made
        # every appended row point at the same object (and so at the last chunk).
        # The key is the literal 'text', not the value of a variable named `text`.
        article_sentences.append(
            {'id': row['id'], 'city': row['city'], 'country': row['country'], 'text': sentence}
        )
        

100%|██████████| 13180/13180 [00:00<00:00, 15586.95it/s]


In [55]:
# Collect the per-sentence records into a DataFrame and write the final dataset
# to CSV (relative to the working directory, without the row index)
sentences_df = pd.DataFrame(article_sentences)
sentences_df.to_csv('news_places_full.csv', index=False)