In [1]:
import pandas as pd
import numpy as np

In [2]:
# Importing final dataset used by topic clustering model 

clean = pd.read_csv('clean.csv').drop(columns=['Unnamed: 0'])

In [3]:
clean.shape

(4328, 7)

In [4]:
clean.head()

Unnamed: 0,outlet,url,title,authors,publish_date,text,keywords
0,cbc,https://www.cbc.ca/news/covid-19,CBC News | Information about COVID-19 in Canada,'Darren Bernhardt',,Sylvan Lake to clamp down on beach crowds with...,[]
1,cbc,https://www.cbc.ca/news/local,CBC News,'Andrew Kurjata',,What you need to know about COVID-19 in Ottawa...,[]
2,cbc,https://www.cbc.ca/news/politics,Politics,'John Paul Tasker',,WE Charity contract could have been worth up t...,[]
3,cbc,https://www.cbc.ca/news/indigenous,Indigenous,'Jessica Deer',,Manitoba judge points to systemic issues in In...,[]
4,cbc,https://www.cbc.ca/news/business,Business,'Pete Evans',,Twitter says about 130 accounts were targeted ...,[]


There are 21 unique news outlets in this dataset.

In [5]:
clean['outlet'].unique()

array(['cbc', 'ctvnews', 'nationalpost', 'torontosun', 'thestar', 'cp24',
       'mapleridgenews', 'tricitynews', 'langleyadvancetimes', 'abbynews',
       'theprogress', 'northdeltareporter', 'surreynowleader',
       'vancouverobserver', 'vancourier', 'nsnews', 'richmond-news',
       'burnabynow', 'newwestrecord', 'bowenislandundercurrent',
       'The Record'], dtype=object)

**Options for location-based coordinates for Demo Day:**

1. Count number of locations in the text and find coordinates of most frequent 
    - spacy API
    
2. Use location of outlet (21 unique locations) 

**Pipeline:**

- For all text that have GPE (location) items,
        - Count occurrences of location names in text
        - Select most frequently mentioned location  **(1A)**

- For text without GPE item,
        - Select location of news outlet  **(1B)**
        
        
- Find coordinates of location **(2)**
        - use mapbox API and forward geocoding (https://docs.mapbox.com/api/search/#forward-geocoding)
        - Add noise to lat and long coordinates 

## (1) Determining location of article

In [6]:
import spacy

### Create dictionary of news outlets and location

In [7]:
def find_location(text, model):
    
    doc = model(text)
    final_loc = ''
    locations = []
    
    for ent in doc.ents:
        if (ent.label_ == 'GPE'):
            locations.append(ent.text)
            #print(ent.text, ent.start_char, ent.end_char, ent.label_)
    
    if len(locations) > 0:
        loc_counts = pd.DataFrame([[x, locations.count(x)] for x in set(locations)])
        loc_counts.columns = ['location', 'count']
        loc_counts = loc_counts.sort_values(by=['count'], ascending=False)
    
        final_loc = loc_counts.iloc[0, 0]
        
    return final_loc   

### Creating new .csv of articles with locations

In [8]:
new_clean = clean

In [9]:
# Model to use for location 

nlp = spacy.load("en_core_web_sm")

In [10]:
count_spacy = 0
count_outlet = 0

In [None]:
for i in range(len(new_clean)):
    
    # Finding location of article
    print('Row: ' + str(i))
    text_loc = find_location(new_clean.loc[i, 'text'], nlp)
    
    if text_loc=='Surrey':
        text_loc = 'Surrey, British Columbia'
    elif text_loc=='B.C.':
        text_loc = 'British Columbia'
        
    print('Location from spacy: ' + text_loc)
    
    if text_loc != '': 
        try: 
            new_clean.loc[i, 'location'] = text_loc        # (1A)
            loc = gl.geocode(text_loc)
            lat, long = loc.latitude, loc.longitude
            
            print('Coordinates from geopy: [' + str(lat) + ', ' + str(long) + ']' )
            
            if not lat: 
                new_clean.loc[i, 'lat'] = lat
                new_clean.loc[i, 'long'] = long
            else: 
                new_clean.loc[i, 'lat'] = 0.0
                new_clean.loc[i, 'long'] = 0.0
                
            count_spacy +=1
            
        except:
            print("Error adding text_loc as location.")
            
    else:
        try:
            new_clean.loc[i, 'location'] = loc_dict[new_clean.loc[i, 'outlet']]      # (1B)
            new_clean.loc[i, 'lat'] = lat_dict[new_clean.loc[i, 'outlet']] 
            new_clean.loc[i, 'long'] = long_dict[new_clean.loc[i, 'outlet']]  
            count_outlet += 1
        except:
            print("Error calling from dictionaries.")

Row: 0
Location from spacy: Alberta
Coordinates from geopy: [55.001251, -115.002136]
Row: 1
Location from spacy: Ottawa
Coordinates from geopy: [45.421106, -75.690308]
Row: 2
Location from spacy: 
Row: 3
Location from spacy: Manitoba
Coordinates from geopy: [55.001251, -97.001038]
Row: 4
Location from spacy: 
Row: 5
Location from spacy: 
Row: 6
Location from spacy: 
Row: 7
Location from spacy: N.S.
Coordinates from geopy: [24.4349856, 105.5550331]
Row: 8
Location from spacy: 
Row: 9
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 10
Location from spacy: Greece
Coordinates from geopy: [38.9953683, 21.9877132]
Row: 11
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 12
Location from spacy: 
Row: 13
Location from spacy: Moscow
Coordinates from geopy: [55.7504461, 37.6174943]
Row: 14
Location from spacy: Jordan
Coordinates from geopy: [31.1667049, 36.941628]
Row: 15
Location from spacy: Canada
Coordinates from geopy: [6

Location from spacy: Italy
Coordinates from geopy: [42.6384261, 12.674297]
Row: 102
Location from spacy: Toronto
Coordinates from geopy: [43.6534817, -79.3839347]
Row: 103
Location from spacy: Orchestre Métropolitain
Error adding text_loc as location.
Row: 104
Location from spacy: Montreal
Coordinates from geopy: [45.4972159, -73.6103642]
Row: 105
Location from spacy: New Orleans
Coordinates from geopy: [29.9499323, -90.0701156]
Row: 106
Location from spacy: Victoria
Coordinates from geopy: [-36.5986096, 144.6780052]
Row: 107
Location from spacy: Toronto
Coordinates from geopy: [43.6534817, -79.3839347]
Row: 108
Location from spacy: Cambodia
Coordinates from geopy: [13.5066394, 104.869423]
Row: 109
Location from spacy: Xoxox
Error adding text_loc as location.
Row: 110
Location from spacy: Alessia
Coordinates from geopy: [40.6855514, 14.7325899]
Row: 111
Location from spacy: 
Row: 112
Location from spacy: Weeknd
Coordinates from geopy: [10.4273236, -85.0943199]
Row: 113
Location from sp

Coordinates from geopy: [45.421106, -75.690308]
Row: 199
Location from spacy: 
Row: 200
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 201
Location from spacy: Ottawa
Coordinates from geopy: [45.421106, -75.690308]
Row: 202
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 203
Location from spacy: 
Row: 204
Location from spacy: Alberta
Coordinates from geopy: [55.001251, -115.002136]
Row: 205
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 206
Location from spacy: B.C.
Coordinates from geopy: [52.1591116, 9.9514374]
Row: 207
Location from spacy: 
Row: 208
Location from spacy: McKnight
Coordinates from geopy: [40.5550681, -80.0364448]
Row: 209
Location from spacy: Edmonton
Coordinates from geopy: [53.535411, -113.507996]
Row: 210
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 211
Location from spacy: Edmonton
Coordinates from geopy: [53.535411, 

Coordinates from geopy: [61.0666922, -107.9917071]
Row: 303
Location from spacy: Vancouver
Coordinates from geopy: [49.2608724, -123.1139529]
Row: 304
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 305
Location from spacy: B.C.
Coordinates from geopy: [52.1591116, 9.9514374]
Row: 306
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 307
Location from spacy: Langley
Coordinates from geopy: [48.3604648, 6.3258262]
Row: 308
Location from spacy: 
Row: 309
Location from spacy: New Westminster
Coordinates from geopy: [49.2067726, -122.9108818]
Row: 310
Location from spacy: Vancouver
Coordinates from geopy: [49.2608724, -123.1139529]
Row: 311
Location from spacy: South Burnaby
Coordinates from geopy: [49.2433804, -122.9725459]
Row: 312
Location from spacy: Coquitlam
Coordinates from geopy: [49.2842958, -122.793281]
Row: 313
Location from spacy: B.C.
Coordinates from geopy: [52.1591116, 9.9514374]
Row: 314
Location from spa

Coordinates from geopy: [45.1895845, -88.73215845]
Row: 412
Location from spacy: Edinburgh
Error adding text_loc as location.
Row: 413
Location from spacy: U.K.
Error adding text_loc as location.
Row: 414
Location from spacy: Selma
Coordinates from geopy: [32.4078632, -87.0207473]
Row: 415
Location from spacy: Traore
Coordinates from geopy: [16.068147, 0.0648645]
Row: 416
Location from spacy: China
Coordinates from geopy: [35.000074, 104.999927]
Row: 417
Location from spacy: Calgary
Coordinates from geopy: [51.0534234, -114.0625892]
Row: 418
Location from spacy: Calgary
Coordinates from geopy: [51.0534234, -114.0625892]
Row: 419
Location from spacy: Alberta
Coordinates from geopy: [55.001251, -115.002136]
Row: 420
Location from spacy: Alberta
Coordinates from geopy: [55.001251, -115.002136]
Row: 421
Location from spacy: Calgary
Coordinates from geopy: [51.0534234, -114.0625892]
Row: 422
Location from spacy: Ottawa
Coordinates from geopy: [45.421106, -75.690308]
Row: 423
Location from s

Coordinates from geopy: [50.44876, -104.61731]
Row: 515
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 516
Location from spacy: India
Coordinates from geopy: [22.3511148, 78.6677428]
Row: 517
Location from spacy: Wisconsin
Coordinates from geopy: [44.4308975, -89.6884637]
Row: 518
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 519
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 520
Location from spacy: OTTAWA
Coordinates from geopy: [45.421106, -75.690308]
Row: 521
Location from spacy: B.C.
Coordinates from geopy: [52.1591116, 9.9514374]
Row: 522
Location from spacy: B.C.
Coordinates from geopy: [52.1591116, 9.9514374]
Row: 523
Location from spacy: the City of Victoria's
Coordinates from geopy: [10.9013412, 123.0714943]
Row: 524
Location from spacy: B.C.
Coordinates from geopy: [52.1591116, 9.9514374]
Row: 525
Location from spacy: catamaran
Coordinates from geopy: [-43.5519452,

Coordinates from geopy: [45.966425, -66.645813]
Row: 611
Location from spacy: New-Brunswick
Coordinates from geopy: [46.500283, -66.750183]
Row: 612
Location from spacy: 
Row: 613
Location from spacy: N.S.
Coordinates from geopy: [24.4349856, 105.5550331]
Row: 614
Location from spacy: Ontario
Coordinates from geopy: [50.000678, -86.000977]
Row: 615
Location from spacy: N.B.
Coordinates from geopy: [39.830767, -84.923802]
Row: 616
Location from spacy: Ottawa
Coordinates from geopy: [45.421106, -75.690308]
Row: 617
Location from spacy: Ottawa
Coordinates from geopy: [45.421106, -75.690308]
Row: 618
Location from spacy: Ottawa
Coordinates from geopy: [45.421106, -75.690308]
Row: 619
Location from spacy: Ontario
Coordinates from geopy: [50.000678, -86.000977]
Row: 620
Location from spacy: Ottawa
Coordinates from geopy: [45.421106, -75.690308]
Row: 621
Location from spacy: Ottawa
Coordinates from geopy: [45.421106, -75.690308]
Row: 622
Location from spacy: Centretown
Coordinates from geopy:

Coordinates from geopy: [45.1895845, -88.73215845]
Row: 711
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 712
Location from spacy: Trikafta
Error adding text_loc as location.
Row: 713
Location from spacy: Canada
Coordinates from geopy: [61.0666922, -107.9917071]
Row: 714
Location from spacy: Calgary
Coordinates from geopy: [51.0534234, -114.0625892]
Row: 715
Location from spacy: Barrie
Coordinates from geopy: [44.3893113, -79.6901736]
Row: 716
Location from spacy: Thornhill
Coordinates from geopy: [38.2884032, -85.6257945]
Row: 717
Location from spacy: 
Row: 718
Location from spacy: 
Row: 719
Location from spacy: 
Row: 720
Location from spacy: Alberta
Coordinates from geopy: [55.001251, -115.002136]
Row: 721
Location from spacy: Toronto
Coordinates from geopy: [43.6534817, -79.3839347]
Row: 722
Location from spacy: Ontario
Coordinates from geopy: [50.000678, -86.000977]
Row: 723
Location from spacy: Toronto
Coordinates from geopy: [43.6534817, -79.

### Exploring dataset with locations (new_clean.csv)

In [12]:
count_outlet

0

In [40]:
new_clean.head()

Unnamed: 0,outlet,url,title,authors,publish_date,text,keywords,location,lat,long
0,cbc,https://www.cbc.ca/news/covid-19,CBC News | Information about COVID-19 in Canada,'Darren Bernhardt',,Sylvan Lake to clamp down on beach crowds with...,[],Alberta,,
1,cbc,https://www.cbc.ca/news/local,CBC News,'Andrew Kurjata',,What you need to know about COVID-19 in Ottawa...,[],Ottawa,,
2,cbc,https://www.cbc.ca/news/politics,Politics,'John Paul Tasker',,WE Charity contract could have been worth up t...,[],Toronto,45.518055,-73.55093
3,cbc,https://www.cbc.ca/news/indigenous,Indigenous,'Jessica Deer',,Manitoba judge points to systemic issues in In...,[],Manitoba,,
4,cbc,https://www.cbc.ca/news/business,Business,'Pete Evans',,Twitter says about 130 accounts were targeted ...,[],Toronto,45.518055,-73.55093


In [14]:
new_clean['location'].value_counts()

Canada       4
Edinburgh    1
Manitoba     1
Jordan       1
Moscow       1
Alberta      1
Britain      1
Montreal     1
Greece       1
Ottawa       1
N.S.         1
Name: location, dtype: int64

**Notes:**
- Change "Surrey" to "Surrey, British Columbia"
- Change "B.C." to "British Columbia"

## Exploring geo.csv (locations of outlets) 

In [15]:
geo = pd.read_csv('geo.csv')

In [16]:
clean.columns

Index(['outlet', 'url', 'title', 'authors', 'publish_date', 'text', 'keywords',
       'location'],
      dtype='object')

In [17]:
geo.columns = ['outlet', 'url', 'title', 'authors', 'publish_date', 'text', 'keywords', 'scope', 'location', 'latitude', 'longitude']

In [18]:
geo.head()

Unnamed: 0,outlet,url,title,authors,publish_date,text,keywords,scope,location,latitude,longitude
0,ctvnews,https://www.ctvnews.ca/health/coronavirus/trac...,Tracking every case of COVID-19 in Canada,,2020-03-13 14:17:00-04:00,July 17 – Ontario health officials recorded 11...,[],national,Ottawa,-75.697472,45.421861
1,ctvnews,https://www.ctvnews.ca/politics/liberals-revis...,"Liberals revise COVID-19 wage subsidy, ease el...",,2020-07-17 10:22:00-04:00,OTTAWA -- Finance Minister Bill Morneau says t...,[],national,Ottawa,-75.697472,45.421861
2,ctvnews,https://www.ctvnews.ca/health/coronavirus/new-...,New Normal: Casinos betting on temperature che...,,2020-07-17 12:03:00-04:00,TORONTO -- Canadian gamblers are trading in ca...,[],national,Ottawa,-75.697472,45.421861
3,ctvnews,https://www.ctvnews.ca/health/coronavirus/it-s...,'It's amazing': Toronto man recovering after 1...,,2020-07-15 22:00:00-04:00,"TORONTO -- For months, Bruno Iozzo’s family wa...",[],national,Ottawa,-75.697472,45.421861
4,ctvnews,https://www.ctvnews.ca/health/coronavirus/ariz...,Arizona man in a coma from coronavirus wakes u...,'Madeline Holcombe',2020-07-16 08:15:00-04:00,"For Eddie Case, recovering from COVID-19 comes...",[],national,Ottawa,-75.697472,45.421861


### Creates dictionaries from scope and location of each outlet

In [19]:
group = geo.groupby('outlet')

In [20]:
# Not all 

scope_dict = group.apply(lambda x: x['scope'].unique()).apply(pd.Series).loc[:, 0].to_dict()

In [21]:
loc_dict = group.apply(lambda x: x['location'].unique()).apply(pd.Series).loc[:, 0].to_dict()

In [22]:
lat_dict = group.apply(lambda x: x['latitude'].unique()).apply(pd.Series).loc[:, 0].to_dict()

In [23]:
long_dict = group.apply(lambda x: x['longitude'].unique()).apply(pd.Series).loc[:, 0].to_dict()

In [24]:
loc_dict['cbc'] = 'Toronto'
loc_dict['tricitynews'] = 'Port Coquitlam'
loc_dict['vancouverobserver'] = 'Vancouver'
loc_dict['vancourier'] = 'Vancouver'
loc_dict['richmond-news'] = 'Richmond'
loc_dict['burnabynow'] = 'Burnaby'
loc_dict['newwestrecord'] = 'Burnaby'
loc_dict['bowenislandundercurrent'] = 'Bowen Island'

In [25]:
lat_dict['cbc'] = 45.518055
lat_dict['tricitynews'] = 49.247012
lat_dict['vancouverobserver'] = -123.10096000000001
lat_dict['vancourier'] = 49.266922
lat_dict['richmond-news'] = 49.174152
lat_dict['burnabynow'] = 49.252834, 
lat_dict['newwestrecord'] = 49.252857
lat_dict['bowenislandundercurrent'] = 49.379632

In [26]:
long_dict['cbc'] = -73.550930
long_dict['tricitynews'] = -122.760220
long_dict['vancouverobserver'] = 49.318767
long_dict['vancourier'] = -123.110863
long_dict['richmond-news'] = -123.119285
long_dict['burnabynow'] = -122.917969
long_dict['newwestrecord'] = -122.916369
long_dict['bowenislandundercurrent'] = -123.338253

In [27]:
loc_dict

{'The Record': 'Kitchener',
 'abbynews': 'Abbotsford',
 'cp24': 'Toronto',
 'ctvnews': 'Ottawa',
 'langleyadvancetimes': 'Langley',
 'mapleridgenews': 'Maple Ridge',
 'nationalpost': 'Toronto',
 'northdeltareporter': 'Delta',
 'nsnews': 'Vancouver',
 'surreynowleader': 'Surrey',
 'theprogress': 'Chilliwack',
 'thestar': 'Toronto',
 'torontosun': 'Toronto',
 'cbc': 'Toronto',
 'tricitynews': 'Port Coquitlam',
 'vancouverobserver': 'Vancouver',
 'vancourier': 'Vancouver',
 'richmond-news': 'Richmond',
 'burnabynow': 'Burnaby',
 'newwestrecord': 'Burnaby',
 'bowenislandundercurrent': 'Bowen Island'}

In [28]:
lat_dict

{'The Record': -80.486177,
 'abbynews': -122.272222,
 'cp24': -79.390167,
 'ctvnews': -75.69747199999999,
 'langleyadvancetimes': -122.661056,
 'mapleridgenews': -122.595583,
 'nationalpost': -79.377972,
 'northdeltareporter': -122.798996,
 'nsnews': -123.10096000000001,
 'surreynowleader': -122.799361,
 'theprogress': -121.956167,
 'thestar': -79.540139,
 'torontosun': -79.377972,
 'cbc': 45.518055,
 'tricitynews': 49.247012,
 'vancouverobserver': -123.10096000000001,
 'vancourier': 49.266922,
 'richmond-news': 49.174152,
 'burnabynow': (49.252834,),
 'newwestrecord': 49.252857,
 'bowenislandundercurrent': 49.379632}

### Exploring geopy API for coordinates

In [29]:
from geopy.geocoders import Nominatim

gl = Nominatim(user_agent='newsworthy_ml')

In [35]:
loc = gl.geocode('Kanye West')

In [36]:
loc.address

'Kanye, Southern District, PLOT, Botswana'

In [37]:
loc.latitude

-24.9766112

In [38]:
loc.longitude

25.3358859