In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
with open('Case_Study3/headlines.txt') as f:
    headlines = [line.strip() for line in f.readlines()]
num_headlines = len(headlines)

In [None]:
f"{num_headlines} headlines have been loaded"

In [None]:
from unidecode import unidecode

In [None]:
import re
def name_to_regex(name):
    """Compile a case-insensitive, word-bounded pattern matching a location name.

    When `unidecode` changes the name (for example by stripping accents), the
    pattern accepts either the original spelling or the transliterated one.

    Args:
        name: Location name to search for.

    Returns:
        A compiled pattern using `re.IGNORECASE`.
    """
    decoded_name = unidecode(name)
    # Escape both spellings so regex metacharacters inside a name (".", "(",
    # "+", ...) are matched literally instead of being read as pattern syntax.
    escaped_name = re.escape(name)
    if name != decoded_name:
        regex = fr'\b({escaped_name}|{re.escape(decoded_name)})\b'
    else:
        regex = fr'\b{escaped_name}\b'
    return re.compile(regex, flags=re.IGNORECASE)

In [None]:
from geonamescache import GeonamesCache

In [None]:
# Instantiate the geonamescache lookup object used by every cell below.
gc = GeonamesCache()

In [None]:
# Country names, read from the 'name' field of each country record.
countries = [country['name'] for country in gc.get_countries().values()]

In [None]:
countries

In [None]:
# City names, read the same way from the city records.
cities = [city['name'] for city in gc.get_cities().values()]

In [None]:
cities

In [None]:
# Map each compiled name pattern back to the name it was built from.
country_to_name = {name_to_regex(name): name for name in countries}

In [None]:
# Same pattern -> name mapping, for cities.
city_to_name = {name_to_regex(name): name for name in cities}

In [None]:
country_to_name

In [None]:
# NOTE(review): `cities` was already displayed a few cells up; possibly
# `city_to_name` was intended here — confirm.
cities

In [None]:
def get_name_in_text(text, dictionary):
    """Return the alphabetically first name whose pattern occurs in `text`.

    Args:
        text: String to search.
        dictionary: Mapping of compiled regex -> name.

    Returns:
        The matching name that sorts first, or None when nothing matches.
    """
    # Candidates are tried in name order, so the first hit is the answer.
    ordered_pairs = sorted(dictionary.items(), key=lambda pair: pair[1])
    hits = (label for pattern, label in ordered_pairs if pattern.search(text))
    return next(hits, None)

In [None]:
# First pass: one country and one city per headline. get_name_in_text returns
# the alphabetically first matching name, or None when nothing matches.
matched_countries = [get_name_in_text(headline, country_to_name) for headline in headlines]
matched_cities = [get_name_in_text(headline, city_to_name) for headline in headlines]

In [None]:
# Column name -> values, aligned by headline position.
data = {'Headline': headlines, 'City': matched_cities, 'Country': matched_countries}

In [None]:
df = pd.DataFrame(data)

In [None]:
df

In [None]:
matched_countries

In [None]:
# Summary statistics for the two matched-name columns.
df[['City', 'Country']].describe()

In [None]:
# Rows whose matched city is exactly 'Of', shown with their headlines.
of_cities = df[df.City == 'Of'][['City', 'Headline']]

In [None]:
of_cities

In [None]:
city_to_name

In [None]:
def get_cities_in_headline(headline):
    """List every known city that appears capitalised in a headline.

    A city counts as present when at least one match of its pattern in
    `city_to_name` starts with an upper-case character in the headline.

    Args:
        headline: Headline text to scan.

    Returns:
        Alphabetically sorted list of the distinct matching city names
        (empty when none match).
    """
    cities_in_headline = set()
    for regex, name in city_to_name.items():
        # Inspect every occurrence: a lower-case hit earlier in the headline
        # must not hide a capitalised mention later on.
        if any(headline[match.start()].isupper() for match in regex.finditer(headline)):
            cities_in_headline.add(name)
    # Sorting removes the set's arbitrary iteration order, so downstream
    # tie-breaking (e.g. max by length) is the same on every run.
    return sorted(cities_in_headline)

In [None]:
df['Headline'].apply(get_cities_in_headline)

In [None]:
df['Cities'] = _

In [None]:
# Number of city names found in each headline.
df['Num_cities'] = df['Cities'].apply(len)

In [None]:
# Headlines in which more than one city name was found.
df_multiple_cities = df[df.Num_cities > 1]

In [None]:
df_multiple_cities

In [None]:
# len() gives the row count without binding `_`; assigning to `_` in a
# notebook stops IPython from updating its last-output variable.
num_rows = len(df_multiple_cities)
print(f"{num_rows} headlines match multiple cities")

In [None]:
def get_longest_city(cities):
    """Pick the longest name from a collection of city names.

    Args:
        cities: Collection of city-name strings; may be empty or None.

    Returns:
        The name with the most characters (the first such name on ties),
        or None when `cities` is empty or None.
    """
    if not cities:
        return None
    longest = max(cities, key=len)
    return longest
# Replace the first-pass City column with the longest name found per headline.
df['City'] = df['Cities'].apply(get_longest_city)

In [None]:
df

In [None]:
# Rows whose chosen city name has at most four characters.
short_cities = df[df.City.str.len() <= 4][['City', 'Headline']]

In [None]:
short_cities

In [None]:
# Rows where a country name was matched in the first pass.
df_countries = df[df.Country.notnull()][['City', 'Country', 'Headline']]

In [None]:
df_countries

In [None]:
# Rebind instead of mutating in place, and tolerate an already-dropped column,
# so re-running this cell does not raise KeyError.
df = df.drop(columns='Country', errors='ignore')

In [None]:
# Headlines for which no city was found (City is null).
df_unmatched = df[df.City.isnull()]
num_unmatched = len(df_unmatched)
print(f"{num_unmatched} headlines contain no city matches.")

In [None]:
# Print the first ten unmatched headlines as a raw array.
print(df_unmatched.head(10)[['Headline']].values)

In [None]:
# Keep only headlines with a city, and only the two columns still needed.
df = df[~df.City.isnull()][['City', 'Headline']]

In [None]:
df

In [None]:
latitudes, longitudes = [], []
for city_name in df.City.values:
    # Each lookup result is a dict whose first value is the city record;
    # unwrap the records, then keep the one with the largest population.
    city = max((list(match.values())[0] for match in gc.get_cities_by_name(city_name)),
               key=lambda record: record['population'])
    latitudes.append(city['latitude'])
    longitudes.append(city['longitude'])
# Attach the coordinates as two new columns on a fresh frame.
df = df.assign(Latitude=latitudes, Longitude=longitudes)

In [None]:
df

In [None]:
# (latitude, longitude) rows as a plain array for the clustering calls below.
coordinates = df[['Latitude', 'Longitude']].values

In [None]:
# Candidate cluster counts to evaluate: 1 through 9.
k_values = range(1, 10)

In [None]:
from sklearn.cluster import KMeans
inertia_values = []
for k in k_values:
    inertia_values.append(KMeans(n_clusters=k).fit(coordinates).inertia_)

In [None]:
plt.plot(range(1, 10), inertia_values)
plt.xlabel('K')
plt.ylabel('Inertia')

In [None]:
from cartopy.crs import PlateCarree

In [None]:
def plot_clusters(clusters, longitudes, latitudes):
    """Scatter points on a PlateCarree world map, coloured by cluster value.

    Args:
        clusters: Per-point values passed as the scatter colour argument `c`.
        longitudes: Horizontal positions of the points.
        latitudes: Vertical positions of the points.
    """
    plt.figure(figsize=(12, 10))
    world_map = plt.axes(projection=PlateCarree())
    world_map.coastlines()
    world_map.scatter(longitudes, latitudes, c=clusters)
    # Show the whole map extent rather than zooming to the plotted points.
    world_map.set_global()

In [None]:
df['Cluster'] = KMeans(n_clusters=3).fit_predict(coordinates)

In [None]:
plot_clusters(df.Cluster, df.Longitude, df.Latitude)

In [None]:
df['Cluster'] = KMeans(6).fit_predict(coordinates)

In [None]:
plot_clusters(df.Cluster, df.Longitude, df.Latitude)

In [None]:
from math import sin, cos, asin
def great_circle_distance(coord1, coord2, radius=3956):
    """Haversine distance between two (latitude, longitude) points in degrees.

    Args:
        coord1: (latitude, longitude) of the first point, in degrees.
        coord2: (latitude, longitude) of the second point, in degrees.
        radius: Sphere radius; the result is expressed in the same unit.
            The default 3956 is presumably Earth's radius in miles.

    Returns:
        The distance along the sphere's surface as a float (0.0 for equal
        inputs).
    """
    if np.array_equal(coord1, coord2):
        return 0.0
    coord1, coord2 = np.radians(coord1), np.radians(coord2)
    delta_lat, delta_lon = coord2 - coord1
    # Plain multiplication replaces np.product, which NumPy 2.0 removed.
    haversin = (sin(delta_lat / 2) ** 2
                + cos(coord1[0]) * cos(coord2[0]) * sin(delta_lon / 2) ** 2)
    # Rounding can push the value marginally above 1 for near-antipodal
    # points, which would make asin raise a domain error.
    haversin = min(1.0, haversin)
    return 2 * radius * asin(haversin ** 0.5)

In [None]:
from sklearn.cluster import DBSCAN
# Density-based clustering using the custom distance function as the metric.
# eps is in the unit great_circle_distance returns for its default radius.
metric = great_circle_distance
dbscan = DBSCAN(eps=250, min_samples=3, metric=metric)
df['Cluster'] = dbscan.fit_predict(coordinates)

In [None]:
# Drop rows labelled -1 (the label scikit-learn's DBSCAN gives noise points).
df_no_outliers = df[df.Cluster != -1]

In [None]:
plot_clusters(df_no_outliers.Cluster, df_no_outliers.Longitude, df_no_outliers.Latitude)

In [None]:
def get_country_code(city_name):
    """Return the country code of the most populous city with this name.

    Args:
        city_name: City name passed to `gc.get_cities_by_name`.

    Returns:
        The 'countrycode' field of the highest-population matching record.
    """
    # Take the first value of each returned dict as that match's city record.
    records = [list(match.values())[0] for match in gc.get_cities_by_name(city_name)]
    biggest = max(records, key=lambda record: record['population'])
    return biggest['countrycode']
# Tag every headline with the country code of its chosen city.
df['Country_code'] = df.City.apply(get_country_code)

In [None]:
df

In [None]:
# Split headlines into those with country code 'US' and all others.
df_us = df[df.Country_code == 'US']
df_not_us = df[df.Country_code != 'US']

In [None]:
df_us

In [None]:
def re_cluster(input_df, eps):
    """Cluster a frame's coordinates with DBSCAN and drop unclustered rows.

    Args:
        input_df: DataFrame with 'Latitude' and 'Longitude' columns.
        eps: DBSCAN neighbourhood radius, measured by `great_circle_distance`.

    Returns:
        A new DataFrame with a 'Cluster' column, keeping only rows whose
        label is greater than -1.
    """
    model = DBSCAN(eps=eps, min_samples=3, metric=great_circle_distance)
    labels = model.fit_predict(input_df[['Latitude', 'Longitude']].values)
    labelled = input_df.assign(Cluster=labels)
    return labelled[labelled.Cluster > -1]
df_not_us = re_cluster(df_not_us, 250)
df_us = re_cluster(df_us, 125)

In [None]:
groups = df_not_us.groupby('Cluster')
num_groups = len(groups)
print(f"{num_groups} Non-US clusters have been detected")

In [None]:
sorted_groups = sorted(groups, key=lambda x: len(x[1]),
reverse=True)
group_id, largest_group = sorted_groups[0]
group_size = len(largest_group)
print(f"Largest cluster contains {group_size} headlines")

In [None]:
def compute_centrality(group):
    """Add a 'Distance_to_center' column to `group`, in place.

    The center is the arithmetic mean of the group's (Latitude, Longitude)
    rows; each row's distance to it is measured with `great_circle_distance`.

    Args:
        group: DataFrame with 'Latitude' and 'Longitude' columns. It is
            modified in place; nothing is returned.
    """
    points = group[['Latitude', 'Longitude']].values
    centroid = points.mean(axis=0)
    group['Distance_to_center'] = [great_circle_distance(centroid, point)
                                   for point in points]

In [None]:
def sort_by_centrality(group):
    """Return `group` ordered from most to least central.

    Calls `compute_centrality` first, which adds 'Distance_to_center' to
    `group` in place, then sorts by that column in ascending order.

    Args:
        group: DataFrame accepted by `compute_centrality`.

    Returns:
        A sorted copy of `group` with the closest-to-center rows first.
    """
    compute_centrality(group)
    ordered = group.sort_values(by='Distance_to_center')
    return ordered

In [None]:
# Rank the largest cluster's headlines by distance to the cluster center and
# print the five closest ones.
largest_group = sort_by_centrality(largest_group)
for headline in largest_group.Headline.values[:5]:
    print(headline)

In [None]:
largest_group

In [None]:
from collections import Counter
def top_countries(group):
    """Return the three most frequent country names in a cluster.

    Args:
        group: DataFrame with a 'Country_code' column whose values are keys
            of `gc.get_countries()`.

    Returns:
        Up to three (country name, count) pairs, most common first.
    """
    country_info = gc.get_countries()
    names = (country_info[code]['name'] for code in group.Country_code.values)
    return Counter(names).most_common(3)

In [None]:
top_countries(largest_group)

In [None]:
for _, group in sorted_groups[1:5]:
    sorted_group = sort_by_centrality(group)
    print(top_countries(sorted_group))
    for headline in sorted_group.Headline.values[:5]:
        print(headline)
    print('\n')