## Read in Data


In [4]:
import pandas as pd

# Load the clustered headlines (the first CSV column becomes the index), then
# discard two columns that are not used in the cells shown here.
data = pd.read_csv("processed_headlines_clustered.csv", index_col=0)
data = data.drop(columns=["accented_city", "pop"])

# Preview the remaining columns in a reader-friendly order.
data.loc[:, ['headline', 'city', 'latitude', 'longitude', 'countrycode', 'cluster']].head()

Unnamed: 0,headline,city,latitude,longitude,countrycode,cluster
0,Zika Outbreak Hits Miami,Miami,25.77427,-80.19366,US,0
1,Could Zika Reach New York City?,New York City,40.71427,-74.00597,US,0
2,First Case of Zika in Miami Beach,Miami Beach,25.79065,-80.13005,US,0
3,"Mystery Virus Spreads in Recife, Brazil",Recife,-8.05389,-34.88111,BR,1
4,Dallas man comes down with case of Zika,Dallas,32.78306,-96.80667,US,0


In [5]:
# Summarize the frame: index range, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 605 entries, 0 to 646
Data columns (total 6 columns):
headline       605 non-null object
city           605 non-null object
countrycode    605 non-null object
latitude       605 non-null float64
longitude      605 non-null float64
cluster        605 non-null int64
dtypes: float64(2), int64(1), object(3)
memory usage: 33.1+ KB


In [6]:
# Partition the headlines on the country code. Each side is copied so that
# columns added to it later are not written back into `data`.
us_data = data.loc[data["countrycode"] == "US"].copy()
world_data = data.loc[data["countrycode"] != "US"].copy()

print(f"There are {len(us_data)} headlines in the US.")
print(f"There are {len(world_data)} headlines outside the US.")

There are 304 headlines in the US.
There are 301 headlines outside the US.


In [7]:
def cluster_location(clusterer, data, lat_string="latitude", lon_string="longitude"):
    """
    Cluster the rows of ``data`` by location and store the labels on the frame.

    The clusterer is fitted on a copy of the two coordinate columns. Its
    ``labels_`` attribute is then written to ``data["cluster"]`` in place
    (replacing any existing ``cluster`` column) and the same ``data`` object
    is returned.

    Parameters
    ----------
    clusterer : object with a ``fit(X)`` method that exposes ``labels_``
        once fitted.
    data : frame holding the coordinate columns.
    lat_string, lon_string : names of the latitude and longitude columns.
    """
    coordinate_columns = [lat_string, lon_string]
    # Fit on a copy so the clusterer never sees (or alters) the caller's frame.
    clusterer.fit(data.loc[:, coordinate_columns].copy())
    data["cluster"] = clusterer.labels_
    return data

# Re-Cluster the Data

Let's apply the DBSCAN clustering method to both datasets separately. This may require different parameters because the headline distributions differ: headlines in the United States are denser than those in the rest of the world.

In [8]:
from sklearn.cluster import DBSCAN

# eps is the neighbourhood radius in the units of the fitted features, which
# cluster_location takes from the latitude/longitude columns; min_samples is
# the neighbourhood size required for a core point. DBSCAN labels noise as -1.
us_clusterer = DBSCAN(eps=3, min_samples=10)
us_data = cluster_location(us_clusterer, us_data)
# Number of US headlines assigned to each cluster label.
us_data['cluster'].value_counts()

 1    74
 3    57
-1    51
 4    45
 0    41
 2    21
 5    10
 6     5
Name: cluster, dtype: int64

The rest of the world headlines are more spread out, so we will allow greater distance between headlines, setting eps to 10. We will also decrease the number of headlines needed in a cluster to 8.

In [9]:
# Looser settings than the US run: a larger neighbourhood radius (eps) and a
# smaller neighbourhood size (min_samples). Labels are assigned independently
# of the US run, so the same label number means different clusters in each.
world_clusterer = DBSCAN(eps=10, min_samples=8)
world_data = cluster_location(world_clusterer, world_data)
# Number of non-US headlines assigned to each cluster label (-1 is noise).
world_data['cluster'].value_counts()

-1    76
 1    60
 2    59
 3    53
 0    22
 5    11
 4    11
 6     9
Name: cluster, dtype: int64

# Visualizations

In [None]:
!conda install Basemap
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

plt.rcParams["font.size"] = 20

# %#matplotlib inline

# Map of only the continental US
us_m = Basemap(
    llcrnrlon=-119,
    llcrnrlat=22,
    urcrnrlon=-64,
    urcrnrlat=49,
    projection="lcc",
    lat_1=33,
    lat_2=45,
    lon_0=-95,
)

# Convert to map coordinates
us_data["x"], us_data["y"] = us_m(
    x=list(us_data["longitude"]), y=list(us_data["latitude"])
)

# Entire world
world_m = Basemap()

# Convert to map coordinates
world_data["x"], world_data["y"] = world_m(
    x=list(world_data["longitude"]), y=list(world_data["latitude"])
)

Solving environment: - 

In [None]:
# NOTE(review): plot_clusters is not defined in the cells shown here — confirm
# it is defined or imported earlier. The positional True is presumably the
# plot_us flag that later cells pass by keyword.
plot_clusters(us_data, True)

In [None]:
# NOTE(review): plot_clusters is not defined in the cells shown here — confirm
# it is defined or imported earlier. The positional False is presumably the
# plot_us flag, i.e. draw on the world map.
plot_clusters(world_data, False)

# Analyze World Clusters

## Cluster Centers

In [None]:
import math


def find_centroid(data):
    """
    Return the geographic centroid of a set of latitude/longitude points.

    Every point is mapped onto the unit sphere, the Cartesian coordinates are
    averaged, and the mean vector is converted back to an angle pair.

    Parameters
    ----------
    data : object whose ``"longitude"`` and ``"latitude"`` entries are
        iterables of degrees and whose ``len()`` is the number of points.

    Returns
    -------
    dict with keys ``latitude`` and ``longitude``, both in degrees.

    Based on https://stackoverflow.com/a/57346455/5755357
    """
    sum_x = 0
    sum_y = 0
    sum_z = 0

    for lon_deg, lat_deg in zip(data["longitude"], data["latitude"]):
        lon = math.radians(lon_deg)
        lat = math.radians(lat_deg)
        # Unit-sphere Cartesian coordinates of this point
        sum_x += math.cos(lat) * math.cos(lon)
        sum_y += math.cos(lat) * math.sin(lon)
        sum_z += math.sin(lat)

    # Mean position vector of all points
    count = len(data)
    mean_x = sum_x / count
    mean_y = sum_y / count
    mean_z = sum_z / count

    # Longitude from the projection onto the equatorial plane; latitude from
    # the height above that plane relative to the projected length.
    lon_rad = math.atan2(mean_y, mean_x)
    planar_length = math.sqrt(mean_x ** 2 + mean_y ** 2)
    lat_rad = math.atan2(mean_z, planar_length)

    return {
        "latitude": math.degrees(lat_rad),
        "longitude": math.degrees(lon_rad),
    }

We use this function by selecting the rows that belong to a specific cluster and passing those points to the function.

In [None]:
# Example: centroid (latitude/longitude in degrees) of world cluster 1.
find_centroid(world_data[world_data['cluster'] == 1])

## Extract Information about World Clusters

In [None]:
def analyze_clusters(data, data_map):
    """
    Summarize every cluster label present in ``data``.

    Rows are grouped on the ``cluster`` column, so every label in that column
    gets a summary (including -1 when it occurs).

    Parameters
    ----------
    data : frame with ``cluster``, ``longitude`` and ``latitude`` columns.
    data_map : callable taking ``(longitude, latitude)`` and returning the
        matching ``(x, y)`` map coordinates.

    Returns
    -------
    list of dicts, one per label, with keys ``cluster``, ``count``,
    ``longitude``, ``latitude``, ``x`` and ``y``. The coordinates describe
    the cluster centroid.
    """
    summaries = []

    for label, members in data.groupby("cluster"):
        # Geographic center of this cluster, in degrees
        center = find_centroid(members)
        center_lon = center["longitude"]
        center_lat = center["latitude"]

        # The same center expressed in the map's projection coordinates
        map_x, map_y = data_map(center_lon, center_lat)

        summaries.append(
            {
                "cluster": label,
                "count": len(members),
                "longitude": center_lon,
                "latitude": center_lat,
                "x": map_x,
                "y": map_y,
            }
        )

    return summaries


# One summary dict per world cluster label, with centers projected via world_m.
world_clusters = analyze_clusters(world_data, world_m)
# Show the first two summaries as a sanity check.
world_clusters[:2]

## Visualizing Cluster Centers

In [None]:
# Redraw the world headlines, passing the per-cluster summaries as well.
# NOTE(review): how plot_clusters uses cluster_centers is not visible here.
plot_clusters(world_data, cluster_centers=world_clusters)

In [None]:
# Attach each cluster's summary (count and center) to every headline in it.
# Columns present on both sides keep their name for the headline and get the
# "_cluster_center" suffix for the cluster summary.
# NOTE: this rebinds world_data, so the cell is not idempotent — re-running it
# merges onto the already-merged frame. Re-run from the clustering cell instead.
world_data = world_data.merge(
    pd.DataFrame(world_clusters), on=["cluster"], suffixes=["", "_cluster_center"]
)
world_data.head()

## Distance between Headline and Cluster Center

An accurate calculation of the distance on a globe requires using the Great Circle distance. We can use the Haversine formula to calculate the distance along the spherical Earth between each point and the center of the cluster to which it is assigned.


In [None]:
import numpy as np

def haversine_formula_two_arrays(
    longitude_one, latitude_one, longitude_two, latitude_two, radius=3956
):
    """
    Calculate the Great Circle distance between two points
    using the Haversine Formula. Latitude and Longitude are in degrees.

    The inputs may be scalars or array-likes (NumPy arrays, pandas Series);
    array-likes are processed element-wise.

    Parameters
    ----------
    longitude_one, latitude_one : coordinates of the first point(s), degrees.
    longitude_two, latitude_two : coordinates of the second point(s), degrees.
    radius : radius of the sphere. The result is expressed in the same unit;
        the default of 3956 gives distances on Earth in miles.

    Returns
    -------
    Distance(s) along the sphere's surface, in the unit of ``radius``.

    Source: https://stackoverflow.com/a/4913653/5755357
    """
    # Convert to radians
    longitude_one, latitude_one, longitude_two, latitude_two = map(
        np.radians, [longitude_one, latitude_one, longitude_two, latitude_two]
    )

    # Apply Haversine formula
    delta_longitude = longitude_two - longitude_one
    delta_latitude = latitude_two - latitude_one
    a = (
        np.sin(delta_latitude / 2) ** 2
        + np.cos(latitude_one) * np.cos(latitude_two) * np.sin(delta_longitude / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # nearly antipodal points), which would make arcsin return NaN.
    a = np.minimum(np.maximum(a, 0.0), 1.0)

    # Central angle in radians, then scale by the radius to get a distance
    central_angle = 2 * np.arcsin(np.sqrt(a))
    return central_angle * radius

In [None]:
# Great-circle distance from every headline to the center of its own cluster.
# With the function's built-in radius of 3956 the result is in miles.
world_data["distance"] = haversine_formula_two_arrays(
    world_data["longitude"],
    world_data["latitude"],
    world_data["longitude_cluster_center"],
    world_data["latitude_cluster_center"],
)
# Histogram over all world headlines, regardless of cluster.
_ = world_data["distance"].plot.hist(
    title="Distribution of Distance to Cluster Centers", figsize=(16, 10), edgecolor='k',
)
plt.xlabel('Distance (miles)');

### Distribution of Distance to Cluster Centers


In [None]:
import seaborn as sns

plt.figure(figsize=(16, 10))

# One density curve per cluster; clusters with fewer than 10 headlines are
# left out of the plot.
for cluster, grouped in world_data.groupby("cluster"):
    if len(grouped) >= 10:
        sns.kdeplot(grouped["distance"], label=f"{cluster}")

ax = plt.gca()
# The curves are labelled, but a legend is not guaranteed to be drawn for
# them automatically, so request one explicitly.
_ = ax.legend(title="Cluster")
_ = ax.set_title("Distribution of Distances from Cluster Centers")
_ = ax.set_xlabel("Distance (miles)")
_ = ax.set_ylabel("Density")

## Sort Clusters by Size and Distance to Center

In [None]:
# Largest clusters first; within a cluster, headlines closest to the center first.
world_data = world_data.sort_values(by=['count', 'distance'], ascending=[False, True])
world_data.head()

In [None]:
# After the sort above, the last rows are the farthest headlines of the smallest cluster.
world_data.tail()

# Look for Patterns in Headlines

For each cluster, we'll look at the top 5 headlines as measured by closest distance to the cluster center.

In [None]:
# For every cluster label, print its size and the five headlines with the
# smallest distance to the cluster center.
for cluster, grouped in world_data.groupby("cluster"):
    print("\nCluster", cluster)
    print("Number of headlines: ", len(grouped))
    print("Top 5 Headlines")
    print(grouped.nsmallest(5, "distance")["headline"].tolist())

## World Disease Outbreaks

We have identified several outbreaks! Based on the headlines, we can draw the following conclusions about the clusters:


* -1: No outbreak
* 0: Zika outbreak
* 1: Mad Cow outbreak
* 2: Zika outbreak
* 3: No outbreak
* 4: No outbreak
* 5: Zika outbreak
* 6: No outbreak


Next, we want to locate each outbreak. We'll do this by visualizing them on a map.

## Visualizing World Outbreaks

In [None]:
# World cluster label -> disease, for the clusters judged to be outbreaks.
world_outbreak_mapping = {0: "Zika", 1: "Mad Cow", 2: "Zika", 5: "Zika"}

# Plot only the headlines that belong to one of the outbreak clusters.
# NOTE(review): how plot_clusters uses label_outbreak is not visible here.
plot_clusters(
    world_data[world_data["cluster"].isin(world_outbreak_mapping.keys())],
    label_outbreak=world_outbreak_mapping,
)
ax = plt.gca()
_ = ax.set_title("Locations of World Headlines and Outbreaks")

We now can make the following observations:

* There is a Zika outbreak on the East Coast of South America
* There is a Zika outbreak in East Asia
* There is a Mad Cow outbreak in Europe
* There is a Zika outbreak in India

Let's look at the total number of headlines associated with each disease according to the clustering.

In [None]:
# Clusters missing from the mapping become NaN, which value_counts() omits by
# default, so only headlines in outbreak clusters are counted.
world_data["disease"] = world_data["cluster"].map(world_outbreak_mapping)
world_data["disease"].value_counts()

# Analysis of United States Outbreaks

In [None]:
# One summary dict per US cluster label, with centers projected via us_m.
us_clusters = analyze_clusters(us_data, us_m)
# Show the first two summaries as a sanity check.
us_clusters[:2]

In [None]:
# Redraw the US headlines, passing the per-cluster summaries as well.
# NOTE(review): how plot_clusters uses cluster_centers is not visible here.
plot_clusters(us_data, plot_us=True, cluster_centers=us_clusters)

In [None]:
# Attach each cluster's summary (count and center) to every headline in it.
# Columns present on both sides keep their name for the headline and get the
# "_cluster_center" suffix for the cluster summary.
# NOTE: this rebinds us_data, so the cell is not idempotent — re-running it
# merges onto the already-merged frame. Re-run from the clustering cell instead.
us_data = us_data.merge(
    pd.DataFrame(us_clusters), on=["cluster"], suffixes=["", "_cluster_center"]
)
us_data.head()

### Distance from Headlines to Cluster Centers

In [None]:
# Great-circle distance from every US headline to the center of its own
# cluster. With the function's built-in radius of 3956 the result is in miles.
us_data["distance"] = haversine_formula_two_arrays(
    us_data["longitude"],
    us_data["latitude"],
    us_data["longitude_cluster_center"],
    us_data["latitude_cluster_center"],
)

plt.figure(figsize=(16, 10))

# One density curve per cluster label
for cluster, grouped in us_data.groupby("cluster"):
    sns.kdeplot(grouped["distance"], label=f"{cluster}")

ax = plt.gca()
# The curves are labelled, but a legend is not guaranteed to be drawn for
# them automatically, so request one explicitly.
_ = ax.legend(title="Cluster")
_ = ax.set_title("Distribution of Distances from US Cluster Centers")
_ = ax.set_xlabel("Distance (miles)")
_ = ax.set_ylabel("Density")

In [None]:
# Largest clusters first; within a cluster, headlines closest to the center first.
us_data = us_data.sort_values(by=['count', 'distance'], ascending=[False, True])
us_data.head()

In [None]:
# After the sort above, the last rows are the farthest headlines of the smallest cluster.
us_data.tail()

## Examine Headlines from Each Cluster

In [None]:
# For every cluster label, print its size and the ten headlines with the
# smallest distance to the cluster center.
for cluster, grouped in us_data.groupby("cluster"):
    print("\nCluster", cluster)
    print("Number of headlines: ", len(grouped))
    print("Top 10 Headlines")
    print(grouped.nsmallest(10, "distance")["headline"].tolist())

In [None]:
There are only 2 clusters with repeated disease mentions in the United States. Both of them are associated with Zika.

* Cluster -1: No outbreak
* Cluster 0: Zika Outbreak
* Cluster 1: No outbreak
* Cluster 2: Zika Outbreak
* Cluster 3: No outbreak
* Cluster 4: No outbreak
* Cluster 5: No outbreak
* Cluster 6: No outbreak

Let's update the cluster map to show the United States outbreaks.

In [None]:
# US cluster label -> disease, for the clusters judged to be outbreaks.
us_outbreak_mapping = {0: "Zika", 2: "Zika"}

# Plot only the US headlines that belong to one of the outbreak clusters.
# NOTE(review): how plot_clusters uses label_outbreak is not visible here.
plot_clusters(
    us_data[us_data["cluster"].isin(us_outbreak_mapping.keys())],
    plot_us=True,
    label_outbreak=us_outbreak_mapping,
)
ax = plt.gca()
_ = ax.set_title("Locations of US Headlines and Outbreaks")

We can draw the following conclusions about the United States:

* A Zika outbreak in Texas
* A Zika outbreak in the Eastern South

Let's look at the value counts of US disease clusters.

In [None]:
# Clusters missing from the mapping become NaN, which value_counts() omits by
# default, so only headlines in outbreak clusters are counted.
us_data['disease'] = us_data['cluster'].map(us_outbreak_mapping)
us_data['disease'].value_counts()

# Final Plot of Worldwide Outbreak

First, we'll join together the US and world data, keeping only the headlines that mention Zika.

## Find All Zika Cases

Now that we have determined zika is the major disease outbreak, we can find all mentions using simple text matching. We can use Pandas string methods to find the headlines mentioning Zika and plot these.

In [None]:
# Stack the US and world frames into one.
# NOTE(review): the US and world cluster labels come from two separate DBSCAN
# fits, so after this concat a given label number (e.g. 0) refers to a US
# cluster in some rows and an unrelated world cluster in others. Filter on
# countrycode as well whenever a single cluster is meant — confirm intent.
zika_data = pd.concat([us_data, world_data])

# Keep only headlines matching "Zika" (case-insensitive; missing headlines are dropped)
zika_data = zika_data[zika_data["headline"].str.lower().str.contains('zika', na=False)]

In [None]:
# Split the Zika headlines by country code. The printed counts are numbers of
# headlines mentioning Zika, not confirmed case counts.
zika_world = zika_data[zika_data["countrycode"] != "US"]
print(f"There are {len(zika_world)} Zika cases around the world.")

zika_us = zika_data[zika_data["countrycode"] == "US"]
print(f"There are {len(zika_us)} Zika cases in the United States.")

In [None]:
# Zika headlines per world cluster label, largest count first
# (value_counts sorts descending by default).
_ = (
    zika_world["cluster"]
    .value_counts()
    .plot.bar(title="Zika Outbreak World Cluster Counts", figsize=(12, 8))
)

In [None]:
# Zika headlines per US cluster label, largest count first
# (value_counts sorts descending by default).
_ = (
    zika_us["cluster"]
    .value_counts()
    .plot.bar(title="Zika Outbreak US Cluster Counts", figsize=(12, 8))
)

In [None]:
# Recompute x/y with the world map: the concatenated frame mixes US rows
# projected with us_m and world rows projected with world_m.
zika_data["x"], zika_data["y"] = world_m(
    x=zika_data["longitude"], y=zika_data["latitude"]
)

# Leave out headlines labelled -1, i.e. those not assigned to any cluster.
plot_clusters(zika_data[zika_data['cluster'] != -1])
ax = plt.gca()
_ = ax.set_title("Locations of All Zika Headlines in a Cluster")

We can clearly see there are 4 major outbreaks:

* South-eastern United States and Central America
* Eastern South America
* India
* Oceania and Eastern Asia

These are the areas to which we should direct medical assistance!

In [None]:
import os

# Zika headlines whose cluster label is one of 0, 2, 3 or 5.
# NOTE(review): labels are matched across the US and world fits together, so
# each number selects both a US cluster and a world cluster — confirm intent.
major_clusters = zika_data[zika_data['cluster'].isin([0, 2, 3, 5])]
plot_clusters(major_clusters)
ax = plt.gca()
_ = ax.set_title("Locations of Major Zika Outbreaks");
# get_legend() returns None when the axes has no legend; only remove a legend
# that actually exists instead of failing with an AttributeError.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../figs', exist_ok=True)
plt.savefig('../figs/completed_map.png');

In [None]:
# US Zika headlines per cluster, restricted to labels 0, 2, 3 and 5.
zika_us[zika_us['cluster'].isin([0, 2, 3, 5])]['cluster'].value_counts()

In [None]:
# World Zika headlines per cluster, restricted to labels 0, 2, 3 and 5.
zika_world[zika_world['cluster'].isin([0, 2, 3, 5])]['cluster'].value_counts()

In [None]:
# Same plot as the earlier major-clusters figure, this time saved next to the
# notebook; a later cell reloads this file with IPython.display.Image.
plot_clusters(major_clusters)
ax = plt.gca()
_ = ax.set_title("Locations of Major Zika Outbreaks");
plt.savefig('final_outbreak_map.png')

### Counts of Zika Cases in Each Outbreak

* Mid-South United States and Central America: 106 headlines
* East Asia: 39 headlines
* Eastern South America: 18 headlines
* India: 9 headlines

In [None]:
# Plot a 10% random sample of the major-cluster headlines. The fixed
# random_state makes the sample, and therefore the figure, reproducible
# across runs.
plot_clusters(
    zika_data[zika_data['cluster'].isin([0, 2, 3, 5])].sample(frac=0.1, random_state=42)
)
ax = plt.gca()
plt.rcParams['font.size'] = 22
_ = ax.set_title("Sample of Headlines Mentioning Epidemic Disease X")

In [None]:
# Persist the results: every Zika headline, and the subset in the major clusters.
zika_data.to_csv("zika_outbreaks.csv")
major_clusters.to_csv('major_clusters.csv')

# Output

The end output from this section is a listing of the headlines in the major disease outbreaks along with a world map of the same headlines. The text information can be used for further analysis, and the map shows an overview of the issue (and would make for a good graphic in an article). We will place this map in the Executive Summary. 

In [None]:
# First ten Zika headlines from the major clusters, core columns only.
zika_data[zika_data['cluster'].isin([0, 2, 3, 5])][['headline', 'city', 'latitude', 'longitude', 'countrycode', 'cluster']].head(10)

In [None]:
from IPython.display import Image

# Display the map image written by the earlier savefig('final_outbreak_map.png') cell.
Image('final_outbreak_map.png')

In [None]:
# Last ten rows of the combined Zika frame (the world rows come last in the concat).
zika_data.tail(10)