## Import Libraries

In [28]:
import pandas as pd
from geonamescache import GeonamesCache
from unidecode import unidecode
import re

## Import and Examine Headlines

In [29]:
file = open('data/headlines.txt', 'r')
for i in range(10):
    print(file.readline())
file.close() 

Zika Outbreak Hits Miami

Could Zika Reach New York City?

First Case of Zika in Miami Beach

Mystery Virus Spreads in Recife, Brazil

Dallas man comes down with case of Zika

Trinidad confirms first Zika case

Zika Concerns are Spreading in Houston

Geneve Scientists Battle to Find Cure

The CDC in Atlanta is Growing Worried

Zika Infested Monkeys in Sao Paulo



In [30]:
data = pd.read_table('data/headlines.txt', header=None, names=['headlines'])

In [31]:
data.shape

(650, 1)

In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650 entries, 0 to 649
Data columns (total 1 columns):
headlines    650 non-null object
dtypes: object(1)
memory usage: 5.2+ KB


In [33]:
data.describe()

Unnamed: 0,headlines
count,650
unique,648
top,Spanish Flu Spreading through Madrid
freq,2


In [34]:
data.head(10)

Unnamed: 0,headlines
0,Zika Outbreak Hits Miami
1,Could Zika Reach New York City?
2,First Case of Zika in Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas man comes down with case of Zika
5,Trinidad confirms first Zika case
6,Zika Concerns are Spreading in Houston
7,Geneve Scientists Battle to Find Cure
8,The CDC in Atlanta is Growing Worried
9,Zika Infested Monkeys in Sao Paulo


In [35]:
data.tail(10)

Unnamed: 0,headlines
640,Authorities are Worried about the Spread of Va...
641,More Zika patients reported in Fort Worth
642,Zika symptoms spotted in Boynton Beach
643,Outbreak of Zika in Portoviejo
644,Influenza Exposure in Muscat
645,Rumors about Rabies spreading in Jerusalem hav...
646,More Zika patients reported in Indang
647,Suva authorities confirmed the spread of Rotav...
648,More Zika patients reported in Bella Vista
649,Zika Outbreak in Wichita Falls


In [36]:
data = data[data.duplicated() == False] #remove duplicates
data.describe()

Unnamed: 0,headlines
count,648
unique,648
top,Lawrenceville is infested with Dengue
freq,1


## Get City and Country Names

In [37]:
gc = GeonamesCache()
cities = gc.get_cities()
list(cities.items())[0]

('3041563',
 {'geonameid': 3041563,
  'name': 'Andorra la Vella',
  'latitude': 42.50779,
  'longitude': 1.52109,
  'countrycode': 'AD',
  'population': 20430,
  'timezone': 'Europe/Andorra',
  'admin1code': '07'})

In [38]:
accented_names=pd.DataFrame(columns = ['key', 'city'])
for city in cities.values():
    if city['name'] != unidecode(city['name']):
        accented_names = accented_names.append({'key':city['geonameid'], 'city':city['name']}, ignore_index=True)
        city.update({'accented_name' : city['name']})
        city.update({'name' : unidecode(city['name'])}) 

In [39]:
accented_names.shape

(4904, 2)

In [40]:
headlines = [unidecode(line) for line in data['headlines']]

In [41]:
countries = gc.get_countries()

## Extract Headlines

In [42]:
results = pd.DataFrame(columns = ['headline', 'city', 'country'])

In [43]:
for key in list(cities.keys()):
    for j in range(len(headlines)):
        name = cities[key]['name']
        match = re.search(name, headlines[j])
        if match != None:
            results = results.append({"headline":headlines[j], 'city':name,"country":countries[cities[key]['countrycode']]['name']}, ignore_index = True)
results.shape

(1502, 3)

In [44]:
results.describe()

Unnamed: 0,headline,city,country
count,1502,1502,1502
unique,624,709,118
top,Lower Hospitalization in Richmond after Mumps ...,Viru,United States
freq,10,44,601


In [45]:
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1502 entries, 0 to 1501
Data columns (total 3 columns):
headline    1502 non-null object
city        1502 non-null object
country     1502 non-null object
dtypes: object(3)
memory usage: 35.3+ KB


## Clean Results

In [46]:
results.groupby('city').count().sort_values(by = 'headline', axis=0, ascending = False).head(30)

Unnamed: 0_level_0,headline,country
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Viru,44,44
Ho,33,33
Vac,24,24
Pati,23,23
San,22,22
Mala,21,21
Wil,20,20
Man,20,20
Hit,19,19
Monroe,16,16


In [47]:
results.groupby('headline').count().sort_values(by = 'city', axis=0, ascending = False).head(30)

Unnamed: 0_level_0,city,country
headline,Unnamed: 1_level_1,Unnamed: 2_level_1
Lower Hospitalization in Richmond after Mumps Vaccine becomes Mandatory,10,10
Spike of Pneumonia Cases in Springfield,9,9
Rhinovirus Comes to San Jose,8,8
Jacksonville man hit by Zika,8,8
Rumors about West Nile Virus Spreading in Salem have been Refuted,8,8
Zika spreads to San Luis Potosi,8,8
Lower Hospitalization in Monroe after Hepatitis D Vaccine becomes Mandatory,8,8
Lower Hospitalization in Lakewood after Hepatitis B Vaccine becomes Mandatory,8,8
Zika Virus Reaches San Francisco,8,8
Rumors about Hepatitis D Spreading in San Juan Capistrano have been Refuted,7,7


In [48]:
rows_to_drop = []
for headline in results['headline']:
    test = results[results['headline']==headline]
    s = test.city.str.len().sort_values().index
    test = test.reindex(s)
    if len(test.index) > 2:
        for i in range(len(test.index)-1):
            for j in range(1,len(test.index)):
                match = re.match(test['city'].loc[test.index[i]],test['city'].loc[test.index[j]])
                if match != None:
                    match2 = re.match(test['city'].loc[test.index[j]],test['city'].loc[test.index[i]])
                    if match2 == None:
                        rows_to_drop.append(test.index[i])
                elif match == None:
                    rows_to_drop.append(test.index[i])
    elif len(test.index) == 2:
        match = re.match(test['city'].loc[test.index[0]],test['city'].loc[test.index[1]])
        if match != None:
            match2 = re.match(test['city'].loc[test.index[1]],test['city'].loc[test.index[0]])
            if match2 == None:
                rows_to_drop.append(test.index[0])
        elif match == None:
            rows_to_drop.append(test.index[0])

In [49]:
results2 = results.drop(rows_to_drop, axis = 0)

In [50]:
results2.shape

(883, 3)

In [51]:
results2.groupby('city').count().sort_values(by = 'headline', axis=0, ascending = False).head(10)

Unnamed: 0_level_0,headline,country
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Richmond,8,8
Rochester,8,8
Fairfield,8,8
Cambridge,8,8
Springfield,8,8
Monroe,7,7
Birmingham,6,6
Madrid,6,6
Fresno,6,6
Greenville,6,6


In [52]:
results2.groupby('headline').count().sort_values(by = 'city', axis=0, ascending = False).head(10)

Unnamed: 0_level_0,city,country
headline,Unnamed: 1_level_1,Unnamed: 2_level_1
Spike of Pneumonia Cases in Springfield,8,8
Authorities are Worried about the Spread of Chickenpox in Richmond,7,7
Will Hepatitis B vaccine help La Paz?,6,6
Case of Varicella Reported in Concord,5,5
Tests negative on Windsor Zika carriers,5,5
San Juan reports 1st U.S. Zika-related death amid outbreak,5,5
Hepatitis E re-emerges in Santa Rosa,5,5
Rumors about Hepatitis D spreading in Albany have been refuted,5,5
Hepatitis B has not Left Florence,5,5
Madison lab developing vaccine against Zika virus [The Wisconsin State Journal],5,5


## Save Results

In [53]:
results2.to_csv('Checkpoint01.csv')