# **Importing Dependencies**

In [None]:
!pip install geonamescache
!pip install unidecode
!pip install cartopy

In [None]:
from geonamescache import GeonamesCache
from unidecode import unidecode
import regex as re
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
gc = GeonamesCache()
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
import cartopy
from cartopy.crs import PlateCarree
import math

# **Loading Dataset**

In [None]:
with open('/content/headlines.txt', 'r') as files: # opening the loaded file
  headlines_text = files.readlines() # reading the text linewise and storing in a variable
print(headlines_text)
print("shape of headlines: ", len(headlines_text), "Type is: ",type(headlines_text))

['Zika Outbreak Hits Miami\n', 'Could Zika Reach New York City?\n', 'First Case of Zika in Miami Beach\n', 'Mystery Virus Spreads in Recife, Brazil\n', 'Dallas man comes down with case of Zika\n', 'Trinidad confirms first Zika case\n', 'Zika Concerns are Spreading in Houston\n', 'Geneve Scientists Battle to Find Cure\n', 'The CDC in Atlanta is Growing Worried\n', 'Zika Infested Monkeys in Sao Paulo\n', 'Brownsville teen contracts Zika virus\n', 'Mosquito control efforts in St. Louis take new tactics with Zika threat\n', 'San Juan reports 1st U.S. Zika-related death amid outbreak\n', 'Flu outbreak in Galveston, Texas\n', 'Zika alert – Manila now threatened\n', 'Zika afflicts 7 in Iloilo City\n', 'New Los Angeles Hairstyle goes Viral\n', 'Louisiana Zika cases up to 26\n', 'Orlando volunteers aid Zika research\n', 'Zika infects pregnant woman in Cebu\n', "Chicago's First Zika Case Confirmed\n", 'Tampa Bay Area Zika Case Count Climbs\n', 'Bad Water Leads to Sickness in Flint, Michigan\n', 

# **Removing '/n' and spaces at the end/start of every headline**





In [None]:
headlines = [line.strip() for line in headlines_text ]
print(headlines)

['Zika Outbreak Hits Miami',
 'Could Zika Reach New York City?',
 'First Case of Zika in Miami Beach',
 'Mystery Virus Spreads in Recife, Brazil',
 'Dallas man comes down with case of Zika',
 'Trinidad confirms first Zika case',
 'Zika Concerns are Spreading in Houston',
 'Geneve Scientists Battle to Find Cure',
 'The CDC in Atlanta is Growing Worried',
 'Zika Infested Monkeys in Sao Paulo',
 'Brownsville teen contracts Zika virus',
 'Mosquito control efforts in St. Louis take new tactics with Zika threat',
 'San Juan reports 1st U.S. Zika-related death amid outbreak',
 'Flu outbreak in Galveston, Texas',
 'Zika alert – Manila now threatened',
 'Zika afflicts 7 in Iloilo City',
 'New Los Angeles Hairstyle goes Viral',
 'Louisiana Zika cases up to 26',
 'Orlando volunteers aid Zika research',
 'Zika infects pregnant woman in Cebu',
 "Chicago's First Zika Case Confirmed",
 'Tampa Bay Area Zika Case Count Climbs',
 'Bad Water Leads to Sickness in Flint, Michigan',
 'Baltimore plans for Zi

# **Dataframe with column 'Headlines'**

In [None]:
df = pd.DataFrame()
df["Headlines"] = headlines
df

Unnamed: 0,Headlines
0,Zika Outbreak Hits Miami
1,Could Zika Reach New York City?
2,First Case of Zika in Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas man comes down with case of Zika
...,...
645,Rumors about Rabies spreading in Jerusalem hav...
646,More Zika patients reported in Indang
647,Suva authorities confirmed the spread of Rotav...
648,More Zika patients reported in Bella Vista


Finding Duplicate headlines

In [None]:
df.duplicated().value_counts()

False    647
True       3
dtype: int64

Dropping duplicate Headlines

In [None]:
df.drop_duplicates()

Unnamed: 0,Headlines
0,Zika Outbreak Hits Miami
1,Could Zika Reach New York City?
2,First Case of Zika in Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas man comes down with case of Zika
...,...
645,Rumors about Rabies spreading in Jerusalem hav...
646,More Zika patients reported in Indang
647,Suva authorities confirmed the spread of Rotav...
648,More Zika patients reported in Bella Vista


# **Cities name in geonamescache**

In [None]:
# Extracting information of all cities from geonamescache
cities  = gc.get_cities()

#geoname id are the keys of the resulted dictionary and each keys corresponds to a value dictionary
#'name', 'latitude' , 'longitude','country code', 'population','timezone','admin1code','alternatenames'
print(cities.keys())

# visualizing the information
cities['3040051']

dict_keys(['3040051', '3041563', '290594', '291074', '291580', '291696', '292223', '292231', '292239', '292672', '292688', '292878', '292913', '292932', '292953', '292968', '8057551', '12042052', '12042053', '12047416', '12047417', '1120985', '1121381', '1123004', '1125155', '1125444', '1125896', '1127110', '1127628', '1127768', '1128265', '1129516', '1129648', '1130490', '1131316', '1132495', '1133453', '1133574', '1133616', '1134720', '1135158', '1135689', '1136469', '1136575', '1136863', '1137168', '1137807', '1138336', '1138958', '1139715', '1139807', '1140026', '1141089', '1141269', '1141540', '1141857', '1142170', '1142264', '1142404', '1145352', '1147066', '1147242', '1147290', '1147540', '1148106', '1148205', '1148311', '1148658', '1429434', '1469706', '3576022', '3573374', '363243', '781988', '782661', '782756', '783148', '783263', '783493', '3183719', '3183875', '3184081', '3184517', '3184518', '3184862', '3184935', '3185012', '3185082', '3185211', '3185670', '3185672', '3185

{'geonameid': 3040051,
 'name': 'les Escaldes',
 'latitude': 42.50729,
 'longitude': 1.53414,
 'countrycode': 'AD',
 'population': 15853,
 'timezone': 'Europe/Andorra',
 'admin1code': '08',
 'alternatenames': ["Ehskal'des-Ehndzhordani",
  'Escaldes',
  'Escaldes-Engordany',
  'Les Escaldes',
  'esukarudesu=engorudani jiao qu',
  'lai sai si ka er de-en ge er da',
  'Эскальдес-Энджордани',
  'エスカルデス＝エンゴルダニ教区',
  '萊塞斯卡爾德-恩戈爾達',
  '萊塞斯卡爾德－恩戈爾達']}

cities name in geonamescache

In [None]:
# removing the accented marks from dictionaries name
cities_in_geonamescache= [unidecode(cities[keys]['name']) for keys in cities.keys()]

print(f"total {len(cities_in_geonamescache)} cities found in geonamescache")

total 26463 cities found in geonamescache


In [None]:
# Identify cities from the GeonamesCache cities (cities_in_geonamescache) that are mentioned in each headline.
for line in headlines:
  for i in range(len(cities_in_geonamescache)):
    check = r'\b{}\b'.format(cities_in_geonamescache[i])
    if re.search(re.compile(check), line):
      df.loc[df['Headlines'].str.contains(line), 'City'] = cities_in_geonamescache[i]
      # df.loc[df['Headlines'] == line,'City'] = cities_in_geonamescache[i]
      print(cities_in_geonamescache[i])

Miami
York
York
New York City
Miami
Miami Beach
Recife
Dallas
Dallas
Trinidad
Trinidad
Trinidad
Houston
Geneve
Atlanta
Sao Paulo
Brownsville
Brownsville
Brownsville
St. Louis
San Juan
San Juan
San
San Juan
San Juan
San Juan
San Juan
Galveston
Manila
Iloilo
Los Angeles
Los Angeles
Orlando
Chicago
Bay
Tampa
Flint
Flint
Baltimore
London
London
Ho
Ho Chi Minh City
Philadelphia
Boston
Boston
Paris
Paris
San Diego
San Diego
San
San Diego
Bangkok
Beijing
Salvador
Hit
Kuala Lumpur
Yangon
Tallahassee
San Francisco
San Francisco
San
San Francisco
San Francisco
San Francisco
San Francisco
Bethesda
Townsville
Mandaluyong City
Santa Rosa
Santa Rosa
Santa Rosa
Santa Rosa
Santa Rosa
Santa Rosa
Santa Rosa
Salvador
San
San Salvador
Cleveland
Cleveland
Austin
Austin
Piracicaba
Lima
Lima
Toronto
Bogota
Brisbane
Dakar
Havana
Key West
Vancouver
Vancouver
Seattle
Nashville
Croix
Saint Croix
Fort Collins
Klang
Guatemala City
Madison
Madison
Madison
Madison
Madison
Sarasota
Entebbe
Brasilia
Jacksonville
Jacks

In [None]:
df

Unnamed: 0,Headlines,City
0,Zika Outbreak Hits Miami,Miami
1,Could Zika Reach New York City?,New York City
2,First Case of Zika in Miami Beach,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Recife
4,Dallas man comes down with case of Zika,Dallas
...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,Jerusalem
646,More Zika patients reported in Indang,Indang
647,Suva authorities confirmed the spread of Rotav...,Suva
648,More Zika patients reported in Bella Vista,Vista


headlines with no cities

In [None]:
no_cities_headlines = df[df['City'].isna()]

# 42 headlines are found to donot have city name
print(len(no_cities_headlines))

In [None]:
df.info()

# **Dataframe with headline & cities**





In [None]:
# countries data extraction from geonames cache
countries = gc.get_countries_by_names()
print(countries)

# Removing accented marks from the extracted countries name
countries_name_in_geonamescache =[unidecode(keys) for keys in countries.keys()]

#  countries_name_in_geonamescache, 252 countries
print(f"total {len(countries_name_in_geonamescache)} country names are found in geonamescache")

{'Andorra': {'geonameid': 3041565, 'name': 'Andorra', 'iso': 'AD', 'iso3': 'AND', 'isonumeric': 20, 'fips': 'AN', 'continentcode': 'EU', 'capital': 'Andorra la Vella', 'areakm2': 468, 'population': 77006, 'tld': '.ad', 'currencycode': 'EUR', 'currencyname': 'Euro', 'phone': '376', 'postalcoderegex': '^(?:AD)*(\\d{3})$', 'languages': 'ca', 'neighbours': 'ES,FR'}, 'United Arab Emirates': {'geonameid': 290557, 'name': 'United Arab Emirates', 'iso': 'AE', 'iso3': 'ARE', 'isonumeric': 784, 'fips': 'AE', 'continentcode': 'AS', 'capital': 'Abu Dhabi', 'areakm2': 82880, 'population': 9630959, 'tld': '.ae', 'currencycode': 'AED', 'currencyname': 'Dirham', 'phone': '971', 'postalcoderegex': '', 'languages': 'ar-AE,fa,en,hi,ur', 'neighbours': 'SA,OM'}, 'Afghanistan': {'geonameid': 1149361, 'name': 'Afghanistan', 'iso': 'AF', 'iso3': 'AFG', 'isonumeric': 4, 'fips': 'AF', 'continentcode': 'AS', 'capital': 'Kabul', 'areakm2': 647500, 'population': 37172386, 'tld': '.af', 'currencycode': 'AFN', 'curr

# **Finding Countries in Headlines**

In [None]:
 # Forming a column named Country and assigned the matched country to the matched index
for line in headlines:
  for i in range(len(countries_name_in_geonamescache)):
    if countries_name_in_geonamescache[i] in line:
      df.loc[df['Headlines'] == line,'Country'] = countries_name_in_geonamescache[i]


# **Finding and deleting headlines that donot have country and city **

In [None]:
no_city_country_headlines = df[df['Country'].isna()]
print(f"{len(no_city_country_headlines)} headlines donot have countries")

590 headlines donot have countries


In [None]:
df.dropna(subset = ['City', 'Country'],how='all',inplace = True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 606 entries, 0 to 649
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Headlines  606 non-null    object
 1   City       606 non-null    object
 2   Country    16 non-null     object
dtypes: object(3)
memory usage: 18.9+ KB


In [None]:
# Forming A csv file
df.to_csv('df_headline_city_country.csv',index = False)
