# **Location Data Prep for Choropleth Maps**

### Set up dependencies.

In [1]:
import ast
import datetime

import pandas as pd

# Import cleaned news dataset.
# The 'keywords' column is stored as the text repr of a list of dicts.
# Parse it with ast.literal_eval, which only accepts Python literals,
# instead of eval, which would execute any expression found in the CSV.
path = "../data/files/headlines.csv"
df = pd.read_csv(path, converters={"keywords": ast.literal_eval})
df.head()

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
0,Farhad Manjoo picks four products from 2014 th...,article,"Standouts in Tech: Drones, Virtual Reality, In...","[{'name': 'organizations', 'value': 'Oculus VR...",LOTS of cool new technology products come out ...,Business,2015-01-01,Technology,Farhad Manjoo picks four products from 2014 th...,News,824
1,Representative Steve Scalise’s effort to expla...,article,Much of David Duke’s ’91 Campaign Is Now in Lo...,"[{'name': 'persons', 'value': 'Alford, Jeremy'...","BATON ROUGE, La. — David Duke seems a figure f...",National,2015-01-01,U.S.,Representative Steve Scalise’s effort to expla...,News,1293
2,Minimum wage increases go into effect in 20 st...,article,"States’ Minimum Wages Rise, Helping Millions o...","[{'name': 'subject', 'value': 'Minimum Wage', ...","For some low-wage workers, everyday tasks like...",Business,2015-01-01,Business Day,Minimum wage increases go into effect in 20 st...,News,1017
3,A new job title — chief of laboratory safety —...,article,New C.D.C. Job Overseeing Laboratory Safety,"[{'name': 'persons', 'value': 'McNeil, Donald ...",A new job title — chief of laboratory safety —...,National,2015-01-01,Health,A new job title — chief of laboratory safety —...,Brief,129
4,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",article,Massachusetts: New Effort to Move Bombings Trial,"[{'name': 'subject', 'value': 'Boston Marathon...","Lawyers for Dzhokhar Tsarnaev, the defendant i...",National,2015-01-01,U.S.,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",Brief,145


### Add 'nid' (news id) field

In [2]:
# Number the articles 1..N in row order and expose that number
# as the leading 'nid' column of the dataframe.
nid_list = list(range(1, len(df) + 1))
df.insert(loc=0, column='nid', value=nid_list)
df

Unnamed: 0,nid,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
0,1,Farhad Manjoo picks four products from 2014 th...,article,"Standouts in Tech: Drones, Virtual Reality, In...","[{'name': 'organizations', 'value': 'Oculus VR...",LOTS of cool new technology products come out ...,Business,2015-01-01,Technology,Farhad Manjoo picks four products from 2014 th...,News,824
1,2,Representative Steve Scalise’s effort to expla...,article,Much of David Duke’s ’91 Campaign Is Now in Lo...,"[{'name': 'persons', 'value': 'Alford, Jeremy'...","BATON ROUGE, La. — David Duke seems a figure f...",National,2015-01-01,U.S.,Representative Steve Scalise’s effort to expla...,News,1293
2,3,Minimum wage increases go into effect in 20 st...,article,"States’ Minimum Wages Rise, Helping Millions o...","[{'name': 'subject', 'value': 'Minimum Wage', ...","For some low-wage workers, everyday tasks like...",Business,2015-01-01,Business Day,Minimum wage increases go into effect in 20 st...,News,1017
3,4,A new job title — chief of laboratory safety —...,article,New C.D.C. Job Overseeing Laboratory Safety,"[{'name': 'persons', 'value': 'McNeil, Donald ...",A new job title — chief of laboratory safety —...,National,2015-01-01,Health,A new job title — chief of laboratory safety —...,Brief,129
4,5,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",article,Massachusetts: New Effort to Move Bombings Trial,"[{'name': 'subject', 'value': 'Boston Marathon...","Lawyers for Dzhokhar Tsarnaev, the defendant i...",National,2015-01-01,U.S.,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",Brief,145
...,...,...,...,...,...,...,...,...,...,...,...,...
50567,50568,The nonsensical catchphrase from a Bud Light a...,article,How Do You Turn an Ad Into a Meme? Two Words: ...,"[{'name': 'subject', 'value': 'ADVERTISING AND...","In 1995, a nation was rapt as three frogs croa...",Business,2017-12-31,Business Day,The nonsensical catchphrase from a Bud Light a...,News,762
50568,50569,The earthquake of sexual misconduct allegation...,article,"Partisans, Wielding Money, Begin Seeking to Ex...","[{'name': 'subject', 'value': 'United States P...",WASHINGTON — As the #MeToo movement to expose ...,National,2017-12-31,U.S.,The earthquake of sexual misconduct allegation...,News,1884
50569,50570,Artificial intelligence is making it possible ...,article,How Do You Vote? 50 Million Google Images Give...,"[{'name': 'subject', 'value': 'Artificial Inte...",What vehicle is most strongly associated with ...,Business,2017-12-31,Technology,Artificial intelligence is making it possible ...,News,1191
50570,50571,State officials are considering legal challeng...,article,Democrats in High-Tax States Plot to Blunt Imp...,"[{'name': 'subject', 'value': 'Tax Credits, De...","Democrats in high-cost, high-tax states are pl...",Business,2017-12-31,Business Day,State officials are considering legal challeng...,News,1480


In [3]:
# # Convert to CSV, and save cleaned data with 'nid'.

# df.to_csv("../data/files/headlines_with_nid.csv", index=False)

### Filter to get relevant data

In [4]:
# Keep only the columns needed to build the location table.
relevant_columns = ['nid', 'keywords', 'pub_date']
df = df[relevant_columns]
df

Unnamed: 0,nid,keywords,pub_date
0,1,"[{'name': 'organizations', 'value': 'Oculus VR...",2015-01-01
1,2,"[{'name': 'persons', 'value': 'Alford, Jeremy'...",2015-01-01
2,3,"[{'name': 'subject', 'value': 'Minimum Wage', ...",2015-01-01
3,4,"[{'name': 'persons', 'value': 'McNeil, Donald ...",2015-01-01
4,5,"[{'name': 'subject', 'value': 'Boston Marathon...",2015-01-01
...,...,...,...
50567,50568,"[{'name': 'subject', 'value': 'ADVERTISING AND...",2017-12-31
50568,50569,"[{'name': 'subject', 'value': 'United States P...",2017-12-31
50569,50570,"[{'name': 'subject', 'value': 'Artificial Inte...",2017-12-31
50570,50571,"[{'name': 'subject', 'value': 'Tax Credits, De...",2017-12-31


### Extract news locations.

In [5]:
# Walk each article's keyword list and pull out its 'glocations' value.

keywords = df["keywords"].to_list()

# Maps nid -> glocation value for every article that has one.
glocations = {}

# Counting rows from 1 mirrors how 'nid' is derived from row position,
# so the counter is used directly as the news id.
for nid, keyword_entries in enumerate(keywords, start=1):
    # The loop variable is deliberately not called `dict`: at notebook top
    # level that would shadow the builtin for every later cell.
    for keyword in keyword_entries:
        if keyword["name"] == "glocations":
            # An article may carry several glocations entries; each match
            # overwrites the previous one, so the last entry is kept.
            glocations[nid] = keyword["value"]

print(glocations)

{2: 'Louisiana', 3: 'United States', 5: 'Boston (Mass)', 6: 'Killeen (Tex)', 11: 'East Village (Manhattan, NY)', 12: 'Naples (Fla)', 13: 'Europe', 17: 'Japan', 18: 'Dagestan (Russia)', 21: 'Turkey', 23: 'Las Cruces (NM)', 24: 'Peachtree City (Ga)', 25: 'Boston (Mass)', 35: 'United States', 36: 'Europe', 37: 'Virginia', 39: 'Pennsylvania', 40: 'United States', 42: 'ANTARCTIC REGIONS', 55: 'New Zealand', 56: 'Europe', 59: 'Boston (Mass)', 60: 'Kentucky', 61: 'Michigan', 62: 'Colorado', 72: 'United States', 73: 'Boston (Mass)', 75: 'United States', 76: 'Florida', 89: 'Nebraska', 92: 'United States', 97: 'Louisiana', 99: 'Cape Canaveral (Fla)', 103: 'Vernon (Vt)', 105: 'Pennsylvania', 107: 'Las Vegas (Nev)', 108: 'Boston (Mass)', 111: 'United States', 112: 'France', 113: 'Gilbert (Ariz)', 128: 'Ferguson (Mo)', 129: 'Quebec Province (Canada)', 135: 'Indonesia', 136: 'Iceland', 138: 'Botswana', 140: 'Cambridge (Mass)', 143: 'Wisconsin', 145: 'Europe', 146: 'California', 151: 'ARGENTINA', 154

In [7]:
# Create a dataframe from the dictionary of glocations, with a placeholder
# 'pub_date' column (a single space) that is filled in by a later cell.

my_dict = glocations
df_location = pd.DataFrame(
    {
        'nid': list(my_dict.keys()),
        'glocation': list(my_dict.values()),
    }
)
df_location = df_location.assign(pub_date=" ")

df_location

Unnamed: 0,nid,glocation,pub_date
0,2,Louisiana,
1,3,United States,
2,5,Boston (Mass),
3,6,Killeen (Tex),
4,11,"East Village (Manhattan, NY)",
...,...,...,...
16283,50563,Denver (Colo),
16284,50564,Acadia National Park (Me),
16285,50570,San Francisco (Calif),
16286,50571,New York State,


In [7]:
# Attach each location row's publication date by looking its 'nid' up in df.
#
# A vectorized lookup replaces the former nested Python loop, which compared
# every location row against every article row and wrote through chained
# indexing (df_location['pub_date'][i] = ...), an assignment pandas does not
# guarantee reaches the original frame.

# Series of pub_date indexed by nid; 'nid' must be unique in df for map().
pub_date_by_nid = df.set_index('nid')['pub_date']

# A nid with no match in df would get NaN here rather than the " " placeholder.
df_location = df_location.assign(
    pub_date=df_location['nid'].map(pub_date_by_nid)
)

df_location = df_location[['nid', 'glocation', 'pub_date']]
df_location

Unnamed: 0,0
2,Louisiana
3,United States
5,Boston (Mass)
6,Killeen (Tex)
11,"East Village (Manhattan, NY)"
...,...
50563,Denver (Colo)
50564,Acadia National Park (Me)
50570,San Francisco (Calif)
50571,New York State


### Extract year, month and weekday from publication dates

In [6]:
# Extract year, month and day of week data from publication date.
#
# 'pub_date' is expected to hold 'YYYY-MM-DD' strings; it is parsed once for
# the whole column instead of row by row, and the original string column is
# left unchanged.
pub_dates = pd.to_datetime(df_location['pub_date'], format='%Y-%m-%d')

# assign() returns a new frame, so re-running this cell simply recomputes the
# three derived columns instead of depending on " " placeholder values.
df_location = df_location.assign(
    year=pub_dates.dt.year,              # integer year, e.g. 2015
    month=pub_dates.dt.strftime("%B"),   # full month name, e.g. "January"
    weekday=pub_dates.dt.strftime("%A"), # full weekday name, e.g. "Thursday"
)
df_location = df_location[['nid', 'glocation', 'pub_date', 'year', 'month', 'weekday']]

df_location

NameError: name 'df_location' is not defined

In [None]:
# Persist the location and date columns (keyed by 'nid') as a CSV file,
# leaving the dataframe index out of the output.

output_path = "static/data/choropleth_locations1_clean.csv"
df_location.to_csv(output_path, index=False)

In [14]:
# Number of distinct glocation values in the location table.
unique_glocation_count = df_location['glocation'].nunique()
print(unique_glocation_count)

1348
