# **Combining Location Data and Sentiment Scores for Geographic Heatmaps**

In [1]:
# Setup dependencies
import pandas as pd
import numpy as np
import pycountry
import us

# Load the cleaned, final locations dataset from CSV and display it
path = "static/data/geolocations_clean_final.csv"
df = pd.read_csv(filepath_or_buffer=path)
df

Unnamed: 0,nid,glocation,country,state,latitude,longitude,pub_date,year,month,weekday
0,2,Louisiana,United States,Louisiana,30.9842977,-91.96233269999999,2015-01-01,2015,January,Thursday
1,3,United States,United States,,37.09024,-95.712891,2015-01-01,2015,January,Thursday
2,5,Boston (Mass),United States,Massachusetts,42.3600825,-71.0588801,2015-01-01,2015,January,Thursday
3,6,Killeen (Tex),United States,Texas,31.1171194,-97.72779589999999,2015-01-01,2015,January,Thursday
4,11,"East Village (Manhattan, NY)",United States,New York,40.7264773,-73.98153370000001,2015-01-01,2015,January,Thursday
...,...,...,...,...,...,...,...,...,...,...
16283,50563,Denver (Colo),United States,Colorado,39.7392358,-104.990251,2015-11-13,2015,November,Friday
16284,50564,Acadia National Park (Me),United States,Maine,44.3385559,-68.2733346,2015-11-13,2015,November,Friday
16285,50570,San Francisco (Calif),United States,California,37.7749295,-122.4194155,2015-11-13,2015,November,Friday
16286,50571,New York State,United States,New York,43.2994285,-74.21793260000001,2015-11-13,2015,November,Friday


In [2]:
# Drop rows where country name is missing
# Empty / whitespace-only cells are first normalised to NaN so that dropna()
# treats them as missing as well. (np.nan is used because the np.NaN alias
# no longer exists in NumPy 2.0.)
df = df.replace(r'^\s*$', np.nan, regex=True)

# Keep only rows that have a country, then renumber the index from 0.
# The former row-by-row loop was removed: it only evaluated a comparison
# (`==`) and never assigned anything, so it had no effect on the data.
df_countries = df.dropna(subset=['country']).reset_index(drop=True)
df_countries

Unnamed: 0,nid,glocation,country,state,latitude,longitude,pub_date,year,month,weekday
0,2,Louisiana,United States,Louisiana,30.9842977,-91.96233269999999,2015-01-01,2015,January,Thursday
1,3,United States,United States,,37.09024,-95.712891,2015-01-01,2015,January,Thursday
2,5,Boston (Mass),United States,Massachusetts,42.3600825,-71.0588801,2015-01-01,2015,January,Thursday
3,6,Killeen (Tex),United States,Texas,31.1171194,-97.72779589999999,2015-01-01,2015,January,Thursday
4,11,"East Village (Manhattan, NY)",United States,New York,40.7264773,-73.98153370000001,2015-01-01,2015,January,Thursday
...,...,...,...,...,...,...,...,...,...,...
15625,50563,Denver (Colo),United States,Colorado,39.7392358,-104.990251,2015-11-13,2015,November,Friday
15626,50564,Acadia National Park (Me),United States,Maine,44.3385559,-68.2733346,2015-11-13,2015,November,Friday
15627,50570,San Francisco (Calif),United States,California,37.7749295,-122.4194155,2015-11-13,2015,November,Friday
15628,50571,New York State,United States,New York,43.2994285,-74.21793260000001,2015-11-13,2015,November,Friday


In [3]:
# Load the headline/article sentiment scores from CSV and display them
path_scores = "static/data/headlines_scores_keywords.csv"
df_scores = pd.read_csv(filepath_or_buffer=path_scores)
df_scores

Unnamed: 0,nid,headline,article,headline_score,article_score,pub_date,section_name,news_desk,organizations,persons,subject,glocations,creative_works,abs_headline_score,abs_article_score,Unnamed: 15,Unnamed: 16
0,1,"Standouts in Tech: Drones, Virtual Reality, In...",LOTS of cool new technology products come out ...,0,0.1655,1/1/2015,Technology,Business,"['Oculus VR Inc', 'Skype Technologies', 'DJI I...","['Manjoo, Farhad']","['Virtual Reality (Computers)', 'Computers and...",[],[],0,0.1655,,
1,2,Much of David Duke's '91 Campaign Is Now in Lo...,"BATON ROUGE, La. — David Duke seems a figure f...",0,0.128,1/1/2015,U.S.,National,[],"['Alford, Jeremy', 'Duke, David E', 'Scalise, ...","['Blacks', 'Black People', 'Race and Ethnicity...",['Louisiana'],[],0,0.128,,
2,3,"States' Minimum Wages Rise, Helping Millions o...","For some low-wage workers, everyday tasks like...",0.296,-0.0516,1/1/2015,Business Day,Business,[],[],"['Minimum Wage', 'States (US)']",['United States'],[],0.296,0.0516,,
3,4,New C.D.C. Job Overseeing Laboratory Safety,A new job title — chief of laboratory safety —...,0.4215,0.04162,1/1/2015,Health,National,['Centers for Disease Control and Prevention'],"['McNeil, Donald G Jr', 'Frieden, Thomas R']","['Ebola Virus', 'Laboratories and Scientific E...",[],[],0.4215,0.04162,,
4,5,Massachusetts: New Effort to Move Bombings Trial,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",0,-0.29145,1/1/2015,U.S.,National,[],"['Tsarnaev, Dzhokhar A']",['Boston Marathon Bombings (2013)'],['Boston (Mass)'],[],0,0.29145,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50568,50569,How Do You Turn an Ad Into a Meme? Two Words: ...,"In 1995, a nation was rapt as three frogs croa...",0,0.4404,12/31/2017,Business Day,Business,['Anheuser-Busch InBev NV'],[],"['ADVERTISING AND MARKETING', 'Beer']",[],[],0,0.4404,,
50569,50570,"Partisans, Wielding Money, Begin Seeking to Ex...",WASHINGTON — As the #MeToo movement to expose ...,-0.5994,-0.6124,12/31/2017,U.S.,National,[],"['Allred, Gloria', 'Bloom, Lisa (1961- )', 'Ce...","['United States Politics and Government', '#Me...",[],[],0.5994,0.6124,,
50570,50571,How Do You Vote? 50 Million Google Images Give...,What vehicle is most strongly associated with ...,0,0.3384,12/31/2017,Technology,Business,"['Google Street View', 'Stanford University']",[],"['Artificial Intelligence', 'Data-Mining and D...","['Casper (Wyo)', 'Burlington (Vt)', 'Chicago (...",[],0,0.3384,,
50571,50572,Democrats in High-Tax States Plot to Blunt Imp...,"Democrats in high-cost, high-tax states are pl...",0,0,12/31/2017,Business Day,Business,"['Democratic Party', 'Republican Party']",[],"['Tax Credits, Deductions and Exemptions', 'Ta...","['California', 'Connecticut', 'New Jersey', 'N...",[],0,0,,


In [4]:
# Add scores to locations dataframe
# Blank placeholder columns are created here so the score columns appear in
# the selected column order below.
df_countries['headline_score'] = " "
df_countries['article_score'] = " "

# Keep only the columns needed downstream. The explicit .copy() makes the
# result an independent frame, so assigning columns to it later does not
# raise SettingWithCopyWarning or risk writing to a temporary object.
df_countries = df_countries[
    ['nid', 'country', 'state', 'year', 'month', 'weekday', 'headline_score', 'article_score']
].copy()
df_countries

Unnamed: 0,nid,country,state,year,month,weekday,headline_score,article_score
0,2,United States,Louisiana,2015,January,Thursday,,
1,3,United States,,2015,January,Thursday,,
2,5,United States,Massachusetts,2015,January,Thursday,,
3,6,United States,Texas,2015,January,Thursday,,
4,11,United States,New York,2015,January,Thursday,,
...,...,...,...,...,...,...,...,...
15625,50563,United States,Colorado,2015,November,Friday,,
15626,50564,United States,Maine,2015,November,Friday,,
15627,50570,United States,California,2015,November,Friday,,
15628,50571,United States,New York,2015,November,Friday,,


In [5]:
# Fetch headline and article sentiment scores for countries dataset
# For each score column, build an {nid: score} lookup from df_scores and use
# it to fill the matching column of df_countries.
for score_col in ('headline_score', 'article_score'):
    nid_to_score = df_scores.set_index('nid')[score_col].to_dict()
    df_countries[score_col] = df_countries['nid'].map(nid_to_score)
df_countries.head()

Unnamed: 0,nid,country,state,year,month,weekday,headline_score,article_score
0,2,United States,Louisiana,2015,January,Thursday,0.0,0.128
1,3,United States,,2015,January,Thursday,0.296,-0.0516
2,5,United States,Massachusetts,2015,January,Thursday,0.0,-0.29145
3,6,United States,Texas,2015,January,Thursday,0.0,-0.6705
4,11,United States,New York,2015,January,Thursday,0.0,0.5859


In [6]:
# Add country code and state code columns to dataframe
# Both code columns start out as a single-space placeholder.
for code_col in ('country_ISO_code', 'US_state_code'):
    df_countries[code_col] = " "

# Final column layout, with each code column next to the name it encodes.
column_order = [
    'nid',
    'country', 'country_ISO_code',
    'state', 'US_state_code',
    'year', 'month', 'weekday',
    'headline_score', 'article_score',
]

# Select the columns and swap every NaN for a single space.
df = df_countries[column_order].replace(np.nan, ' ', regex=True)
df

Unnamed: 0,nid,country,country_ISO_code,state,US_state_code,year,month,weekday,headline_score,article_score
0,2,United States,,Louisiana,,2015,January,Thursday,0,0.128
1,3,United States,,,,2015,January,Thursday,0.296,-0.0516
2,5,United States,,Massachusetts,,2015,January,Thursday,0,-0.29145
3,6,United States,,Texas,,2015,January,Thursday,0,-0.6705
4,11,United States,,New York,,2015,January,Thursday,0,0.5859
...,...,...,...,...,...,...,...,...,...,...
15625,50563,United States,,Colorado,,2015,November,Friday,0,0.4939
15626,50564,United States,,Maine,,2015,November,Friday,-0.6369,-0.630833333
15627,50570,United States,,California,,2015,November,Friday,-0.5994,-0.6124
15628,50571,United States,,New York,,2015,November,Friday,0,0.3384


In [7]:
# Get ISO country codes from "pycountry" library
# Get US state codes from "us" library


def _country_iso3(name):
    """Return the alpha-3 code of the first fuzzy match for `name`.

    Best-effort: any failed lookup yields the blank placeholder " ".
    """
    try:
        return pycountry.countries.search_fuzzy(name)[0].alpha_3
    except Exception:  # narrower than a bare except: Ctrl-C still interrupts
        return " "


def _us_state_abbr(name):
    """Return the abbreviation `us.states.lookup` gives for `name`.

    Best-effort: any failed lookup (no match, blank value, ...) yields " ".
    """
    try:
        return us.states.lookup(name).abbr
    except Exception:  # narrower than a bare except: Ctrl-C still interrupts
        return " "


# Resolve each distinct name once instead of once per row; the lookups are
# the expensive part and the same few names repeat thousands of times.
country_codes = {name: _country_iso3(name) for name in df['country'].unique()}
state_codes = {name: _us_state_abbr(name) for name in df['state'].unique()}

# Fill both code columns in one vectorised step. This replaces the chained
# `df[col][i] = value` assignment, which pandas does not guarantee to write
# back to `df`, and which depended on the index being 0..n-1.
df = df.assign(
    country_ISO_code=df['country'].map(country_codes),
    US_state_code=df['state'].map(state_codes),
)

df

Unnamed: 0,nid,country,country_ISO_code,state,US_state_code,year,month,weekday,headline_score,article_score
0,2,United States,USA,Louisiana,LA,2015,January,Thursday,0,0.128
1,3,United States,USA,,,2015,January,Thursday,0.296,-0.0516
2,5,United States,USA,Massachusetts,MA,2015,January,Thursday,0,-0.29145
3,6,United States,USA,Texas,TX,2015,January,Thursday,0,-0.6705
4,11,United States,USA,New York,NY,2015,January,Thursday,0,0.5859
...,...,...,...,...,...,...,...,...,...,...
15625,50563,United States,USA,Colorado,CO,2015,November,Friday,0,0.4939
15626,50564,United States,USA,Maine,ME,2015,November,Friday,-0.6369,-0.630833333
15627,50570,United States,USA,California,CA,2015,November,Friday,-0.5994,-0.6124
15628,50571,United States,USA,New York,NY,2015,November,Friday,0,0.3384


In [8]:
# Save the news location data, together with sentiment scores and dates, as CSV
# (the DataFrame index is not written to the file)
df.to_csv(path_or_buf="static/data/geolocations_with_sentiment_scores.csv", index=False)