# **Heatmaps of News Sentiment**

### Setup dependencies.

In [1]:
import pandas as pd
import re
import datetime
import calendar
from random import seed
from random import random

# Import cleaned news dataset
path1 = "files/headlines.csv"
df = pd.read_csv(path1, converters={"keywords": eval})
df.head()

Unnamed: 0,abstract,document_type,headline,keywords,lead_paragraph,news_desk,pub_date,section_name,snippet,type_of_material,word_count
0,Farhad Manjoo picks four products from 2014 th...,article,"Standouts in Tech: Drones, Virtual Reality, In...","[{'name': 'organizations', 'value': 'Oculus VR...",LOTS of cool new technology products come out ...,Business,2015-01-01,Technology,Farhad Manjoo picks four products from 2014 th...,News,824
1,Representative Steve Scalise’s effort to expla...,article,Much of David Duke’s ’91 Campaign Is Now in Lo...,"[{'name': 'persons', 'value': 'Alford, Jeremy'...","BATON ROUGE, La. — David Duke seems a figure f...",National,2015-01-01,U.S.,Representative Steve Scalise’s effort to expla...,News,1293
2,Minimum wage increases go into effect in 20 st...,article,"States’ Minimum Wages Rise, Helping Millions o...","[{'name': 'subject', 'value': 'Minimum Wage', ...","For some low-wage workers, everyday tasks like...",Business,2015-01-01,Business Day,Minimum wage increases go into effect in 20 st...,News,1017
3,A new job title — chief of laboratory safety —...,article,New C.D.C. Job Overseeing Laboratory Safety,"[{'name': 'persons', 'value': 'McNeil, Donald ...",A new job title — chief of laboratory safety —...,National,2015-01-01,Health,A new job title — chief of laboratory safety —...,Brief,129
4,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",article,Massachusetts: New Effort to Move Bombings Trial,"[{'name': 'subject', 'value': 'Boston Marathon...","Lawyers for Dzhokhar Tsarnaev, the defendant i...",National,2015-01-01,U.S.,"Lawyers for Dzhokhar Tsarnaev, the defendant i...",Brief,145


### Import list of 50 US states and their codes.

In [2]:
path2 = "files/us_states50.csv"
df_states = pd.read_csv(path2)
df_states

Unnamed: 0,state,code
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA
5,Colorado,CO
6,Connecticut,CT
7,Delaware,DE
8,Florida,FL
9,Georgia,GA


### Define news id ('nid').

In [3]:
df['nid'] = range(1, len(df) + 1)
df = df[['nid', 'keywords', 'pub_date']]
df.head()

Unnamed: 0,nid,keywords,pub_date
0,1,"[{'name': 'organizations', 'value': 'Oculus VR...",2015-01-01
1,2,"[{'name': 'persons', 'value': 'Alford, Jeremy'...",2015-01-01
2,3,"[{'name': 'subject', 'value': 'Minimum Wage', ...",2015-01-01
3,4,"[{'name': 'persons', 'value': 'McNeil, Donald ...",2015-01-01
4,5,"[{'name': 'subject', 'value': 'Boston Marathon...",2015-01-01


### Extract news locations.

In [4]:
# Convert 'keywords' to list, and loop through rows to obtain values where name=glocations.
keywords = df["keywords"].to_list()

# Create dictionary for glocations
glocations = {}

for i, e in enumerate(keywords):
    nid = i+1
    for idx, dict in enumerate(e):
        if dict["name"] == "glocations":
            glocations[nid] = dict["value"]
            
# print(glocations)

In [5]:
# Create dataframe from dict of glocations, then reformat dataframe.

df_location = pd.DataFrame.from_dict(glocations, orient='index')
df_location['nid'] = df_location.index
df_location = df_location.reset_index(drop=True)
df_location = df_location.rename(columns={df_location.columns[0]:'glocation'})

df_location = df_location[['nid', 'glocation']]
df_location

Unnamed: 0,nid,glocation
0,2,Louisiana
1,3,United States
2,5,Boston (Mass)
3,6,Killeen (Tex)
4,11,"East Village (Manhattan, NY)"
...,...,...
16283,50563,Denver (Colo)
16284,50564,Acadia National Park (Me)
16285,50570,San Francisco (Calif)
16286,50571,New York State


### Get publication date details for news locations.

In [6]:
# Join pub_date for each news based on nid.
df_location = pd.merge(df_location, df, how="left", on=['nid'])
df_location = df_location[['nid', 'glocation', 'pub_date']]
df_location

Unnamed: 0,nid,glocation,pub_date
0,2,Louisiana,2015-01-01
1,3,United States,2015-01-01
2,5,Boston (Mass),2015-01-01
3,6,Killeen (Tex),2015-01-01
4,11,"East Village (Manhattan, NY)",2015-01-01
...,...,...,...
16283,50563,Denver (Colo),2017-12-31
16284,50564,Acadia National Park (Me),2017-12-31
16285,50570,San Francisco (Calif),2017-12-31
16286,50571,New York State,2017-12-31


In [7]:
# Extract year, month and day of week data from publication date. 

df_location['year'] = " "
df_location['month'] = " "
df_location['weekday'] = " "
df_location = df_location[['nid', 'glocation', 'pub_date', 'year', 'month', 'weekday']]

for i in range(len(df_location)):
    pub_date = str(df_location.iloc[i, 2])
    
    pub_weekday_num = datetime.datetime.strptime(pub_date,'%Y-%m-%d').weekday()
    df_location.iloc[i, 5] = calendar.day_name[pub_weekday_num]
    
    pub_month_num = int(pub_date[6:7])
    df_location.iloc[i, 4] = calendar.month_name[pub_month_num]
    
    df_location.iloc[i, 3] = pub_date[:4]

df_location

Unnamed: 0,nid,glocation,pub_date,year,month,weekday
0,2,Louisiana,2015-01-01,2015,January,Thursday
1,3,United States,2015-01-01,2015,January,Thursday
2,5,Boston (Mass),2015-01-01,2015,January,Thursday
3,6,Killeen (Tex),2015-01-01,2015,January,Thursday
4,11,"East Village (Manhattan, NY)",2015-01-01,2015,January,Thursday
...,...,...,...,...,...,...
16283,50563,Denver (Colo),2017-12-31,2017,February,Sunday
16284,50564,Acadia National Park (Me),2017-12-31,2017,February,Sunday
16285,50570,San Francisco (Calif),2017-12-31,2017,February,Sunday
16286,50571,New York State,2017-12-31,2017,February,Sunday


### Determine State for news location (only US data considered).

In [8]:
# Add column for US state based on glocation.
df_location['state'] = " "
df_location = df_location[['nid', 'glocation', 'state', 'pub_date', 'year', 'month', 'weekday']]

# Extract state names within parentheses in glocation.
paran = ["("]
for i in range(len(df_location)):
    str1 = df_location.iloc[i, 1] 
    for e in str1:  
        if e in paran: 
            mymatch = re.search(r'\((.+)\)', str1) 
            str2 = mymatch.group(0)
            str2 = str2[1:-1]
            for j in range(len(df_states)):
                state = df_states.iloc[j, 0]
                code = df_states.iloc[j, 1]
                if re.search(code, str2):
                    df_location.iloc[i, 2] = code
                    break              
                elif re.search(str2, state):
                    df_location.iloc[i, 2] = code
                    break
                    
df_location

Unnamed: 0,nid,glocation,state,pub_date,year,month,weekday
0,2,Louisiana,,2015-01-01,2015,January,Thursday
1,3,United States,,2015-01-01,2015,January,Thursday
2,5,Boston (Mass),MA,2015-01-01,2015,January,Thursday
3,6,Killeen (Tex),TX,2015-01-01,2015,January,Thursday
4,11,"East Village (Manhattan, NY)",NY,2015-01-01,2015,January,Thursday
...,...,...,...,...,...,...,...
16283,50563,Denver (Colo),CO,2017-12-31,2017,February,Sunday
16284,50564,Acadia National Park (Me),NM,2017-12-31,2017,February,Sunday
16285,50570,San Francisco (Calif),CA,2017-12-31,2017,February,Sunday
16286,50571,New York State,,2017-12-31,2017,February,Sunday


In [9]:
# Extract state names not included in parentheses.

for i in range(len(df_location)):
    str1 = df_location.iloc[i, 1]
    if df_location.iloc[i, 2]:
        continue
    else:
        for j in range(len(df_states)):
            code = df_states.iloc[j, 1]
            state = df_states.iloc[j, 0]
            state1 = state.split()[0]
            if state1 == state:
                if re.search(state, str1):
                    df_location.iloc[i, 2] = code
                elif re.search(code, str1):
                    df_location.iloc[i, 2] = code
                break
            else:    
                state2 = state.split()[1]
                if re.search(state1, str1):
                    if re.search(state2, str1):
                        df_location.iloc[i, 2] = code
                elif re.search(code, str1):
                    df_location.iloc[i, 2] = code

df_location

Unnamed: 0,nid,glocation,state,pub_date,year,month,weekday
0,2,Louisiana,,2015-01-01,2015,January,Thursday
1,3,United States,,2015-01-01,2015,January,Thursday
2,5,Boston (Mass),MA,2015-01-01,2015,January,Thursday
3,6,Killeen (Tex),TX,2015-01-01,2015,January,Thursday
4,11,"East Village (Manhattan, NY)",NY,2015-01-01,2015,January,Thursday
...,...,...,...,...,...,...,...
16283,50563,Denver (Colo),CO,2017-12-31,2017,February,Sunday
16284,50564,Acadia National Park (Me),NM,2017-12-31,2017,February,Sunday
16285,50570,San Francisco (Calif),CA,2017-12-31,2017,February,Sunday
16286,50571,New York State,,2017-12-31,2017,February,Sunday


### Location data availability

In [10]:
state_count = df_location['state'].value_counts()
print(state_count)

      12534
CA      823
NY      344
TX      338
IL      260
MA      164
MI      161
SC      159
MT      154
OH      133
NV      112
WA      102
CO       95
TN       81
NC       74
OR       73
MN       72
AZ       57
WI       53
AL       52
NM       50
IN       44
NJ       36
OK       32
IA       31
MS       30
UT       28
CT       27
NH       22
KS       20
ND       16
RI       15
HI       13
NE       13
AR       11
VA       11
AK       10
LA        9
ID        9
DE        7
WY        6
PA        3
SD        3
MO        1
Name: state, dtype: int64


### Generate fake sentiment scores for testing.

In [16]:
# Add column for sentiment scores.
df_location['sentiment_score'] = " "
df_location = df_location[['nid', 'glocation', 'state', 'pub_date', 'year', 'month', 'weekday', 'sentiment_score']]

# generate random floating point values
# seed random number generator
seed
# generate random numbers between 0-1
for i in range(len(df_location)):
    value = "{:.2f}".format(random())
    df_location.iloc[i, 7] = value
    
df_location

Unnamed: 0,nid,glocation,state,pub_date,year,month,weekday,sentiment_score
0,2,Louisiana,,2015-01-01,2015,January,Thursday,0.12
1,3,United States,,2015-01-01,2015,January,Thursday,0.52
2,5,Boston (Mass),MA,2015-01-01,2015,January,Thursday,0.32
3,6,Killeen (Tex),TX,2015-01-01,2015,January,Thursday,0.31
4,11,"East Village (Manhattan, NY)",NY,2015-01-01,2015,January,Thursday,0.21
...,...,...,...,...,...,...,...,...
16283,50563,Denver (Colo),CO,2017-12-31,2017,February,Sunday,0.01
16284,50564,Acadia National Park (Me),NM,2017-12-31,2017,February,Sunday,0.91
16285,50570,San Francisco (Calif),CA,2017-12-31,2017,February,Sunday,0.60
16286,50571,New York State,,2017-12-31,2017,February,Sunday,0.23
