# Introduction to Literary Geocoding with HTRC Analytics & Folium

In [None]:
import pandas as pd

## Import NER data from HTRC Analytics

In [None]:
# Location of the sample CSV of named entities; read straight into a DataFrame.
entities = "https://raw.githubusercontent.com/kaylendwyer/text-analysis-workshops/main/data/aa-fic-entities-sample.csv"
df = pd.read_csv(filepath_or_buffer=entities)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,vol_id,page_seq,entity,type
88480,1075594,1075619,mdp.39015030113503,299,He,PERSON
99304,1583814,1583863,uc1.$b399374,180,Anderson,PERSON
15026,2565704,2565819,uc1.32106016182757,131,her,PERSON
66503,613277,613295,mdp.39015002185265,60,her,PERSON
13998,2212506,2212601,uc1.$b401343,298,his,PERSON
...,...,...,...,...,...,...
33503,44446,44447,mdp.39015015209623,320,his,PERSON
19566,2233009,2233104,uc1.$b401346,539,Nazarene,MISC
55538,1482442,1482491,hvd.32044021096433,352,Legree,PERSON
15487,159175,159176,mdp.39015014891447,133,her,PERSON


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,vol_id,page_seq,entity,type
99531,1515731,1515780,nc01.ark:/13960/t9t16484h,82,Carteret,LOCATION
48651,2438106,2438214,uc1.b4368522,102,Bonn,LOCATION
13467,2744066,2744190,hvd.32044019565514,182,New England,LOCATION
4884,214727,214729,osu.32435055405872,91,Paris,LOCATION
69827,1392835,1392884,mdp.39015004128289,230,Fifth Avenue,LOCATION


## Geocode locations

In [None]:
from geopy.geocoders import Nominatim
# Nominatim client used by the geocoding helpers below.
# Replace the placeholder user agent with your own email address.
geolocator = Nominatim(
    user_agent="your-email@test.test",
)

Location(Underground Railroad, Hinesburg, Chittenden County, Vermont, United States, (44.2994485, -73.0334429, 0.0))

In [None]:
# Keep only the rows whose entity type is LOCATION. The copy makes
# `df_locations` independent of `df`, so columns added later stay off `df`.
df_locations = df[df['type'].eq('LOCATION')].copy()
df_locations

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,vol_id,page_seq,entity,type
99531,1515731,1515780,nc01.ark:/13960/t9t16484h,82,Carteret,LOCATION
48651,2438106,2438214,uc1.b4368522,102,Bonn,LOCATION
13467,2744066,2744190,hvd.32044019565514,182,New England,LOCATION
4884,214727,214729,osu.32435055405872,91,Paris,LOCATION
69827,1392835,1392884,mdp.39015004128289,230,Fifth Avenue,LOCATION


In [None]:
# Geocoding helpers
#
# Latitude and longitude share one lookup per place name: results are cached,
# so a name that occurs many times (or is asked for both coordinates) is sent
# to the geocoding service only once.

_GEOCODE_CACHE = {}


def _geocode(address):
    """Return the geocoder's result for `address`, or None if the lookup fails.

    Completed lookups are cached by address, including a None result
    (per the geopy documentation, `geocode` returns None when nothing
    matches). Lookups that raise are not cached, so a later call retries them.
    """
    try:
        if address not in _GEOCODE_CACHE:
            _GEOCODE_CACHE[address] = geolocator.geocode(address)
        return _GEOCODE_CACHE[address]
    except Exception:
        # Best-effort: a failed lookup yields a missing coordinate rather than
        # aborting the whole column. Catching Exception (not a bare `except:`)
        # still lets KeyboardInterrupt stop a long-running geocoding cell.
        return None


def get_latitude(x):
    """Return the latitude for place name `x`, or None if it cannot be geocoded."""
    return getattr(_geocode(x), "latitude", None)


def get_longitude(x):
    """Return the longitude for place name `x`, or None if it cannot be geocoded."""
    return getattr(_geocode(x), "longitude", None)

# Geocode every extracted place name; names that cannot be resolved are left
# without a coordinate.
df_locations['lat'] = df_locations['entity'].map(get_latitude)
df_locations['lng'] = df_locations['entity'].map(get_longitude)

In [None]:
# Keep only the rows that have both a latitude and a longitude.
df_geocoded = df_locations[df_locations[['lat', 'lng']].notna().all(axis=1)]
df_geocoded

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,vol_id,page_seq,entity,type,lat,lng
99531,1515731,1515780,nc01.ark:/13960/t9t16484h,82,Carteret,LOCATION,40.577327,-74.2282
48651,2438106,2438214,uc1.b4368522,102,Bonn,LOCATION,50.735851,7.10066
13467,2744066,2744190,hvd.32044019565514,182,New England,LOCATION,44.205708,-70.753784
4884,214727,214729,osu.32435055405872,91,Paris,LOCATION,48.85889,2.320041
69827,1392835,1392884,mdp.39015004128289,230,Fifth Avenue,LOCATION,18.829012,99.013142


In [None]:
# Count how often each place name occurs and store it as a float `amount`
# column (the same numbers `df_geocoded['entity'].value_counts()` would
# report, repeated on every row of that entity).
# `assign` builds a new frame instead of writing a column into the filtered
# `df_geocoded`, which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
df_geocoded = df_geocoded.assign(
    amount=df_geocoded.groupby('entity')['entity'].transform('count').astype('float')
)
df_geocoded

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,vol_id,page_seq,entity,type,lat,lng,amount
99531,1515731,1515780,nc01.ark:/13960/t9t16484h,82,Carteret,LOCATION,40.577327,-74.2282,1.0
48651,2438106,2438214,uc1.b4368522,102,Bonn,LOCATION,50.735851,7.10066,1.0
13467,2744066,2744190,hvd.32044019565514,182,New England,LOCATION,44.205708,-70.753784,1.0
4884,214727,214729,osu.32435055405872,91,Paris,LOCATION,48.85889,2.320041,1.0
69827,1392835,1392884,mdp.39015004128289,230,Fifth Avenue,LOCATION,18.829012,99.013142,1.0


## Generating a heatmap using Folium

In [None]:
## Import modules
import folium
from folium import plugins
from folium.plugins import HeatMap
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Base map that will hold the heat layer.
hmap = folium.Map(prefer_canvas=True)

# One (latitude, longitude, weight) triple per geocoded row; the weight is the
# per-entity occurrence count stored in `amount`.
hm = HeatMap(
    list(
        zip(
            df_geocoded['lat'].values,
            df_geocoded['lng'].values,
            df_geocoded['amount'].values,
        )
    ),
    min_opacity=0.2,
    radius=10,
    blur=15,
    max_zoom=1,
)

# Attach the layer; as the last expression, the result is what the cell shows.
hmap.add_child(hm)

## Heatmap from a larger sample of geocoded locations

In [None]:
# Pre-geocoded CSV covering a larger sample of the corpus.
geocoded = "https://raw.githubusercontent.com/kaylendwyer/text-analysis-workshops/main/data/african-american-fic-geocoded.csv"
larger_sample = pd.read_csv(geocoded, encoding='utf-8')

# Fresh base map for the larger sample.
hmap = folium.Map(prefer_canvas=True)

# One (latitude, longitude, weight) triple per row.
# NOTE(review): `amount` is presumably the per-entity occurrence count, as in
# the smaller sample; confirm against the CSV.
hm = HeatMap(
    list(
        zip(
            larger_sample['lat'].values,
            larger_sample['lng'].values,
            larger_sample['amount'].values,
        )
    ),
    min_opacity=0.1,
    radius=10,
    blur=17,
    max_zoom=1,
)

# Attach the layer; as the last expression, the result is what the cell shows.
hmap.add_child(hm)

## In groups, discuss:
- How can this visualization help us study the literature in our corpus?
- How would we go about doing a "close reading" of the texts?
- How might this map be incomplete or incorrect?