In [1]:
import os
import re
from pprint import pprint
import pandas
import numpy
import matplotlib.pyplot as plt

# Cap the width of text columns in rendered frames at 50 characters.
pandas.options.display.max_colwidth = 50


In [2]:
# Entities table: tab-separated, with the column names on the first row.
dispatchFile = "entities.csv"
dispatch = pandas.read_csv(
    dispatchFile,
    sep="\t",
    header=0,
)

Let's make a minor modification: add a `month` column in which the day of every date is set to 01, so that all dates within the same month share a single value (we can use that column to aggregate our data by month).

In [3]:
# Parse the date column once, then derive the month bucket from the parsed
# values instead of regex-editing the strings row by row. This avoids the
# invalid "\d" escape in a non-raw pattern and keeps the cell re-runnable:
# nothing depends on `date` still being a string.
parsed_dates = pandas.to_datetime(dispatch["date"], format="%Y-%m-%d")

dispatch = dispatch.assign(
    date=parsed_dates,
    # first day of the month each date falls in (e.g. 1864-04-28 -> 1864-04-01)
    month=parsed_dates.dt.to_period("M").dt.to_timestamp(),
)

# reorder columns
dispatch = dispatch[["itemID", "month", "date", "itemType", "itemUnified", "itemId"]]

In [4]:
# Inspect the frame after adding `month` and reordering the columns.
dispatch

Unnamed: 0,itemID,month,date,itemType,itemUnified,itemId
0,1864-04-28_article_001,1864-04-01,1864-04-28,placename,"gordonsville, orange, virginia","tgn,2111971"
1,1864-04-28_article_002,1864-04-01,1864-04-28,placename,"plymouth, washington, north carolina","tgn,2076159"
2,1864-04-28_article_002,1864-04-01,1864-04-28,placename,"plymouth, washington, north carolina","tgn,2076159"
3,1864-04-28_article_002,1864-04-01,1864-04-28,persname,"wessels,brigadier-general,,,,","wessels,h.,w."
4,1864-04-28_article_002,1864-04-01,1864-04-28,persname,"lincoln,,,,,",lincoln
...,...,...,...,...,...,...
989392,1864-03-31_article_169,1864-03-01,1864-03-31,persname,"hunt,,chas,,,","hunt,chas"
989393,1864-03-31_article_170,1864-03-01,1864-03-31,persname,"davis,,,,,","davis,waddy"
989394,1864-03-31_article_170,1864-03-01,1864-03-31,placename,"albemarle, virginia, united states","tgn,2002137"
989395,1864-03-31_article_170,1864-03-01,1864-03-31,persname,"cook,,,,,",cook


We can easily count frequencies in the following manner:

In [5]:
# Keep only the rows whose entity type is a place name.
dispatch_place = dispatch.loc[dispatch["itemType"].eq("placename")]

In [6]:
# Look at just the month bucket and the identifier column of the place-name rows.
dispatch_place[["month", "itemId"]]

Unnamed: 0,month,itemId
0,1864-04-01,"tgn,2111971"
1,1864-04-01,"tgn,2076159"
2,1864-04-01,"tgn,2076159"
5,1864-04-01,"tgn,2076159"
7,1864-04-01,"tgn,2076159"
...,...,...
989380,1864-03-01,"tgn,7014404"
989381,1864-03-01,"tgn,7013981"
989384,1864-03-01,"tgn,7013964"
989385,1864-03-01,"tgn,7013964"


In [7]:
# Coordinates lookup table: tab-separated, with the column names on the first row.
tgnFile = "TGNOut_Coordinates.csv"
tgnData = pandas.read_csv(
    tgnFile,
    sep="\t",
    header=0,
)

In [8]:
# Preview the first ten rows of the coordinates table as loaded.
tgnData.head(10)

Unnamed: 0,TGN_ID,LAT,LON
0,"tgn,1000001",42.284495,-101.123047
1,"tgn,1000002",-3.55,-56.683
2,"tgn,1000003",56.2,15.016
3,"tgn,1000004",43.8,87.633
4,"tgn,1000006",-18.510219,139.367118
5,"tgn,1000007",-83.843,65.725
6,"tgn,1000009",17.05,-61.8
7,"tgn,1000046",-17.0,-65.0
8,"tgn,1000047",-10.0,-55.0
9,"tgn,1000048",-60.0,-45.0


In [9]:
# Rename by explicit mapping rather than by position, so each new label stays
# tied to its source column even if the column order of the file changes.
# "itemId" is the name of the identifier column in the dispatch table, which
# lets the two tables be joined on it.
tgnData = tgnData.rename(columns={"TGN_ID": "itemId", "LAT": "lat", "LON": "lon"})

tgnData.head(10)

Unnamed: 0,itemId,lat,lon
0,"tgn,1000001",42.284495,-101.123047
1,"tgn,1000002",-3.55,-56.683
2,"tgn,1000003",56.2,15.016
3,"tgn,1000004",43.8,87.633
4,"tgn,1000006",-18.510219,139.367118
5,"tgn,1000007",-83.843,65.725
6,"tgn,1000009",17.05,-61.8
7,"tgn,1000046",-17.0,-65.0
8,"tgn,1000047",-10.0,-55.0
9,"tgn,1000048",-60.0,-45.0


In [10]:
# Attach lat/lon to every place-name row; the inner join keeps only rows whose
# itemId is present in both tables.
merged = dispatch_place.merge(tgnData, how="inner", on="itemId")

In [11]:
# Inspect the joined frame (place-name rows plus their lat/lon columns).
merged

Unnamed: 0,itemID,month,date,itemType,itemUnified,itemId,lat,lon
0,1864-04-28_article_001,1864-04-01,1864-04-28,placename,"gordonsville, orange, virginia","tgn,2111971",38.1333,-78.1833
1,1864-06-22_orders_130,1864-06-01,1864-06-22,placename,"gordonsville, orange, virginia","tgn,2111971",38.1333,-78.1833
2,1862-07-25_article_013,1862-07-01,1862-07-25,placename,"gordonsville, orange, virginia","tgn,2111971",38.1333,-78.1833
3,1862-07-25_article_013,1862-07-01,1862-07-25,placename,"gordonsville, orange, virginia","tgn,2111971",38.1333,-78.1833
4,1862-07-17_article_003,1862-07-01,1862-07-17,placename,"gordonsville, orange, virginia","tgn,2111971",38.1333,-78.1833
...,...,...,...,...,...,...,...,...
348604,1863-06-04_article_019,1863-06-01,1863-06-04,placename,"seneca, new york, united states","tgn,1002882",42.7833,-76.8500
348605,1862-08-25_article_010,1862-08-01,1862-08-25,placename,"mallard creek, lawrence, alabama","tgn,2489504",34.5833,-87.2167
348606,1862-08-25_article_010,1862-08-01,1862-08-25,placename,"wards mill creek, tuscaloosa, alabama","tgn,2751076",33.3667,-87.6833
348607,1864-11-18_article_029,1864-11-01,1864-11-18,placename,"gwinnett, georgia, united states","tgn,2000358",33.9500,-84.0500


In [12]:
# Order the rows chronologically by month bucket.
merged = merged.sort_values(by="month", ascending=True)

In [13]:
# Inspect the frame after sorting by month; the row labels are no longer in order.
merged

Unnamed: 0,itemID,month,date,itemType,itemUnified,itemId,lat,lon
144156,1860-11-22_advert_330,1860-11-01,1860-11-22,placename,"henrico, virginia","tgn,1002467",37.5500,-77.3667
218225,1860-11-02_advert_423,1860-11-01,1860-11-02,placename,"ashland, hanover, virginia","tgn,2110477",37.7500,-77.4667
218226,1860-11-02_advert_554,1860-11-01,1860-11-02,placename,"ashland, hanover, virginia","tgn,2110477",37.7500,-77.4667
218227,1860-11-02_advert_561,1860-11-01,1860-11-02,placename,"ashland, hanover, virginia","tgn,2110477",37.7500,-77.4667
218228,1860-11-02_advert_561,1860-11-01,1860-11-02,placename,"ashland, hanover, virginia","tgn,2110477",37.7500,-77.4667
...,...,...,...,...,...,...,...,...
109407,1865-12-16_article_138,1865-12-01,1865-12-16,placename,"washington, district of columbia, united states","tgn,7013962",38.8833,-77.0333
109408,1865-12-16_article_138,1865-12-01,1865-12-16,placename,"washington, district of columbia, united states","tgn,7013962",38.8833,-77.0333
109409,1865-12-16_article_138,1865-12-01,1865-12-16,placename,"washington, district of columbia, united states","tgn,7013962",38.8833,-77.0333
109401,1865-12-16_article_129,1865-12-01,1865-12-16,placename,"washington, district of columbia, united states","tgn,7013962",38.8833,-77.0333


In [14]:
# Keep the four columns needed downstream, then renumber the rows; the previous
# row labels are kept as an "index" column by reset_index().
merged = (
    merged.loc[:, ["month", "itemUnified", "lat", "lon"]]
    .reset_index()
)

In [15]:
# Inspect the trimmed frame; the old row labels now appear as the "index" column.
merged

Unnamed: 0,index,month,itemUnified,lat,lon
0,144156,1860-11-01,"henrico, virginia",37.5500,-77.3667
1,218225,1860-11-01,"ashland, hanover, virginia",37.7500,-77.4667
2,218226,1860-11-01,"ashland, hanover, virginia",37.7500,-77.4667
3,218227,1860-11-01,"ashland, hanover, virginia",37.7500,-77.4667
4,218228,1860-11-01,"ashland, hanover, virginia",37.7500,-77.4667
...,...,...,...,...,...
348604,109407,1865-12-01,"washington, district of columbia, united states",38.8833,-77.0333
348605,109408,1865-12-01,"washington, district of columbia, united states",38.8833,-77.0333
348606,109409,1865-12-01,"washington, district of columbia, united states",38.8833,-77.0333
348607,109401,1865-12-01,"washington, district of columbia, united states",38.8833,-77.0333


In [16]:
# Select the four analysis columns again, which leaves out the "index" column.
merged = merged.loc[:, ["month", "itemUnified", "lat", "lon"]]

In [17]:
# Group on every column so that identical rows collapse into one; the number of
# collapsed rows per group is reported in a "size" column.
merged = merged.groupby(by=list(merged.columns), as_index=False).size()

In [18]:
# Inspect the aggregated frame: one row per distinct column combination plus its "size".
merged

Unnamed: 0,month,itemUnified,lat,lon,size
0,1860-11-01,"aberdeen, aberdeen, scotland",57.133333,-2.1000,1
1,1860-11-01,"abingdon, washington, virginia",36.700000,-81.9667,1
2,1860-11-01,"accomac, accomack, virginia",37.716700,-75.6500,10
3,1860-11-01,"acme, dickinson, kansas",38.833300,-97.2500,9
4,1860-11-01,"adams valley, la crosse, wisconsin",43.966700,-91.0000,37
...,...,...,...,...,...
47396,1865-12-01,"york, virginia, united states",37.233300,-76.5500,2
47397,1865-12-01,"york, virginia, united states",37.516700,-76.7833,1
47398,1865-12-01,"york, york, pennsylvania",39.950000,-76.7167,2
47399,1865-12-01,yucatan,20.833333,-89.0000,1


In [28]:
# Turn the month bucket into a "YYYY-MM" text label.
# Normalising through to_datetime first keeps the cell re-runnable: on a second
# run the column already holds strings, on which the .dt accessor would raise.
merged['month'] = pandas.to_datetime(merged['month']).dt.strftime('%Y-%m')

In [30]:
# Inspect the final frame; `month` is now a "YYYY-MM" string.
merged

Unnamed: 0,month,itemUnified,lat,lon,size
0,1860-11,"aberdeen, aberdeen, scotland",57.133333,-2.1000,1
1,1860-11,"abingdon, washington, virginia",36.700000,-81.9667,1
2,1860-11,"accomac, accomack, virginia",37.716700,-75.6500,10
3,1860-11,"acme, dickinson, kansas",38.833300,-97.2500,9
4,1860-11,"adams valley, la crosse, wisconsin",43.966700,-91.0000,37
...,...,...,...,...,...
47396,1865-12,"york, virginia, united states",37.233300,-76.5500,2
47397,1865-12,"york, virginia, united states",37.516700,-76.7833,1
47398,1865-12,"york, york, pennsylvania",39.950000,-76.7167,2
47399,1865-12,yucatan,20.833333,-89.0000,1
