In [2]:
import os
import re
from pprint import pprint
import pandas
import numpy
import matplotlib.pyplot as plt

# Cap displayed cell text at 50 characters so long text columns are truncated in frame previews.
pandas.set_option("display.max_colwidth", 50)


In [3]:
# Tab-separated entity table; the first row of the file supplies the column names.
dispatchFile = "entities.csv"
dispatch = pandas.read_csv(
    dispatchFile,
    header=0,
    sep="\t",
)

Let's make a minor modification: add a `month` column in which the day of every date is set to the 1st (we can use that column to aggregate our data by month).

In [4]:
# Parse the date column once. If the column is already datetime (e.g. the cell
# is re-run), to_datetime returns it unchanged, so the cell stays re-runnable.
dispatch["date"] = pandas.to_datetime(dispatch["date"], format="%Y-%m-%d")

# Month bucket: the first day of each date's month, as a datetime. Deriving it
# from the parsed dates avoids the per-row regex on stringified values (and the
# invalid "\d" escape that a non-raw pattern string would need).
dispatch["month"] = dispatch["date"].dt.to_period("M").dt.to_timestamp()

# reorder columns
dispatch = dispatch[["itemID", "month", "date", "itemType", "itemUnified", "itemId"]]

In [5]:
# Inspect the frame after the date parsing and column reordering.
dispatch

Unnamed: 0,itemID,month,date,itemType,itemUnified,itemId
0,1864-04-28_article_001,1864-04-01,1864-04-28,placename,"gordonsville, orange, virginia","tgn,2111971"
1,1864-04-28_article_002,1864-04-01,1864-04-28,placename,"plymouth, washington, north carolina","tgn,2076159"
2,1864-04-28_article_002,1864-04-01,1864-04-28,placename,"plymouth, washington, north carolina","tgn,2076159"
3,1864-04-28_article_002,1864-04-01,1864-04-28,persname,"wessels,brigadier-general,,,,","wessels,h.,w."
4,1864-04-28_article_002,1864-04-01,1864-04-28,persname,"lincoln,,,,,",lincoln
...,...,...,...,...,...,...
989392,1864-03-31_article_169,1864-03-01,1864-03-31,persname,"hunt,,chas,,,","hunt,chas"
989393,1864-03-31_article_170,1864-03-01,1864-03-31,persname,"davis,,,,,","davis,waddy"
989394,1864-03-31_article_170,1864-03-01,1864-03-31,placename,"albemarle, virginia, united states","tgn,2002137"
989395,1864-03-31_article_170,1864-03-01,1864-03-31,persname,"cook,,,,,",cook


We can easily count frequencies in the following manner. We start by keeping only the rows tagged as place names:

In [6]:
# Keep only the entities tagged as place names.
is_placename = dispatch["itemType"] == "placename"
dispatch_place = dispatch[is_placename]

In [16]:
# Peek at the month, `itemId` and `itemID` columns of the place-name rows.
dispatch_place[["month", "itemId", "itemID"]]

Unnamed: 0,month,itemId,itemID
0,1864-04-01,"tgn,2111971",1864-04-28_article_001
1,1864-04-01,"tgn,2076159",1864-04-28_article_002
2,1864-04-01,"tgn,2076159",1864-04-28_article_002
5,1864-04-01,"tgn,2076159",1864-04-28_article_002
7,1864-04-01,"tgn,2076159",1864-04-28_article_002
...,...,...,...
989380,1864-03-01,"tgn,7014404",1864-03-31_article_166
989381,1864-03-01,"tgn,7013981",1864-03-31_article_166
989384,1864-03-01,"tgn,7013964",1864-03-31_article_167
989385,1864-03-01,"tgn,7013964",1864-03-31_article_167


In [17]:
# Tab-separated coordinates table; the first row of the file supplies the column names.
tgnFile = "TGNOut_Coordinates.csv"
tgnData = pandas.read_csv(
    tgnFile,
    header=0,
    sep="\t",
)

In [18]:
# Preview the first ten rows of the coordinates table.
tgnData.head(10)

Unnamed: 0,TGN_ID,LAT,LON
0,"tgn,1000001",42.284495,-101.123047
1,"tgn,1000002",-3.55,-56.683
2,"tgn,1000003",56.2,15.016
3,"tgn,1000004",43.8,87.633
4,"tgn,1000006",-18.510219,139.367118
5,"tgn,1000007",-83.843,65.725
6,"tgn,1000009",17.05,-61.8
7,"tgn,1000046",-17.0,-65.0
8,"tgn,1000047",-10.0,-55.0
9,"tgn,1000048",-60.0,-45.0


In [19]:
# Relabel the columns positionally: the file's TGN_ID, LAT, LON become
# itemId, lat, lon, so the id column carries the same name as in the place table.
coordinate_columns = ["itemId", "lat", "lon"]
tgnData.columns = coordinate_columns

# Confirm the new labels on the first ten rows.
tgnData.head(10)

Unnamed: 0,itemId,lat,lon
0,"tgn,1000001",42.284495,-101.123047
1,"tgn,1000002",-3.55,-56.683
2,"tgn,1000003",56.2,15.016
3,"tgn,1000004",43.8,87.633
4,"tgn,1000006",-18.510219,139.367118
5,"tgn,1000007",-83.843,65.725
6,"tgn,1000009",17.05,-61.8
7,"tgn,1000046",-17.0,-65.0
8,"tgn,1000047",-10.0,-55.0
9,"tgn,1000048",-60.0,-45.0


In [20]:
# Attach coordinates to the place-name rows by matching on `itemId`.
# Inner join (the pandas default): rows whose id is absent from `tgnData` are dropped.
merged = dispatch_place.merge(tgnData, how="inner", on="itemId")

In [11]:
# Inspect the place-name rows, now carrying `lat`/`lon` from the coordinates table.
merged

Unnamed: 0,itemID,month,date,itemType,itemUnified,itemId,lat,lon
0,1864-04-28_article_001,1864-04-01,1864-04-28,placename,"gordonsville, orange, virginia","tgn,2111971",38.1333,-78.1833
1,1864-06-22_orders_130,1864-06-01,1864-06-22,placename,"gordonsville, orange, virginia","tgn,2111971",38.1333,-78.1833
2,1862-07-25_article_013,1862-07-01,1862-07-25,placename,"gordonsville, orange, virginia","tgn,2111971",38.1333,-78.1833
3,1862-07-25_article_013,1862-07-01,1862-07-25,placename,"gordonsville, orange, virginia","tgn,2111971",38.1333,-78.1833
4,1862-07-17_article_003,1862-07-01,1862-07-17,placename,"gordonsville, orange, virginia","tgn,2111971",38.1333,-78.1833
...,...,...,...,...,...,...,...,...
348604,1863-06-04_article_019,1863-06-01,1863-06-04,placename,"seneca, new york, united states","tgn,1002882",42.7833,-76.8500
348605,1862-08-25_article_010,1862-08-01,1862-08-25,placename,"mallard creek, lawrence, alabama","tgn,2489504",34.5833,-87.2167
348606,1862-08-25_article_010,1862-08-01,1862-08-25,placename,"wards mill creek, tuscaloosa, alabama","tgn,2751076",33.3667,-87.6833
348607,1864-11-18_article_029,1864-11-01,1864-11-18,placename,"gwinnett, georgia, united states","tgn,2000358",33.9500,-84.0500


In [22]:
# Tab-separated table of article texts; only the article id and the text are kept.
dispatchTextFile = "textList.csv"
text_columns = ["itemID", "text"]
dispatchText = pandas.read_csv(dispatchTextFile, header=0, sep="\t")[text_columns]

# Join each place mention with the text of its article. Note the key is
# `itemID` (the article id), not `itemId` (the entity id used for the coordinates).
place_text = merged.merge(dispatchText, how="inner", on="itemID")

In [28]:
# Keep the columns needed downstream, order the rows chronologically by month,
# then renumber them. reset_index() without drop keeps the pre-sort row labels
# in a new "index" column.
place_text = (
    place_text
    .loc[:, ["month", "itemUnified", "itemId", "lat", "lon", "text"]]
    .sort_values(by="month")
    .reset_index()
)

In [77]:
# Inspect the month-sorted frame of place mentions with their article text.
place_text

Unnamed: 0,index,month,itemUnified,itemId,lat,lon,text
0,221807,1860-11-01,"richmond, richmond, virginia","tgn,7013964",37.5500,-77.4500,the undersigned have formed a copartnership fo...
1,295981,1860-11-01,united states,"tgn,7012149",38.0000,-98.0000,"lawrence s. marye, attorney at law,;;; practic..."
2,306397,1860-11-01,"charleston, charleston, south carolina","tgn,7013582",32.7667,-79.9167,pretty locomotives.;;; --at the dredegar works...
3,295918,1860-11-01,bharat,"tgn,7000198",20.0000,77.0000,audacious burglary.;;; --about 3 o'clock yeste...
4,295917,1860-11-01,bharat,"tgn,7000198",20.0000,77.0000,encouragement to agriculture.;;; we have latel...
...,...,...,...,...,...,...,...
349447,9767,1865-12-01,"virginia, united states, north and central ame...","tgn,7007919",37.0000,-80.0000,a jamestown festival.;;; we never read the acc...
349448,9768,1865-12-01,united kingdom,"tgn,7002445",53.0000,-2.0000,a jamestown festival.;;; we never read the acc...
349449,9769,1865-12-01,"europe,","tgn,1000003",56.2000,15.0160,a jamestown festival.;;; we never read the acc...
349450,240435,1865-12-01,"richmond, richmond, virginia","tgn,7013964",37.5500,-77.4500,appointments.;;; --the following appointments ...


In [94]:
# Term to look for in the article text; it is matched literally and case-sensitively.
var = "sherman"

# Keep only the rows whose article text mentions the term.
#   regex=False -> `var` is plain text, so characters such as "." or "(" are not
#                  interpreted as regular-expression syntax;
#   na=False    -> rows with missing text count as non-matches instead of putting
#                  NaN into the boolean mask (which would make the indexing fail).
mentions_term = place_text["text"].str.contains(var, regex=False, na=False)
dispatch_search = place_text[mentions_term]
dispatch_search = dispatch_search.reset_index()

# Number of times the term occurs in each article. `str.count` always treats its
# pattern as a regular expression, so the term is escaped to keep it literal.
dispatch_search["occur"] = dispatch_search["text"].str.count(re.escape(var))

In [99]:
# Keep only the rows whose article mentions the term more than `min_mentions` times.
min_mentions = 3
sherman_place = dispatch_search[dispatch_search["occur"] > min_mentions]

In [109]:
# Keep the month, place name and coordinates, and renumber the rows from 0.
# drop=True discards the old row labels directly, replacing the earlier
# select -> reset_index() -> re-select sequence that existed only to remove them.
sherman_place = sherman_place[["month", "itemUnified", "lat", "lon"]].reset_index(drop=True)

In [110]:
# Inspect the filtered rows; identical (month, place, lat, lon) rows still appear once per mention.
sherman_place

Unnamed: 0,month,itemUnified,lat,lon
0,1860-12-01,florida,28.00,-82.0000
1,1860-12-01,alabama,33.00,-87.0000
2,1860-12-01,alabama,33.00,-87.0000
3,1860-12-01,alabama,33.00,-87.0000
4,1860-12-01,alabama,33.00,-87.0000
...,...,...,...,...
7234,1865-03-01,"philadelphia, philadelphia, pennsylvania",39.95,-75.1500
7235,1865-04-01,"fayetteville, cumberland, north carolina",35.05,-78.8667
7236,1865-04-01,"moscow, fayette, tennessee",35.05,-89.4000
7237,1865-04-01,"united states, north and central america,",38.00,-98.0000


In [112]:
# Count identical rows: group on every column and report each group's row count
# in a new "size" column (as_index=False keeps the grouping keys as regular columns).
all_columns = list(sherman_place.columns)
sherman_place = sherman_place.groupby(by=all_columns, as_index=False).size()

In [113]:
# Inspect the aggregate; "size" is the number of identical (month, place, lat, lon) rows.
sherman_place

Unnamed: 0,month,itemUnified,lat,lon,size
0,1860-12-01,alabama,33.0000,-87.0000,5
1,1860-12-01,florida,28.0000,-82.0000,1
2,1860-12-01,georgia,32.0000,-84.0000,2
3,1860-12-01,illinois,40.0000,-89.0000,3
4,1860-12-01,indiana,40.0000,-86.0000,1
...,...,...,...,...,...
2310,1865-03-01,"york, virginia, united states",37.5167,-76.7833,1
2311,1865-04-01,"fayetteville, cumberland, north carolina",35.0500,-78.8667,1
2312,1865-04-01,georgia,32.0000,-84.0000,1
2313,1865-04-01,"moscow, fayette, tennessee",35.0500,-89.4000,1


In [28]:
# Replace the datetime month with a "YYYY-MM" text label. After this the column
# holds strings, so the `.dt` accessor is no longer available on it.
month_labels = sherman_place["month"].dt.strftime("%Y-%m")
sherman_place["month"] = month_labels

In [30]:
# NOTE(review): the output recorded for this cell (47,400 rows, starting at 1860-11) does not
# match the 2,314-row aggregate shown earlier, and this cell's execution count (In [30]) is
# lower than those of the preceding cells. It appears to come from a different kernel state;
# re-run the notebook top to bottom to confirm.
sherman_place

Unnamed: 0,month,itemUnified,lat,lon,size
0,1860-11,"aberdeen, aberdeen, scotland",57.133333,-2.1000,1
1,1860-11,"abingdon, washington, virginia",36.700000,-81.9667,1
2,1860-11,"accomac, accomack, virginia",37.716700,-75.6500,10
3,1860-11,"acme, dickinson, kansas",38.833300,-97.2500,9
4,1860-11,"adams valley, la crosse, wisconsin",43.966700,-91.0000,37
...,...,...,...,...,...
47396,1865-12,"york, virginia, united states",37.233300,-76.5500,2
47397,1865-12,"york, virginia, united states",37.516700,-76.7833,1
47398,1865-12,"york, york, pennsylvania",39.950000,-76.7167,2
47399,1865-12,yucatan,20.833333,-89.0000,1
