# Start of Data Transformation

Task: use Pandas to transform csv files into DataFrames that match desired tables for database schema

In [1]:
import pandas as pd

In [2]:
!ls SourceData/CorrelatesOfWar/

[34mCodebooks[m[m                    MID_Narratives_2002-2010.pdf
CowWarList.pdf               NMC_5_0-wsupplementary.csv
[31mEntities.pdf[m[m                 Non-StateWarData_v4.0.csv
Extra-StateWarData_v4.0.csv  Territories.csv
IGO_stateunit_v2.3.csv       alliance_v4.1_by_member.csv
Inter-StateWarData_v4.0.csv  contdir.csv
Intra-StateWarData_v4.1.csv  igounit_v2.3.csv
[31mMIDA_4.2.csv[m[m                 majors2016.csv
[31mMIDB_4.2.csv[m[m                 states2016.csv
[31mMIDLOCA_2.0.csv[m[m              system2016.csv
MID_Narratives_1993-2001.pdf tc2014.csv


## Create 'STATE' table

Task: transform states2016.csv into a table with attributes:

- StateID
- StateAbbr
- StateName

in which each StateID occurs once (as it is the Primary Key)

In [3]:
dfStates = pd.read_csv('SourceData/CorrelatesOfWar/states2016.csv')
dfStates

Unnamed: 0,stateabb,ccode,statenme,styear,stmonth,stday,endyear,endmonth,endday,version
0,USA,2,United States of America,1816,1,1,2016,12,31,2016
1,CAN,20,Canada,1920,1,10,2016,12,31,2016
2,BHM,31,Bahamas,1973,7,10,2016,12,31,2016
3,CUB,40,Cuba,1902,5,20,1906,9,25,2016
4,CUB,40,Cuba,1909,1,23,2016,12,31,2016
5,HAI,41,Haiti,1859,1,1,1915,7,28,2016
6,HAI,41,Haiti,1934,8,15,2016,12,31,2016
7,DOM,42,Dominican Republic,1894,1,1,1916,11,29,2016
8,DOM,42,Dominican Republic,1924,9,29,2016,12,31,2016
9,JAM,51,Jamaica,1962,8,6,2016,12,31,2016


In [4]:
dfStates.drop(columns=['styear', 'stmonth', 'stday', 'endyear', 'endmonth', 'endday', 'version'], inplace=True)
dfStates.drop_duplicates(inplace=True)
dfStates.rename(columns={"stateabb": "StateAbbr", "ccode":"StateID", "statenme":"StateName"}, inplace=True)
dfStates =  dfStates[['StateID', 'StateAbbr', 'StateName']]
dfStates

Unnamed: 0,StateID,StateAbbr,StateName
0,2,USA,United States of America
1,20,CAN,Canada
2,31,BHM,Bahamas
3,40,CUB,Cuba
5,41,HAI,Haiti
7,42,DOM,Dominican Republic
9,51,JAM,Jamaica
10,52,TRI,Trinidad and Tobago
11,53,BAR,Barbados
12,54,DMA,Dominica


## Create 'STATE_DATES' table

Task: transform states2016.csv into a table with attributes:

- StateID
- StartDate
- EndDate
- StartYear
- StartMonth
- StartDay
- EndYear
- EndMonth
- EndDay

in which each combination of StateID and StartDate occurs only once.

Note: StartDate and EndDate must be in the format 'YYYY-MM-DD'

In [5]:
dfStateDates = pd.read_csv('SourceData/CorrelatesOfWar/states2016.csv')
dfStateDates

Unnamed: 0,stateabb,ccode,statenme,styear,stmonth,stday,endyear,endmonth,endday,version
0,USA,2,United States of America,1816,1,1,2016,12,31,2016
1,CAN,20,Canada,1920,1,10,2016,12,31,2016
2,BHM,31,Bahamas,1973,7,10,2016,12,31,2016
3,CUB,40,Cuba,1902,5,20,1906,9,25,2016
4,CUB,40,Cuba,1909,1,23,2016,12,31,2016
5,HAI,41,Haiti,1859,1,1,1915,7,28,2016
6,HAI,41,Haiti,1934,8,15,2016,12,31,2016
7,DOM,42,Dominican Republic,1894,1,1,1916,11,29,2016
8,DOM,42,Dominican Republic,1924,9,29,2016,12,31,2016
9,JAM,51,Jamaica,1962,8,6,2016,12,31,2016


In [6]:
dfStateDates.drop(columns=['stateabb', 'statenme', 'version'], inplace=True)
dfStateDates.rename(columns={"ccode":"StateID", "styear": "StartYear", "stmonth":"StartMonth", "stday":"StartDay", "endyear": "EndYear", "endmonth":"EndMonth", "endday":"EndDay"}, inplace=True)
dfStateDates['StartDate'] = pd.to_datetime(dict(year=dfStateDates.StartYear, month=dfStateDates.StartMonth, day=dfStateDates.StartDay))
dfStateDates['EndDate'] = pd.to_datetime(dict(year=dfStateDates.EndYear, month=dfStateDates.EndMonth, day=dfStateDates.EndDay))
dfStateDates = dfStateDates[['StateID', 'StartDate', 'EndDate', 'StartYear', 'StartMonth', 'StartDay', 'EndYear', 'EndMonth', 'EndDay']]
dfStateDates

Unnamed: 0,StateID,StartDate,EndDate,StartYear,StartMonth,StartDay,EndYear,EndMonth,EndDay
0,2,1816-01-01,2016-12-31,1816,1,1,2016,12,31
1,20,1920-01-10,2016-12-31,1920,1,10,2016,12,31
2,31,1973-07-10,2016-12-31,1973,7,10,2016,12,31
3,40,1902-05-20,1906-09-25,1902,5,20,1906,9,25
4,40,1909-01-23,2016-12-31,1909,1,23,2016,12,31
5,41,1859-01-01,1915-07-28,1859,1,1,1915,7,28
6,41,1934-08-15,2016-12-31,1934,8,15,2016,12,31
7,42,1894-01-01,1916-11-29,1894,1,1,1916,11,29
8,42,1924-09-29,2016-12-31,1924,9,29,2016,12,31
9,51,1962-08-06,2016-12-31,1962,8,6,2016,12,31


## Create 'STATE_CONTIGUITY' table

Task: transform contdir.csv into a table with attributes:

- StateA
- StateB
- StartDate
- EndDate
- StartYear
- StartMonth
- EndYear
- EndMonth
- Type
- Notes

In [7]:
dfStateCont = pd.read_csv('SourceData/CorrelatesOfWar/contdir.csv')
dfStateCont

Unnamed: 0,dyad,statelno,statelab,statehno,statehab,conttype,begin,end,notes,version
0,2020,2,USA,20,CAN,1,192001,201612,Begins with CAN system entry,3.2
1,2031,2,USA,31,BHM,4,197307,201612,Across Atlantic Ocean (closest=Florida-Bimini)...,3.2
2,2040,2,USA,40,CUB,4,190205,190609,Across Florida Straits (closest=Key West); beg...,3.2
3,2040,2,USA,40,CUB,4,190901,201612,Across Florida Straits (closest=Key West); res...,3.2
4,2070,2,USA,70,MEX,1,183101,201612,Begins with MEX system entry,3.2
5,2365,2,USA,365,RUS,2,195901,201612,Across Bering Strait (closest=Alaska-Siberia);...,3.2
6,31040,31,BHM,40,CUB,4,197307,201612,Across Atlantic Ocean (closest=Great Inagua); ...,3.2
7,31041,31,BHM,41,HAI,4,197307,201612,Across Atlantic Ocean (closest=Great Inagua); ...,3.2
8,31042,31,BHM,42,DOM,4,197307,201612,Across Atlantic Ocean (closest=Great Inagua); ...,3.2
9,31051,31,BHM,51,JAM,5,197307,201612,Across Windward Passage (closest=Great Inagua)...,3.2


In [8]:
dfStateCont.drop(columns=['dyad', 'statelab', 'statehab', 'version'], inplace=True)
dfStateCont.rename(columns={"statelno":"StateA", "statehno": "StateB", "conttype":"Type", "notes":"Notes"}, inplace=True)
dfStateCont['StartYear'] = dfStateCont['begin'].astype(str).str[0:4]
dfStateCont['StartMonth'] = dfStateCont['begin'].astype(str).str[4:6]
dfStateCont['EndYear'] = dfStateCont['end'].astype(str).str[0:4]
dfStateCont['EndMonth'] = dfStateCont['end'].astype(str).str[4:6]
dfStateCont.drop(columns=['begin', 'end'], inplace=True)
dfStateCont['StartDate'] = pd.to_datetime(dict(year=dfStateCont.StartYear, month=dfStateCont.StartMonth, day='01'))
dfStateCont['EndDate'] = pd.to_datetime(dict(year=dfStateCont.EndYear, month=dfStateCont.EndMonth, day='01'))
dfStateCont = dfStateCont[['StateA', 'StateB', 'StartDate', 'EndDate', 'StartYear', 'StartMonth', 'EndYear', 'EndMonth', 'Type', 'Notes']]
dfStateCont

Unnamed: 0,StateA,StateB,StartDate,EndDate,StartYear,StartMonth,EndYear,EndMonth,Type,Notes
0,2,20,1920-01-01,2016-12-01,1920,01,2016,12,1,Begins with CAN system entry
1,2,31,1973-07-01,2016-12-01,1973,07,2016,12,4,Across Atlantic Ocean (closest=Florida-Bimini)...
2,2,40,1902-05-01,1906-09-01,1902,05,1906,09,4,Across Florida Straits (closest=Key West); beg...
3,2,40,1909-01-01,2016-12-01,1909,01,2016,12,4,Across Florida Straits (closest=Key West); res...
4,2,70,1831-01-01,2016-12-01,1831,01,2016,12,1,Begins with MEX system entry
5,2,365,1959-01-01,2016-12-01,1959,01,2016,12,2,Across Bering Strait (closest=Alaska-Siberia);...
6,31,40,1973-07-01,2016-12-01,1973,07,2016,12,4,Across Atlantic Ocean (closest=Great Inagua); ...
7,31,41,1973-07-01,2016-12-01,1973,07,2016,12,4,Across Atlantic Ocean (closest=Great Inagua); ...
8,31,42,1973-07-01,2016-12-01,1973,07,2016,12,4,Across Atlantic Ocean (closest=Great Inagua); ...
9,31,51,1973-07-01,2016-12-01,1973,07,2016,12,5,Across Windward Passage (closest=Great Inagua)...


## Create 'TERRITORIALCHANGE' table

Task: transform tc2014.csv into a table with attributes:

- TerritorialChangeID
- Gainer
- Loser
- TransferDate
- Year
- Month
- Procedure
- TerritoryID
- TerritoryArea
- TerritoryPopulation
- IsWholeTerritory
- IsMilConflict
- IsIndependence
- GainerIsCont
- LoserIsCont
- IsGainerHomeland
- IsLoserHomeland
- IsSystemEntry
- IsSystemExit

In [9]:
dfTerrChange = pd.read_csv('SourceData/CorrelatesOfWar/tc2014.csv')
dfTerrChange

Unnamed: 0,year,month,gainer,gaintype,procedur,entity,contgain,area,pop,portion,loser,losetype,contlose,entry,exit,number,indep,conflict,version
0,1816,7,160,1,-9,160,-9,2093164.00,1970000,1,230,0,0,1,0,3,1,0,5
1,1816,3,200,0,3,790,0,1.00,.,0,790,1,1,0,0,4,0,1,5
2,1816,.,200,0,3,420,0,179.00,.,0,-9,1,-9,0,0,5,0,0,5
3,1817,.,220,0,3,433,0,7819.00,100000,1,200,0,0,0,0,28,0,0,5
4,1817,.,365,1,1,365,1,650.00,.,0,-9,1,1,0,0,29,0,1,5
5,1818,10,2,1,3,20,1,84240.00,.,0,200,0,0,0,0,30,0,0,5
6,1818,12,155,1,-9,155,-9,464568.00,1656300,1,230,0,0,1,0,31,1,1,5
7,1818,10,200,0,3,2,0,41600.00,.,0,2,1,1,0,0,32,0,0,5
8,1818,6,200,0,1,750,0,421200.00,.,0,-9,1,-9,0,0,33,0,1,5
9,1818,.,200,0,2,438,0,16.00,.,0,-9,1,-9,0,0,34,0,0,5


In [10]:
dfTerrChange.rename(columns={"year":"Year", "month": "Month", "gainer":"Gainer", "gaintype":"IsGainerHomeland", "procedur":"Procedure", "entity":"TerritoryID", "contgain":"GainerIsCont", "area":"TerritoryArea", "pop":"TerritoryPopulation", "portion":"IsWholeTerritory", "loser":"Loser", "losetype":"IsLoserHomeland", "contlose": "LoserIsCont", "entry":"IsSystemEntry", "exit":"IsSystemExit", "number":"TerritorialChangeID", "indep":"IsIndependence", "conflict":"IsMilConflict"}, inplace=True)
dfTerrChange.drop(columns=['version'], inplace=True)
missingmonth = (dfTerrChange['Month'] == '.')
dfTerrChange['MonthClean'] = dfTerrChange['Month']
dfTerrChange['MonthClean'] [dfTerrChange['MonthClean'] == '.'] = 1 # boolean mask
dfTerrChange['TransferDate'] = pd.to_datetime(dict(year=dfTerrChange.Year, month=dfTerrChange.MonthClean, day='01'))
dfTerrChange.replace({'Month':'.'}, {'Month':''}, inplace=True)
dfTerrChange = dfTerrChange[['TerritorialChangeID', 'Gainer', 'Loser', 'TransferDate', 'Year', 'Month', 'Procedure', 'TerritoryID', 'TerritoryArea', 'TerritoryPopulation', 'IsWholeTerritory', 'IsMilConflict', 'IsIndependence', 'GainerIsCont', 'LoserIsCont', 'IsGainerHomeland', 'IsLoserHomeland', 'IsSystemEntry', 'IsSystemExit']]
dfTerrChange

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,TerritorialChangeID,Gainer,Loser,TransferDate,Year,Month,Procedure,TerritoryID,TerritoryArea,TerritoryPopulation,IsWholeTerritory,IsMilConflict,IsIndependence,GainerIsCont,LoserIsCont,IsGainerHomeland,IsLoserHomeland,IsSystemEntry,IsSystemExit
0,3,160,230,1816-07-01,1816,7,-9,160,2093164.00,1970000,1,0,1,-9,0,1,0,1,0
1,4,200,790,1816-03-01,1816,3,3,790,1.00,.,0,1,0,0,1,0,1,0,0
2,5,200,-9,1816-01-01,1816,,3,420,179.00,.,0,0,0,0,-9,0,1,0,0
3,28,220,200,1817-01-01,1817,,3,433,7819.00,100000,1,0,0,0,0,0,0,0,0
4,29,365,-9,1817-01-01,1817,,1,365,650.00,.,0,1,0,1,1,1,1,0,0
5,30,2,200,1818-10-01,1818,10,3,20,84240.00,.,0,0,0,1,0,1,0,0,0
6,31,155,230,1818-12-01,1818,12,-9,155,464568.00,1656300,1,1,1,-9,0,1,0,1,0
7,32,200,2,1818-10-01,1818,10,3,2,41600.00,.,0,0,0,0,1,0,1,0,0
8,33,200,-9,1818-06-01,1818,6,1,750,421200.00,.,0,1,0,0,-9,0,1,0,0
9,34,200,-9,1818-01-01,1818,,2,438,16.00,.,0,0,0,0,-9,0,1,0,0


## Create 'TERRITORY' table

Task: transform Territory.csv into a table with attributes:

- TerritoryID
- TerritoryName

Note: Territory.csv was created by running Entities.pdf through [Tabula](https://tabula.technology/) and hand-correcting minor errors (some sets of rows were shifted to the left).