# STATE_ Data Transformation

Task: use Pandas to transform csv files into DataFrames that match desired tables for database schema

Tables:

- STATE_CONTIGUITY
- STATE_RESOURCE

In [2]:
import pandas as pd
import numpy as np

## Create 'STATE_CONTIGUITY' table

Task: transform contdir.csv into a table with attributes:

- StateA
- StateB
- StartDate
- EndDate
- StartYear
- StartMonth
- EndYear
- EndMonth
- Type
- Notes

In [13]:
dfStateCont = pd.read_csv('../SourceData/CorrelatesOfWar/contdir.csv', encoding = 'utf-8')

In [14]:
dfStateCont = dfStateCont.rename(columns={"statelno": "StateA", "statehno": "StateB", "conttype": "Type", "notes": "Notes", "begin": "StartDate", "end": "EndDate"})

dfStateCont['StartDate'] = pd.to_datetime(dfStateCont['StartDate'], format='%Y%m')
dfStateCont['EndDate'] = pd.to_datetime(dfStateCont['EndDate'], format='%Y%m')
dfStateCont['EndDate'] = dfStateCont['EndDate'].replace(pd.Timestamp('2016-12-01'), pd.NaT)

dfStateCont = dfStateCont[['StateA', 'StateB', 'StartDate', 'EndDate', 'Type', 'Notes']]
dfStateCont

Unnamed: 0,StateA,StateB,StartDate,EndDate,Type,Notes
0,2,20,1920-01-01,NaT,1,Begins with CAN system entry
1,2,31,1973-07-01,NaT,4,Across Atlantic Ocean (closest=Florida-Bimini)...
2,2,40,1902-05-01,1906-09-01,4,Across Florida Straits (closest=Key West); beg...
3,2,40,1909-01-01,NaT,4,Across Florida Straits (closest=Key West); res...
4,2,70,1831-01-01,NaT,1,Begins with MEX system entry
...,...,...,...,...,...,...
842,946,983,1999-09-01,NaT,5,Across Pacific Ocean; begins with KIR system e...
843,947,950,2000-08-01,NaT,5,Across Pacific Ocean; begins with TUV system e...
844,950,955,1999-08-01,NaT,5,Across Pacific Ocean; begins with TON system e...
845,955,990,1999-08-01,NaT,5,Across Pacific Ocean; begins with TON system e...


In [6]:
dfStateCont.to_csv('../FinalData/state_contiguity.csv', encoding='utf-8', index=False)

## Create 'STATE_RESOURCE' table

Task: transform NMC_5_0-wsupplementary.csv into a table with attributes:

- StateID
- Year
- ResourceID
- Amount
- Source
- Note
- QualityCode
- AnomalyCode

In [30]:
dfResources = pd.read_csv('../SourceData/CorrelatesOfWar/NMC_5_0-wsupplementary.csv', encoding='utf-8', na_values=[-7, -8, -9])

In [31]:
dfResources.columns

Index(['statenme', 'stateabb', 'ccode', 'year', 'milex', 'milexsource',
       'milexnote', 'milper', 'milpersource', 'milpernote', 'irst',
       'irstsource', 'irstnote', 'irstqualitycode', 'irstanomalycode', 'pec',
       'pecsource', 'pecnote', 'pecqualitycode', 'pecanomalycode', 'tpop',
       'tpopsource', 'tpopnote', 'tpopqualitycode', 'tpopanomalycode', 'upop',
       'upopsource', 'upopnote', 'upopqualitycode', 'upopanomalycode',
       'upopgrowth', 'upopgrowthsource', 'cinc', 'version'],
      dtype='object')

In [32]:
dfResMilex = dfResources[['ccode', 'year', 'milex', 'milexsource', 'milexnote']].copy() \
.rename(columns={'ccode':'StateID', 'year':'Year', 'milex':'Amount', 'milexsource':'Source', 'milexnote':'Note'})

dfResMilPer = dfResources[['ccode', 'year', 'milper', 'milpersource', 'milpernote']].copy() \
.rename(columns={'ccode':'StateID', 'year':'Year', 'milper':'Amount', 'milpersource':'Source', 'milpernote':'Note'})

dfResIrst = dfResources[['ccode', 'year', 'irst','irstsource', 'irstnote', 'irstqualitycode', 'irstanomalycode']].copy() \
.rename(columns={'ccode':'StateID', 'year':'Year', 'irst':'Amount', 'irstsource':'Source', 'irstnote':'Note', 'irstqualitycode':'QualityCode', 'irstanomalycode':'AnomalyCode'})

dfResPec = dfResources[['ccode', 'year', 'pec', 'pecsource', 'pecnote', 'pecqualitycode', 'pecanomalycode']].copy() \
.rename(columns={'ccode':'StateID', 'year':'Year', 'pec':'Amount', 'pecsource':'Source', 'pecnote':'Note', 'pecqualitycode':'QualityCode', 'pecanomalycode':'AnomalyCode'})

dfResTpop = dfResources[['ccode', 'year', 'tpop', 'tpopsource', 'tpopnote', 'tpopqualitycode', 'tpopanomalycode']].copy() \
.rename(columns={'ccode':'StateID', 'year':'Year', 'tpop':'Amount', 'tpopsource':'Source', 'tpopnote':'Note', 'tpopqualitycode':'QualityCode', 'tpopanomalycode':'AnomalyCode'})

dfResUpop = dfResources[['ccode', 'year', 'upop', 'upopsource', 'upopnote', 'upopqualitycode', 'upopanomalycode']].copy() \
.rename(columns={'ccode':'StateID', 'year':'Year', 'upop':'Amount', 'upopsource':'Source', 'upopnote':'Note', 'upopqualitycode':'QualityCode', 'upopanomalycode':'AnomalyCode'})

dfResUpgrowth = dfResources[['ccode', 'year', 'upopgrowth', 'upopgrowthsource']].copy() \
.rename(columns={'ccode':'StateID', 'year':'Year', 'upopgrowth':'Amount', 'upopgrowthsource':'Source'})

In [33]:
dfResMilex['ResourceID'] = 'milex'
dfResMilPer['ResourceID'] = 'milper'
dfResIrst['ResourceID'] = 'irst'
dfResPec['ResourceID'] = 'pec'
dfResTpop['ResourceID'] = 'tpop'
dfResUpop['ResourceID'] = 'upop'
dfResUpgrowth['ResourceID'] = 'upopgrowth'

In [34]:
resources = [dfResMilex, dfResMilPer, dfResIrst, dfResPec, dfResTpop, dfResUpop, dfResUpgrowth]
dfstateresources = pd.concat(resources, sort=True)
dfstateresources

Unnamed: 0,Amount,AnomalyCode,Note,QualityCode,ResourceID,Source,StateID,Year
0,3823.0,,,,milex,,2,1816
1,2466.0,,,,milex,,2,1817
2,1910.0,,,,milex,,2,1818
3,2301.0,,,,milex,,2,1819
4,1556.0,,,,milex,,2,1820
...,...,...,...,...,...,...,...,...
15166,,,,,upopgrowth,,990,2008
15167,,,,,upopgrowth,,990,2009
15168,,,,,upopgrowth,,990,2010
15169,,,,,upopgrowth,,990,2011


In [35]:
dfstateresources.AnomalyCode.unique()

array([nan, 'A', 'D', 'B', 'C',
       'Assumed component zero values used in PEC computation for NMC 5.0.',
       'E', 'F'], dtype=object)

In [36]:
dfstateresources['AnomalyCode'] = dfstateresources['AnomalyCode'].replace('Assumed component zero values used in PEC computation for NMC 5.0.', '0')
dfstateresources.AnomalyCode.unique()

array([nan, 'A', 'D', 'B', 'C', '0', 'E', 'F'], dtype=object)

In [37]:
dfstateresources = dfstateresources[['StateID', 'Year', 'ResourceID', 'Amount', 'Source', 'Note', 'QualityCode', 'AnomalyCode']]

In [38]:
dfstateresources.to_csv('../FinalData/state_resource.csv', encoding='utf-8', index=False)