# IGO Data Transformation

Task: use Pandas to transform csv files into DataFrames that match desired tables for database schema

Tables:

- IGO
- IGO_MEMBERSHIP

In [1]:
import pandas as pd
import numpy as np

## Create 'IGO' table

Task: transform 'igounit_v2.3.csv' into a table with the following attributes:

- igoID
- igoAbbr
- igoShortName
- igoLongName
- StartYear
- EndYear
- EndReason
- Notes

In [2]:
dfigo = pd.read_csv('../SourceData/CorrelatesOfWar/igounit_v2.3.csv', 
                    usecols=['ioname', 'orgname', 'deaddate', 'integrated', 'replaced', 'ionum', 'longorgname', 'sdate'], 
                    encoding='utf-8', dtype={'sdate': 'Int64', 'deaddate': 'Int64'}) \
        .drop_duplicates() \
        .rename(columns={'ioname':'igoAbbr', 'orgname':'igoShortName', 'deaddate':'EndYear', 'ionum':'igoID', 
                         'longorgname':'igoLongName', 'sdate':'StartYear'})

dfigo

Unnamed: 0,igoAbbr,igoShortName,EndYear,integrated,replaced,igoID,igoLongName,StartYear
0,AAAID,Arab Auth. for Ag. Invest. & Development,,0,0,370,Arab Authority for Agricultural Investment and...,1976
30,AAB,Alliance Against Biopiracy,,0,0,305,Alliance Against Biopiracy,2002
34,AACB,Assoc. of African Central Banks,,0,0,690,Association of African Central Banks (AACB),1968
72,AACarib,Anglo-Am Caribbean Comm,1946,0,1,350,Anglo-American Caribbean Commission,1942
73,AALCO,Asian-African Legal Consultative Org,,0,0,630,Asian-African Legal Consultative Committee,1956
...,...,...,...,...,...,...,...,...
12755,WNF,West Nordic Fund,,0,0,4520,West-Nordic Foundation,1987
12774,WPact,Warsaw Treaty Org,1991,0,0,4460,Warsaw Treaty Organization,1955
12804,WTO,World Trade Org,,0,0,4580,World Trade Organization (WTO),1995
12815,WTOURO,World Tourism Org.,,0,0,4570,World Tourism Organization (WTO),1975


In [3]:
dfigo['igoAbbr'].value_counts().head(8)

SITTDEC    3
AU         2
EIPA       2
AIC        2
CELC       2
IVWO       2
NCRR       2
NIB        1
Name: igoAbbr, dtype: int64

In [4]:
dfigo.loc[dfigo['igoAbbr'] == 'SITTDEC']

Unnamed: 0,igoAbbr,igoShortName,EndYear,integrated,replaced,igoID,igoLongName,StartYear
11719,SITTDEC,"South Investment, Trade & Tech. Data Exchg. Ce...",,0,0,4180,"South Investment, Trade and Technological Data...",1990.0
11730,SITTDEC,"South Investment, Trade & Tech. Data Exchg. Ce...",,0,0,4180,"South Investment, Trade and Technological Data...",
11733,SITTDEC,"South Investment, Trade & Tech. Data Exchg. Ce...",,0,0,4180,"South Investment, Trade, and Technological Dat...",


Duplicate rows for IOs due to some irregularities in the other rows. This is likely a data quality issue due to heavy redundancy in the original dataset. 
Discrepancies are in the 'sdate', 'integrated', 'longorgname', or 'deaddate' columns.

Dropping the columns with the least information.

In [5]:
dfigo = dfigo.drop([11730, 11733, 1820, 9734, 4379, 3102, 745, 8686])

In [6]:
dfigo['EndReason'] = 'None'
dfigo.loc[dfigo['integrated'] == 1, 'EndReason'] = 'integrated'
dfigo.loc[dfigo['replaced'] == 1, 'EndReason'] = 'replaced'

dfigo = dfigo[['igoID', 'igoAbbr', 'igoShortName', 'igoLongName', 'StartYear', 'EndYear', 'EndReason']]
dfigo

Unnamed: 0,igoID,igoAbbr,igoShortName,igoLongName,StartYear,EndYear,EndReason
0,370,AAAID,Arab Auth. for Ag. Invest. & Development,Arab Authority for Agricultural Investment and...,1976,,
30,305,AAB,Alliance Against Biopiracy,Alliance Against Biopiracy,2002,,
34,690,AACB,Assoc. of African Central Banks,Association of African Central Banks (AACB),1968,,
72,350,AACarib,Anglo-Am Caribbean Comm,Anglo-American Caribbean Commission,1942,1946,replaced
73,630,AALCO,Asian-African Legal Consultative Org,Asian-African Legal Consultative Committee,1956,,
...,...,...,...,...,...,...,...
12755,4520,WNF,West Nordic Fund,West-Nordic Foundation,1987,,
12774,4460,WPact,Warsaw Treaty Org,Warsaw Treaty Organization,1955,1991,
12804,4580,WTO,World Trade Org,World Trade Organization (WTO),1995,,
12815,4570,WTOURO,World Tourism Org.,World Tourism Organization (WTO),1975,,


Now to take care of those notes...

In [7]:
dfigonotes = pd.read_csv('../SourceData/CorrelatesOfWar/igounit_v2.3.csv', usecols=['ionum', 'Sources', 'Sources2'], dtype={'Sources': str, 'Sources2': str}) \
        .rename(columns={'ionum':'igoID'}) \
        .dropna(subset=['Sources'])

In [8]:
dfigonotes.Sources2 = dfigonotes.Sources2.fillna('')
dfigonotes['Notes'] = 'WEBSITE: ' + dfigonotes['Sources'] + ' NOTES: ' + dfigonotes['Sources2']
dfigonotesjoined = dfigonotes.groupby('igoID')['Notes'].apply(lambda x: ';'.join(x))
dfigonotesjoined

igoID
25      WEBSITE: http://www.africa-union.org/ NOTES: ;...
125     WEBSITE: http://www.boip.int/ NOTES: ;WEBSITE:...
275     WEBSITE: http://www.gene.ch/gentech/2002/Feb/m...
305     WEBSITE: www.rainforestcoalition.org NOTES: ;W...
725     WEBSITE: http://www.icc-cpi.int/home.html&l=en...
795     WEBSITE: http://portal.unesco.org/education/en...
825     WEBSITE: http://www.boip.int/ NOTES: ;WEBSITE:...
873     WEBSITE: www.rainforestcoalition.org NOTES: ;W...
1115            WEBSITE: http://www.marri-rc.org/ NOTES: 
1345    WEBSITE: http://www.nwhf.no/ NOTES:   Website ...
1355    WEBSITE: http://www.unctad.org/infocomm/anglai...
1415    WEBSITE: http://www.acwl.ch/ NOTES:   [Per org...
1855    WEBSITE: http://www.wmdcommission.org/ NOTES: ...
1905    WEBSITE: http://en.wikipedia.org/wiki/Bank_of_...
1960    WEBSITE: http://www.s-cica.org/page.php?page_i...
2015    WEBSITE: http://www.earthobservations.org/ NOT...
2315    WEBSITE: http://www.marri-rc.org/ NOTES: ;WEBS...
2492    

In [9]:
dfigonew = pd.merge(dfigo, dfigonotesjoined.to_frame(), on='igoID', how='outer')
dfigonew

Unnamed: 0,igoID,igoAbbr,igoShortName,igoLongName,StartYear,EndYear,EndReason,Notes
0,370,AAAID,Arab Auth. for Ag. Invest. & Development,Arab Authority for Agricultural Investment and...,1976,,,
1,305,AAB,Alliance Against Biopiracy,Alliance Against Biopiracy,2002,,,WEBSITE: www.rainforestcoalition.org NOTES: ;W...
2,690,AACB,Assoc. of African Central Banks,Association of African Central Banks (AACB),1968,,,
3,350,AACarib,Anglo-Am Caribbean Comm,Anglo-American Caribbean Commission,1942,1946,replaced,
4,630,AALCO,Asian-African Legal Consultative Org,Asian-African Legal Consultative Committee,1956,,,
...,...,...,...,...,...,...,...,...
524,4520,WNF,West Nordic Fund,West-Nordic Foundation,1987,,,
525,4460,WPact,Warsaw Treaty Org,Warsaw Treaty Organization,1955,1991,,
526,4580,WTO,World Trade Org,World Trade Organization (WTO),1995,,,
527,4570,WTOURO,World Tourism Org.,World Tourism Organization (WTO),1975,,,


In [10]:
NotesMaxLength = int(dfigonew['Notes'].str.encode(encoding='utf-8').str.len().max())
print(NotesMaxLength)

890


In [11]:
dfigonew.to_csv('../FinalData/igo.csv', encoding='utf-8', index=False)

## Create 'IGO_MEMBERSHIP' table

Task: transform 'IGO_stateunit_v2.3.csv' into a table with the following attributes:

- igoID
- StateID
- JoinYear
- LeaveYear

In [12]:
import csv

In [13]:
with open('../SourceData/CorrelatesOfWar/IGO_stateunit_v2.3.csv',  'r', encoding = 'utf-8') as fin:
    csvin = csv.reader(fin)
    headers = next(csvin)
    data = [r for r in csvin]

In [14]:
igos = headers[4:] # create a list of igo IDs
countries = list(set([row[0] for row in data])) # create a list of country Ids

In [15]:
def joinleaveyrs(sorted_years_list):
    
    # first, make a new list of lists, with each sequential section of years as a list element
    
    yearlist = [int(x) for x in sorted_years_list]
    partitioned_years_list = [] # the new list of lists
    
    startindex = 0
    endindex = 0
    
    for index, year in enumerate(yearlist):
        if index < len(yearlist) - 1:
            if yearlist[index+1] > year + 1:
                endindex = index
                partitioned_years_list.append(yearlist[startindex:endindex+1])
                startindex = endindex + 1
            else:
                continue
        else:
            partitioned_years_list.append(yearlist[startindex:len(yearlist)])
    
    # next, get first and last years in each sub list
    
    allyrpairs = []
    
    for yrlist in partitioned_years_list:
        startyr = yrlist[0]
        endyr = yrlist[-1]
        yrpair = [startyr, endyr]
        allyrpairs.append(yrpair)
    
    return allyrpairs

note: code from the first portion of the joinleaveyrs function comes from [this stackoverflow answer](https://stackoverflow.com/a/49314031)

In [16]:
countrymemdata = []

# assign an index position for each igo
for igo in igos: 
    igopos = headers.index(igo)

    # iterates through the set of all countries, and for each country gets the cell values for the year column and each igo column
    for country in countries:
        # countryfilter draws from the set of countries, and narrows the rows being examined down to one country per iteration
        countryfilter = [r for r in data if r[0] == country] # r[0] because the 1st column is country IDs
        # years gets all year numbers for that specific country for the iteration
        years = [r[3] for r in countryfilter] # r[3] because the 4th column is years
        # nums gets the igo index position for each igo
        nums = [r[igopos] for r in countryfilter] # nums possible values: 1, 0, -1, -9
        
        yearswithin = [] # empty list for all years country is part of igo to be accumulated in
        for year, num in list(zip(years, nums)): # zip creates an iterable tuple of the years and their igo-num
            if int(num) == 1: # 1 means membership
                yearswithin.append(int(year)) # adds years of membership to list. seperate list for each igo (per country)
        yearswithin.sort() # puts membership years in order
        #print(country, igo, yearswithin)
        
        # end of information gathering; start of information condensing
        
        if len(yearswithin) >= 1:
            membershipyears = joinleaveyrs(yearswithin)
        else:
            continue
        
        for yrset in membershipyears:
            countrymemdatarow = []
            countrymemdatarow.append(igo)
            countrymemdatarow.append(country)
            countrymemdatarow.append(yrset[0])
            countrymemdatarow.append(yrset[1])
            countrymemdata.append(countrymemdatarow)

In [17]:
outfileheaders = ['igoID', 'StateID', 'JoinYear', 'LeaveYear']

with open('../FinalData/proto_igo_membership.csv', 'w', newline = '') as outfile:
    csvout = csv.writer(outfile)
    csvout.writerow(outfileheaders)
    csvout.writerows(countrymemdata)

In [18]:
dfigomem = pd.read_csv('../FinalData/proto_igo_membership.csv')
dfigomem.rename(columns={'igoID':'igoAbbr'},inplace=True)
dfigomem

Unnamed: 0,igoAbbr,StateID,JoinYear,LeaveYear
0,AAAID,435,1985,2005
1,AAAID,651,1985,2005
2,AAAID,679,2005,2005
3,AAAID,616,1993,2005
4,AAAID,696,1985,2005
...,...,...,...,...
33427,Wassen,344,2005,2005
33428,Wassen,920,1996,2005
33429,Wassen,355,1996,2005
33430,Wassen,20,1996,2005


In [19]:
dfigoids = dfigo[['igoID', 'igoAbbr']]
dfigoids

Unnamed: 0,igoID,igoAbbr
0,370,AAAID
30,305,AAB
34,690,AACB
72,350,AACarib
73,630,AALCO
...,...,...
12755,4520,WNF
12774,4460,WPact
12804,4580,WTO
12815,4570,WTOURO


In [20]:
dfigomemfinal = pd.merge(dfigomem, dfigoids, on='igoAbbr')
dfigomemfinal = dfigomemfinal[['igoID', 'StateID', 'JoinYear', 'LeaveYear']]
dfigomemfinal

Unnamed: 0,igoID,StateID,JoinYear,LeaveYear
0,370,435,1985,2005
1,370,651,1985,2005
2,370,679,2005,2005
3,370,616,1993,2005
4,370,696,1985,2005
...,...,...,...,...
33427,4470,344,2005,2005
33428,4470,920,1996,2005
33429,4470,355,1996,2005
33430,4470,20,1996,2005


In [21]:
dfigomemfinal.to_csv('../FinalData/igo_membership.csv', encoding='utf-8', index=False)