# Data Formatting Script

Importing the necessary packages

In [17]:
import numpy as np
import pandas as pd
import json

In [19]:
conf = pd.read_csv("data/raw_data/ucdp-prio-acd-181.csv")  # reading in conflict dyads data
conf.head()  # examining its structure

Unnamed: 0,conflict_id,location,side_a,side_a_id,side_a_2nd,side_b,side_b_id,side_b_2nd,incompatibility,territory_name,...,ep_end,ep_end_date,ep_end_prec,gwno_a,gwno_a_2nd,gwno_b,gwno_b_2nd,gwno_loc,region,version
0,200,Bolivia,Government of Bolivia,23,,Popular Revolutionary Movement,719,,2,,...,1,1946-07-21,1.0,145,,,,145,5,18.1
1,200,Bolivia,Government of Bolivia,23,,MNR,720,,2,,...,1,1952-04-12,1.0,145,,,,145,5,18.1
2,200,Bolivia,Government of Bolivia,23,,ELN,721,,2,,...,1,1967-10-16,1.0,145,,,,145,5,18.1
3,201,Cambodia (Kampuchea),Government of France,33,,Khmer Issarak,160,,1,Cambodia,...,0,,,220,,,,811,3,18.1
4,201,Cambodia (Kampuchea),Government of France,33,,Khmer Issarak,160,,1,Cambodia,...,0,,,220,,,,811,3,18.1


In [20]:
# Function to convert numeric values and return 0 if they are not numeric, 
# this will allow entries without countries as the co-belligerent to be filtered out
def integerize(x):
    try:
        return int(x)
    except ValueError:
        return 0
    
# Filter the data to just that have countries on both sides
country_conf = conf[conf.gwno_b.apply(integerize) > 0]
print(country_conf.shape) # shape of the data frame
print(country_conf.columns) # names of the columns
country_conf.head()

(125, 28)
Index(['conflict_id', 'location', 'side_a', 'side_a_id', 'side_a_2nd',
       'side_b', 'side_b_id', 'side_b_2nd', 'incompatibility',
       'territory_name', 'year', 'intensity_level', 'cumulative_intensity',
       'type_of_conflict', 'start_date', 'start_prec', 'start_date2',
       'start_prec2', 'ep_end', 'ep_end_date', 'ep_end_prec', 'gwno_a',
       'gwno_a_2nd', 'gwno_b', 'gwno_b_2nd', 'gwno_loc', 'region', 'version'],
      dtype='object')


Unnamed: 0,conflict_id,location,side_a,side_a_id,side_a_2nd,side_b,side_b_id,side_b_2nd,incompatibility,territory_name,...,ep_end,ep_end_date,ep_end_prec,gwno_a,gwno_a_2nd,gwno_b,gwno_b_2nd,gwno_loc,region,version
119,214,"France, Thailand",Government of France,33,,Government of Thailand,147,,1,Northern Cambodia,...,1,1946-11-17,1.0,220,,800,,"220, 800","1, 3",18.1
120,215,"Albania, United Kingdom",Government of Albania,45,,Government of United Kingdom,28,,1,Korfu Channel,...,1,1946-12-31,5.0,339,,200,,"200, 339",1,18.1
131,218,"India, Pakistan",Government of India,141,,Government of Pakistan,142,,1,Kashmir,...,1,1948-12-31,1.0,750,,770,,"750, 770",3,18.1
132,218,"India, Pakistan",Government of India,141,,Government of Pakistan,142,,1,Kashmir,...,0,,,750,,770,,"750, 770",3,18.1
133,218,"India, Pakistan",Government of India,141,,Government of Pakistan,142,,1,Kashmir,...,1,1965-12-15,1.0,750,,770,,"750, 770",3,18.1


In [21]:
print(country_conf['gwno_a']) # examine the structure when there are multiple gwno's

119                         220
120                         339
131                         750
132                         750
133                         750
134                         750
135                         750
136                         750
137                         750
138                         750
139                         750
140                         750
141                         750
142                         750
143                         750
144                         750
145                         750
146                         750
147                         750
148                         750
149                         750
150                         750
151                         750
152                         750
314                         751
348     651, 645, 663, 660, 652
349     651, 645, 663, 660, 652
424                         710
425                         710
426                         710
                 ...           
1567    

In [32]:
nodes = set()  # set to store the unique countries in the data
country_name_gwno_mapping = {} # dictionary to store the combinations of gwno's and country names


for i, row in country_conf.iterrows(): # for every row in the data
    for num, name in \
    zip((row['gwno_a'].split(',') + (str(row['gwno_a_2nd']).split(','))),\
        (row['side_a'].split(',') + str(row['side_a_2nd']).split(','))): # grab the gwno and country names
        if num not in country_name_gwno_mapping and num != 'nan':
            country_name_gwno_mapping[num]= name.strip()
        nodes.add(num.strip())
        
    for num, name in \
    zip((row['gwno_b'].split(',') + (str(row['gwno_b_2nd']).split(','))),\
        (row['side_b'].split(',') + str(row['side_b_2nd']).split(','))): # grab the gwno and country names
        if num not in country_name_gwno_mapping and num != 'nan':
            country_name_gwno_mapping[num]= name.strip()
        nodes.add(num.strip())

In [33]:
print(nodes)
# print(country_name_gwno_mapping)
for k, v in country_name_gwno_mapping.items():
    print(k, v)

{'700', '600', '315', '812', '900', '560', '385', '615', '160', '200', '530', '630', '817', '210', '680', '91', '731', '93', '651', '732', '352', '750', '92', '713', '710', '775', '432', '390', '522', '616', '2', '339', '471', '95', '510', '645', '100', '692', '652', '660', '800', '500', '626', '520', '436', '640', '365', '325', '135', '694', '55', '40', '310', '531', '816', '698', '696', '230', '625', '475', '220', '678', '20', '350', '690', '235', '820', '751', '439', '433', '670', '211', '770', '620', '451', '666', 'nan', '483', '811', '212', '850', '920', '663', '771', '840', '130'}
220 Government of France
800 Government of Thailand
339 Government of Albania
200 Government of United Kingdom
750 Government of India
770 Government of Pakistan
751 Government of Hyderabad
651 Government of Egypt
 645 Government of Iraq
 663 Government of Jordan
 660 Government of Lebanon
 652 Government of Syria
666 Government of Israel
710 Government of China
713 Government of Taiwan
2 Government of 

In [35]:
conflicts = {} # dictionary to hold the combinations of countries
alliances = {}

for i, row in country_conf.iterrows():
    # List of all of the countries on side a
    side_a = [num.strip() for num in (row['gwno_a'].split(',') + (str(row['gwno_a_2nd']).split(','))) if num != 'nan']
    
    # List of all the countries on side b
    side_b = [num.strip() for num in (row['gwno_b'].split(',') + (str(row['gwno_b_2nd']).split(','))) if num != 'nan']
    
    for i in side_a:
        if i not in alliances:
            alliances[i] = {}
        for j in side_a:
            if i != j:
                if j in alliances[i]:
                    alliances[i][j] += 1
                else:
                    alliances[i][j] = 1
                    
    for i in side_b:
        if i not in alliances:
            alliances[i] = {}
        for j in side_a:
            if i != j:
                if j in alliances[i]:
                    alliances[i][j] += 1
                else:
                    alliances[i][j] = 1
    
    for num_a in side_a:
        if num_a not in conflicts:
            conflicts[num_a] = {}
        for num_b in side_b:
            if num_b in conflicts[num_a]:
                conflicts[num_a][num_b] += 1
#                 links[num_a][num_b][1].append(row['start_date'])
            else:
                conflicts[num_a][num_b] = 1
#                 links[num_a][num_b] = [1, [row['start_date']]]
                
    for num_b in side_b:
        if num_b not in conflicts:
            conflicts[num_b] = {}
        for num_a in side_a:
            if num_a in conflicts[num_b]:
                conflicts[num_b][num_a] += 1
            else:
                conflicts[num_b][num_a] = 1
    

In [36]:
print(conflicts)
print(alliances)

{'220': {'800': 1, '731': 4, '710': 4, '365': 4, '616': 1, '645': 1}, '800': {'220': 1, '731': 10, '710': 4, '365': 4, '816': 6, '811': 3, '812': 3}, '339': {'200': 1}, '200': {'339': 1, '731': 4, '710': 4, '365': 4, '651': 2, '850': 4, '160': 1, '645': 2}, '750': {'770': 22, '751': 1, '710': 2}, '770': {'750': 22, '645': 1}, '751': {'750': 1}, '651': {'666': 6, '200': 2, '645': 1}, '645': {'666': 2, '630': 10, '690': 2, '160': 1, '900': 2, '692': 1, '771': 1, '211': 1, '20': 1, '315': 1, '390': 1, '651': 1, '220': 1, '350': 1, '91': 1, '325': 1, '600': 1, '210': 1, '436': 1, '385': 1, '698': 1, '770': 1, '235': 1, '694': 1, '670': 1, '433': 1, '451': 1, '230': 1, '652': 1, '640': 1, '696': 1, '200': 2, '2': 2}, '663': {'666': 3}, '660': {'666': 2}, '652': {'666': 4, '645': 1}, '666': {'651': 6, '645': 2, '663': 3, '660': 2, '652': 4}, '710': {'713': 4, '2': 6, '732': 4, '900': 4, '20': 4, '220': 4, '350': 4, '920': 4, '840': 4, '560': 4, '800': 4, '640': 4, '200': 4, '211': 3, '100': 

In [None]:
nodes_json = [{'id': i}for i in nodes]
print(nodes_json)
conflicts_json = [{'source': a, 'target': b, 'value': w , 'type': 'enemy'} for a, v in conflicts.items() for b, w in v.items() ]
print(conflicts_json)
alliances_json = [{'source': a, 'target': b, 'value': w, 'type': 'ally'} for a, v in alliances.items() for b, w in v.items()]
print(alliances_json)

combined_links_json = conflicts_json + alliances_json

In [None]:
ccodes = pd.read_csv('data/raw_data/COW_country_codes.csv')
ccodes.head()

In [None]:
ccode_converter = {}

for i, row in ccodes.iterrows():
    if row['CCode'] not in ccode_converter:
        ccode_converter[row['CCode']]={'abb': row['StateAbb'], 'name': row['StateNme']}
    
print(ccode_converter)

In [None]:
folder = 'data/formatted_data/'
with open(folder + 'nodes.json', 'w') as f1,\
    open(folder + 'conflicts.json', 'w') as f2,\
    open(folder + 'alliances.json', 'w') as f3,\
    open(folder + 'ccode_converter.json', 'w') as f4, \
    open(folder + 'combined_links.json', 'w') as f5:
        f1.write(json.dumps(nodes_json))
        f2.write(json.dumps(conflicts_json))
        f3.write(json.dumps(alliances_json))
        f4.write(json.dumps(ccode_converter))
        f5.write(json.dumps(combined_links_json))