# Data Formatting Script

Importing the necessary packages

In [2]:
import pandas as pd
import json

In [3]:
# Read the conflict dyads data from the raw-data folder
conf = pd.read_csv(filepath_or_buffer="data/raw_data/ucdp-prio-acd-181.csv")
# Show the first five rows to examine the structure of the table
conf.head(n=5)

Unnamed: 0,conflict_id,location,side_a,side_a_id,side_a_2nd,side_b,side_b_id,side_b_2nd,incompatibility,territory_name,...,ep_end,ep_end_date,ep_end_prec,gwno_a,gwno_a_2nd,gwno_b,gwno_b_2nd,gwno_loc,region,version
0,200,Bolivia,Government of Bolivia,23,,Popular Revolutionary Movement,719,,2,,...,1,1946-07-21,1.0,145,,,,145,5,18.1
1,200,Bolivia,Government of Bolivia,23,,MNR,720,,2,,...,1,1952-04-12,1.0,145,,,,145,5,18.1
2,200,Bolivia,Government of Bolivia,23,,ELN,721,,2,,...,1,1967-10-16,1.0,145,,,,145,5,18.1
3,201,Cambodia (Kampuchea),Government of France,33,,Khmer Issarak,160,,1,Cambodia,...,0,,,220,,,,811,3,18.1
4,201,Cambodia (Kampuchea),Government of France,33,,Khmer Issarak,160,,1,Cambodia,...,0,,,220,,,,811,3,18.1


In [4]:
# Function to convert numeric values and return 0 if they are not numeric,
# this will allow entries without countries as the co-belligerent to be filtered out
def integerize(x):
    """
    Convert a value to an integer, falling back to 0 when that is not possible.

    Parameters
    ----------
    x : any
        Value to convert with ``int()``.

    Returns
    -------
    int
        ``int(x)`` when the conversion succeeds, otherwise 0, so callers can
        keep only usable entries with a simple ``> 0`` comparison.
    """
    try:
        return int(x)
    except (ValueError, TypeError, OverflowError):
        # ValueError: NaN or non-numeric text; TypeError: None and other
        # non-numeric objects; OverflowError: infinite floats.
        # NOTE(review): a comma-separated list such as "220, 800" is also
        # rejected by int() and therefore maps to 0 -- confirm that dropping
        # such entries is intended.
        return 0
    
# Keep only the rows that have countries on both sides,
# i.e. the rows whose side-B code converts to an integer greater than 0
side_b_is_country = conf.gwno_b.apply(integerize) > 0
country_conf = conf.loc[side_b_is_country]

print(country_conf.shape)  # (rows, columns) of the filtered data frame
print(country_conf.columns)  # names of the columns
country_conf.head()

(125, 28)
Index(['conflict_id', 'location', 'side_a', 'side_a_id', 'side_a_2nd',
       'side_b', 'side_b_id', 'side_b_2nd', 'incompatibility',
       'territory_name', 'year', 'intensity_level', 'cumulative_intensity',
       'type_of_conflict', 'start_date', 'start_prec', 'start_date2',
       'start_prec2', 'ep_end', 'ep_end_date', 'ep_end_prec', 'gwno_a',
       'gwno_a_2nd', 'gwno_b', 'gwno_b_2nd', 'gwno_loc', 'region', 'version'],
      dtype='object')


Unnamed: 0,conflict_id,location,side_a,side_a_id,side_a_2nd,side_b,side_b_id,side_b_2nd,incompatibility,territory_name,...,ep_end,ep_end_date,ep_end_prec,gwno_a,gwno_a_2nd,gwno_b,gwno_b_2nd,gwno_loc,region,version
119,214,"France, Thailand",Government of France,33,,Government of Thailand,147,,1,Northern Cambodia,...,1,1946-11-17,1.0,220,,800,,"220, 800","1, 3",18.1
120,215,"Albania, United Kingdom",Government of Albania,45,,Government of United Kingdom,28,,1,Korfu Channel,...,1,1946-12-31,5.0,339,,200,,"200, 339",1,18.1
131,218,"India, Pakistan",Government of India,141,,Government of Pakistan,142,,1,Kashmir,...,1,1948-12-31,1.0,750,,770,,"750, 770",3,18.1
132,218,"India, Pakistan",Government of India,141,,Government of Pakistan,142,,1,Kashmir,...,0,,,750,,770,,"750, 770",3,18.1
133,218,"India, Pakistan",Government of India,141,,Government of Pakistan,142,,1,Kashmir,...,1,1965-12-15,1.0,750,,770,,"750, 770",3,18.1


In [5]:
# Print the side-A codes to see how rows with multiple gwno's are stored
print(country_conf.gwno_a)

119                         220
120                         339
131                         750
132                         750
133                         750
134                         750
135                         750
136                         750
137                         750
138                         750
139                         750
140                         750
141                         750
142                         750
143                         750
144                         750
145                         750
146                         750
147                         750
148                         750
149                         750
150                         750
151                         750
152                         750
314                         751
348     651, 645, 663, 660, 652
349     651, 645, 663, 660, 652
424                         710
425                         710
426                         710
                 ...           
1567    

In [6]:
# nodes_primary: every code found in the primary columns (gwno_a / gwno_b)
# nodes_all: the primary codes plus those in the secondary columns (gwno_a_2nd / gwno_b_2nd)
nodes_primary = set()
nodes_all = set()

for _, conflict in country_conf.iterrows():
    # A cell may hold several comma-separated codes; a missing cell stringifies to 'nan'
    primary_tokens = str(conflict['gwno_a']).split(',') + str(conflict['gwno_b']).split(',')
    primary_codes = {int(token) for token in primary_tokens if token != 'nan'}
    nodes_all |= primary_codes
    nodes_primary |= primary_codes

    secondary_tokens = str(conflict['gwno_a_2nd']).split(',') + str(conflict['gwno_b_2nd']).split(',')
    nodes_all |= {int(token) for token in secondary_tokens if token != 'nan'}

In [7]:
def nums_to_list(nums):
    """
    Split a comma-separated value into a list of integers.

    The value is stringified first, so a single number works as well as a
    string such as "651, 645". A token equal to 'nan' (what a missing value
    stringifies to) is skipped, which makes a missing value yield [].
    """
    codes = []
    for token in str(nums).split(','):
        if token == 'nan':
            continue
        codes.append(int(token))
    return codes


def segmented_collocation_graph(primary, secondary, graph_all={}, graph_primary={}):
    """
    Count co-occurrences within one group of items, updating two graphs in place.

    Each graph is a nested dict ``{i: {j: count}}``:

    * ``graph_primary`` gains +1 for every ordered pair of distinct items in
      ``primary``;
    * ``graph_all`` gains +1 for every ordered pair of distinct items in
      ``primary + secondary``.

    Every item gets an entry in the relevant graph even when it has no partner.

    Parameters
    ----------
    primary : list
        Items counted in both graphs.
    secondary : list
        Items counted in ``graph_all`` only.
    graph_all : dict, optional
        Graph to update with all items.
    graph_primary : dict, optional
        Graph to update with the primary items.

    Returns
    -------
    tuple
        ``(graph_all, graph_primary)`` -- the same dict objects that were
        passed in (or the defaults), after updating.

    Notes
    -----
    The default dicts are created once, when the function is defined, and are
    shared by every call that omits them; such calls therefore keep adding to
    the same two graphs. Pass explicit dicts to start from empty graphs.
    """
    everyone = primary + secondary

    # Pairs are counted on every call -- not only the first time an item is
    # seen -- so repeated co-occurrences accumulate in both graphs alike.
    for members, graph in ((primary, graph_primary), (everyone, graph_all)):
        for i in members:
            partners = graph.setdefault(i, {})
            for j in members:
                if i != j:
                    partners[j] = partners.get(j, 0) + 1

    return graph_all, graph_primary
    
    

In [8]:
# Weighted edge maps shaped {source code: {target code: number of rows}}
conflicts_all = {}
conflicts_primary = {}

# The alliance graphs are created here and handed to every call explicitly, so
# re-running this cell starts from empty graphs instead of adding to dicts kept
# alive inside segmented_collocation_graph's default arguments.
alliances_all = {}
alliances_primary = {}

for i, row in country_conf.iterrows():
    side_a_primary = nums_to_list(row['gwno_a'])
    side_a_secondary = nums_to_list(row['gwno_a_2nd'])

    side_b_primary = nums_to_list(row['gwno_b'])
    side_b_secondary = nums_to_list(row['gwno_b_2nd'])

    # Codes listed on the same side of a row are counted as allies of one another
    alliances_all, alliances_primary = segmented_collocation_graph(side_a_primary, side_a_secondary,
                                                                   alliances_all, alliances_primary)
    alliances_all, alliances_primary = segmented_collocation_graph(side_b_primary, side_b_secondary,
                                                                   alliances_all, alliances_primary)

    # Conflict edges are stored in the side A -> side B direction only
    for num_a in side_a_primary:
        opponents = conflicts_primary.setdefault(num_a, {})
        for num_b in side_b_primary:
            opponents[num_b] = opponents.get(num_b, 0) + 1

    # Same counting, with the secondary codes of both sides included
    for num_a in side_a_primary + side_a_secondary:
        opponents = conflicts_all.setdefault(num_a, {})
        for num_b in side_b_primary + side_b_secondary:
            opponents[num_b] = opponents.get(num_b, 0) + 1
    
    
    

In [9]:
def region_converter(num):
    """
    converts gwno into region variable

    Parameters
    ----------
    num : int
        Numeric gwno code.

    Returns
    -------
    str
        One of "Europe", "Middle East", "Asia", "Africa", "Americas",
        or "Other" when the code falls in none of the ranges below.
    """
    if 200 <= num <= 395:
        region = "Europe"
    elif 630 <= num <= 698 and not num == 651:
        # 651 is carved out of the Middle East range and assigned to Africa below
        # (presumably Egypt -- confirm against the code list)
        region = "Middle East"
    elif 700 <= num <= 990:
        region = "Asia"
    elif 400 <= num <= 626 or num == 651:
        # `or`, not `and`: 651 lies outside 400-626, so requiring both
        # conditions at once could never match and no code would map to Africa
        region = "Africa"
    elif 2 <= num <= 165:
        region = "Americas"
    else:
        region = "Other"

    return region

In [10]:
def _node_records(codes):
    """Turn a collection of codes into [{'id': ..., 'region': ...}, ...] records."""
    records = []
    for code in codes:
        records.append({'id': code, 'region': region_converter(code)})
    return records


def _edge_records(graph, kind):
    """Flatten a nested {source: {target: weight}} dict into a list of edge records."""
    records = []
    for source, targets in graph.items():
        for target, weight in targets.items():
            records.append({'source': source, 'target': target, 'value': weight, 'type': kind})
    return records


# Node lists: one record per code, labelled with its region
nodes_all_json = _node_records(nodes_all)
nodes_primary_json = _node_records(nodes_primary)

# Edge lists: conflicts are typed 'enemy', alliances are typed 'ally'
primary_conflicts_json = _edge_records(conflicts_primary, 'enemy')
primary_alliances_json = _edge_records(alliances_primary, 'ally')
all_conflicts_json = _edge_records(conflicts_all, 'enemy')
all_alliances_json = _edge_records(alliances_all, 'ally')


In [11]:
# Read the country-code table (the StateAbb, CCode and StateNme columns are used below)
ccodes = pd.read_csv(filepath_or_buffer='data/raw_data/COW_country_codes.csv')
# Show the first five rows to check the layout
ccodes.head(n=5)

Unnamed: 0,StateAbb,CCode,StateNme
0,USA,2,United States of America
1,CAN,20,Canada
2,BHM,31,Bahamas
3,CUB,40,Cuba
4,CUB,40,Cuba


In [12]:
# Lookup table: CCode -> {'abb': StateAbb, 'name': StateNme}
ccode_converter = {}

for _, country in ccodes.iterrows():
    # setdefault keeps the first row seen for a code, so repeated codes do not overwrite it
    ccode_converter.setdefault(
        country['CCode'],
        {'abb': country['StateAbb'], 'name': country['StateNme']},
    )

print(ccode_converter)

{2: {'abb': 'USA', 'name': 'United States of America'}, 20: {'abb': 'CAN', 'name': 'Canada'}, 31: {'abb': 'BHM', 'name': 'Bahamas'}, 40: {'abb': 'CUB', 'name': 'Cuba'}, 41: {'abb': 'HAI', 'name': 'Haiti'}, 42: {'abb': 'DOM', 'name': 'Dominican Republic'}, 51: {'abb': 'JAM', 'name': 'Jamaica'}, 52: {'abb': 'TRI', 'name': 'Trinidad and Tobago'}, 53: {'abb': 'BAR', 'name': 'Barbados'}, 54: {'abb': 'DMA', 'name': 'Dominica'}, 55: {'abb': 'GRN', 'name': 'Grenada'}, 56: {'abb': 'SLU', 'name': 'St. Lucia'}, 57: {'abb': 'SVG', 'name': 'St. Vincent and the Grenadines'}, 58: {'abb': 'AAB', 'name': 'Antigua & Barbuda'}, 60: {'abb': 'SKN', 'name': 'St. Kitts and Nevis'}, 70: {'abb': 'MEX', 'name': 'Mexico'}, 80: {'abb': 'BLZ', 'name': 'Belize'}, 90: {'abb': 'GUA', 'name': 'Guatemala'}, 91: {'abb': 'HON', 'name': 'Honduras'}, 92: {'abb': 'SAL', 'name': 'El Salvador'}, 93: {'abb': 'NIC', 'name': 'Nicaragua'}, 94: {'abb': 'COS', 'name': 'Costa Rica'}, 95: {'abb': 'PAN', 'name': 'Panama'}, 100: {'abb'

In [13]:
folder = 'data/formatted_data/'

# (file name, object to serialise) pairs, in the order the files are written
outputs = [
    ('nodes_all.json', nodes_all_json),
    ('nodes_primary.json', nodes_primary_json),
    ('conflicts_all.json', all_conflicts_json),
    ('alliances_all.json', all_alliances_json),
    ('ccode_converter.json', ccode_converter),
    ('conflicts_primary.json', primary_conflicts_json),
    ('alliances_primary.json', primary_alliances_json),
]

for filename, payload in outputs:
    # Serialise before opening: opening with 'w' truncates the file, so a
    # json.dumps failure must happen first to leave the existing file intact.
    serialised = json.dumps(payload)
    with open(folder + filename, 'w') as f:
        f.write(serialised)