# Parse Data

## Purpose: create directories containing the data that can be used for S, D, MD

In [1]:
# will use GDELT template
# external libraries
import pandas as pd
import numpy as np
import importlib
import matplotlib.pyplot as plt
import importlib

# internal libraries
import PullData as pldta
import DataAggregation as da
import CreateGraphs as cg
import CreateNodeEmbeddings as cne
import CreateFeatures as cf
import TrainTest as tt

# Upload Data

In [2]:
# Monthly time axis: one time step per (year, month) pair in the study window.
year_range = range(1995,2017)
month_range = range(1,13)

# year_dict[t] is the calendar year that time step t falls in.
year_dict = dict(enumerate(yr for yr in year_range for _ in month_range))
num_time_steps = len(year_dict)
print(num_time_steps)

264


## Event Data

In [3]:
# Build year_df_dict, a dict of DataFrames, for every year in year_range
# (presumably keyed by year -- confirm in PullData.GetDfDictByYear).
# The reload picks up edits made to PullData.py since the kernel imported it.
importlib.reload(pldta)
year_df_dict = pldta.GetDfDictByYear(year_range)

downloaded: 1995 (151310, 20)
date parsed
agents cleaned
columns dropped
before dropna (131008, 10)
after dropna (131008, 10)
(131008, 10)
249
downloaded: 1996 (235269, 20)
date parsed
agents cleaned
columns dropped
before dropna (201703, 10)
after dropna (201703, 10)
(201703, 10)
255
downloaded: 1997 (251425, 20)
date parsed
agents cleaned
columns dropped
before dropna (216843, 10)
after dropna (216843, 10)
(216843, 10)
259
downloaded: 1998 (368318, 20)
date parsed
agents cleaned
columns dropped
before dropna (310790, 10)
after dropna (310790, 10)
(310790, 10)
261
downloaded: 1999 (545102, 20)
date parsed
agents cleaned
columns dropped
before dropna (446734, 10)
after dropna (446734, 10)
(446734, 10)
271
downloaded: 2000 (654613, 20)
date parsed
agents cleaned
columns dropped
before dropna (520772, 10)
after dropna (520772, 10)
(520772, 10)
279
downloaded: 2001 (797924, 20)
date parsed
agents cleaned
columns dropped
before dropna (604423, 10)
after dropna (604423, 10)
(604423, 10)
291

## Labeled Data

In [4]:
# Load country metadata:
#   country_predict_list    - list of countries to predict (mutated in place by later cells)
#   country_region_dict     - country -> region name (indexed that way in the Features section)
#   region_country_set_dict - presumably region -> set of countries; confirm in PullData
importlib.reload(pldta)
country_predict_list,country_region_dict,region_country_set_dict = pldta.GetCountryData()

### Influence

In [5]:
# Load the "Influence" data for year_range; nmc_dict is later passed to
# cne.GenerateEmbeddings. Its schema is not visible here -- see PullData.GetNMC.
importlib.reload(pldta)
nmc_dict=pldta.GetNMC(year_range)

### Violence Classes

In [6]:
importlib.reload(pldta)
# by region
# region_violence_dict is iterated downstream as region -> year -> month -> level.
region_violence_dict = pldta.GetViolQuadDataRegion(year_df_dict,country_region_dict)

['Latin America & Caribbean', 'Sub-Saharan Africa', 'North America', 'East Asia & Pacific', 'South Asia', 'Europe & Central Asia', 'Middle East & North Africa']
1995 -115600.9
1996 -154798.2
1997 -154466.4
1998 -198164.3
1999 -314337.6
2000 -348016.69999999995
2001 -443193.0
2002 -415621.1
2003 -423400.89999999997
2004 -615199.8
2005 -593916.3
2006 -696772.3999999999
2007 -632468.7
2008 -655376.5000000001
2009 -596213.4
2010 -528130.5
2011 -553619.0
2012 -569978.1
2013 -583401.4000000001
2014 -748147.6
2015 -783763.5
2016 -622450.2000000001


In [7]:
importlib.reload(pldta)
# by country
# country_violence_dict feeds pldta.GetClassVec to derive the country class labels.
country_violence_dict = pldta.GetViolQuadDataCountry(year_df_dict,country_predict_list)

1995 -115600.9
1996 -154798.2
1997 -154466.4
1998 -198164.3
1999 -314337.6
2000 -348016.69999999995
2001 -443193.0
2002 -415621.1
2003 -423400.89999999997
2004 -615199.8
2005 -593916.3
2006 -696772.3999999999
2007 -632468.7
2008 -655376.5000000001
2009 -596213.4
2010 -528130.5
2011 -553619.0
2012 -569978.1
2013 -583401.4000000001
2014 -748147.6
2015 -783763.5
2016 -622450.2000000001


# Plot one monthly violence series per region.
# region_violence_dict is nested as region -> year -> month -> level; the levels
# are flattened in iteration order onto a zero-initialised vector of length
# num_time_steps, so any trailing steps without data stay at 0.
for region, year_data in region_violence_dict.items():
    print(region)
    levels = [lev for month_data in year_data.values() for lev in month_data.values()]
    vec = np.zeros(num_time_steps)
    # Raises if a region carries more entries than there are time steps.
    vec[:len(levels)] = levels

    # Explicit figure/axes with a title and labels, so each plot stands alone.
    fig, ax = plt.subplots()
    ax.plot(vec)
    ax.set(title=region, xlabel="time step (month)", ylabel="violence level")
    plt.show()

In [8]:
# Derive per-country class labels over num_time_steps from the violence data.
# GetClassVec returns a pair; the second value (chaos_dict) is keyed by country,
# since a later cell pops countries from it.
importlib.reload(pldta)
country_class_labels_dict,chaos_dict = pldta.GetClassVec(country_violence_dict,num_time_steps)

In [9]:
# Same label derivation as for countries, applied to the region-level violence data.
importlib.reload(pldta)
region_class_labels_dict,region_dict = pldta.GetClassVec(region_violence_dict,num_time_steps)

In [10]:
# Inspect the second value returned by GetClassVec for the regions.
region_dict

{'Latin America & Caribbean': None,
 'Sub-Saharan Africa': None,
 'North America': None,
 'East Asia & Pacific': None,
 'South Asia': None,
 'Europe & Central Asia': None,
 'Middle East & North Africa': None}

# Majority-class baseline per country: from time step 49 onward, predict the
# class seen most often in the history so far and count how often that matches
# the actual label. Prints "<country>::<hits>::<trials>".
# The trailing spaces in 'first ' and 'third ' are part of the label values
# (presumably padded by GetClassVec -- confirm before normalising them).
cat_list = ['first ','second','third ','fourth']
for country, labels in country_class_labels_dict.items():
    hits = 0
    trials = 0
    for i in range(49, len(labels)):
        history = np.array(labels[:i])
        # Occurrences of each class in the history; argmax breaks ties by cat_list order.
        counts = np.array([(history == cat).sum() for cat in cat_list])
        if labels[i] == cat_list[counts.argmax()]:
            hits += 1
        trials += 1
    print(country + "::" + str(hits) + "::" + str(trials))
    
    
    
    
    
    

# Persistence baseline per region: the fraction of time steps, from index 36
# onward, whose label is unchanged at the next step.
# Prints the region, then "<fraction> <number of transitions>".
burn_in = 36  # same value as the window w set later in the notebook -- TODO confirm they are meant to match
for region, ts in region_class_labels_dict.items():
    print(region)
    num = max(len(ts) - 1 - burn_in, 0)
    c = sum(1 for idx in range(burn_in, len(ts) - 1) if ts[idx] == ts[idx + 1])
    # A series with no transitions past the burn-in reports nan instead of
    # raising ZeroDivisionError.
    print(c / num if num else float("nan"), num)

## Tabular Data

# Aggregate the tabular indicator data for the prediction countries over year_range.
# The result is later passed to cne.GenerateNodeAggregateEmbeddings; judging by
# its name it is nested by indicator/year/month -- confirm in DataAggregation.
importlib.reload(da)
indicator_year_month_dict = da.AggregateWGIData(year_range,country_predict_list)

## Graph Data

importlib.reload(cg)
import os

# Entries already present under CountryGraphData (presumably countries whose
# graphs were built by an earlier run). The Jupyter checkpoint folder is filtered
# out instead of removed, so the cell no longer raises ValueError when that
# folder does not exist.
country_done_list = sorted(
    entry for entry in os.listdir("CountryGraphData")
    if entry != '.ipynb_checkpoints'
)

# Drop finished countries from the work list. This mutates country_predict_list
# in place, so every later cell sees the shortened list.
for country in country_done_list:
    if country in country_predict_list:
        country_predict_list.remove(country)

In [11]:
# Build the event dictionaries at three levels (global, region, country) from
# the yearly DataFrames. The call logs one "year: Y month M" line per month.
global_event_dict,region_event_dict,country_event_dict = cg.GetAllEventDictFromDf(year_df_dict,
                                                                                  country_region_dict,
                                                                                 country_predict_list)

year: 1995 month 1
year: 1995 month 2
year: 1995 month 3
year: 1995 month 4
year: 1995 month 5
year: 1995 month 6
year: 1995 month 7
year: 1995 month 8
year: 1995 month 9
year: 1995 month 10
year: 1995 month 11
year: 1995 month 12
year: 1996 month 1
year: 1996 month 2
year: 1996 month 3
year: 1996 month 4
year: 1996 month 5
year: 1996 month 6
year: 1996 month 7
year: 1996 month 8
year: 1996 month 9
year: 1996 month 10
year: 1996 month 11
year: 1996 month 12
year: 1997 month 1
year: 1997 month 2
year: 1997 month 3
year: 1997 month 4
year: 1997 month 5
year: 1997 month 6
year: 1997 month 7
year: 1997 month 8
year: 1997 month 9
year: 1997 month 10
year: 1997 month 11
year: 1997 month 12
year: 1998 month 1
year: 1998 month 2
year: 1998 month 3
year: 1998 month 4
year: 1998 month 5
year: 1998 month 6
year: 1998 month 7
year: 1998 month 8
year: 1998 month 9
year: 1998 month 10
year: 1998 month 11
year: 1998 month 12
year: 1999 month 1
year: 1999 month 2
year: 1999 month 3
year: 1999 month 4


In [12]:
importlib.reload(cg)
# Indexing dicts: for each level of abstraction {global, region, country},
# collect the set of actors and assign each one an index.
# region_idx_dict maps region -> {agent: idx}, as the next cell's iteration shows.
global_idx_dict,region_idx_dict,country_idx_dict = cg.GetAllIdxDict(global_event_dict,region_event_dict,country_event_dict)

# Report how many indexed agents each region has.
for region in region_idx_dict:
    print(region, len(region_idx_dict[region]))

In [13]:
# Drop the name bound to the raw yearly DataFrames (presumably to free memory).
# After this, re-running any earlier cell that reads year_df_dict (the violence
# and event-dict cells) requires reloading the event data first.
del year_df_dict

## Graph Features

# Convert the event dicts to adjacency-matrix graph dicts.
importlib.reload(cg)

# NOTE(review): only the region-level dict is built here. Later cells reference
# adjmatgraph_global_dict and adjmatgraph_country_dict, which are not assigned
# anywhere else in the visible notebook; on a fresh kernel those cells raise
# NameError unless the two commented-out lines below are re-enabled.
#adjmatgraph_global_dict = cg.GetAdjMatGraphDict(global_event_dict,global_idx_dict,num_time_steps)
adjmatgraph_region_dict = cg.GetAdjMatGraphDict(region_event_dict,region_idx_dict,num_time_steps)
#adjmatgraph_country_dict = cg.GetAdjMatGraphDict(country_event_dict,country_idx_dict,num_time_steps)

## Embeddings

# Tabular, aggregate data: generate node embeddings from the aggregated
# indicator data. The return value is not captured, so any result must be
# produced as a side effect inside CreateNodeEmbeddings -- confirm there.
importlib.reload(cne)
cne.GenerateNodeAggregateEmbeddings(year_range,
                                    indicator_year_month_dict,
                                    country_predict_list)

# Collect every country that lacks a country-level graph in at least one scanned
# month, then drop those countries from the prediction list and from chaos_dict.
lse = set()
for yr, months in adjmatgraph_country_dict.items():
    for mo, graphs_by_country in months.items():
        # NOTE(review): this break only leaves the month loop for 2014, skipping
        # the remaining months of that year in iteration order. Later years are
        # still scanned in full -- confirm whether a hard cutoff at 2014-03 was
        # intended.
        if yr == 2014 and mo == 3:
            break
        lse.update(name for name in country_predict_list if name not in graphs_by_country)

for dropped in lse:
    country_predict_list.remove(dropped)
    chaos_dict.pop(dropped)

#country_predict_list.pop(country_predict_list.index('Burkina Faso'))
print(country_predict_list,lse)

In [14]:
# Get node embeddings for factions: reload CreateNodeEmbeddings first so the
# GenerateEmbeddings calls below use the latest code on disk.
importlib.reload(cne)

<module 'CreateNodeEmbeddings' from '/Volumes/Big Vol/ICEWS/Jupyter-ICEWS/CreateNodeEmbeddings.py'>

# Generate embeddings at each level of abstraction. The three calls differ only
# in the adjacency dict, the index dict and the level tag ("Global" / "Region" /
# "Country"). Return values are not captured, so results must be produced as a
# side effect inside CreateNodeEmbeddings -- confirm there.
# NOTE(review): adjmatgraph_global_dict and adjmatgraph_country_dict are not
# assigned anywhere in the visible notebook (their construction is commented
# out in the "Graph Features" cell); these calls fail with NameError on a fresh kernel.
cne.GenerateEmbeddings(adjmatgraph_global_dict,
                       global_idx_dict,
                       "Global",
                       country_predict_list,
                       nmc_dict,
                      num_time_steps,
                      year_dict)

cne.GenerateEmbeddings(adjmatgraph_region_dict,
                       region_idx_dict,
                       "Region",
                       country_predict_list,
                       nmc_dict,
                      num_time_steps,
                      year_dict)

importlib.reload(cne)
cne.GenerateEmbeddings(adjmatgraph_country_dict,
                       country_idx_dict,
                       "Country",
                       country_predict_list,
                       nmc_dict,
                       num_time_steps,
                      year_dict)

## Features


# Give each region a positional index, in region_idx_dict key order.
region_agent_idx_dict = dict(zip(region_idx_dict, range(len(region_idx_dict))))

#region_embed_ndarr = cf.GetEmbedNDArr("RegionGraphData",(158,7,1,6),region_agent_idx_dict)

# maps country to region to region idx
country_region_idx_dict = {}
for country in country_predict_list:
    home_region = country_region_dict[country]
    country_region_idx_dict[country] = region_agent_idx_dict[home_region]

In [15]:
# Reload the feature, embedding and train/test modules so the cells below use
# the latest code on disk.
importlib.reload(cf)
importlib.reload(cne)
importlib.reload(tt)

<module 'TrainTest' from '/Volumes/Big Vol/ICEWS/Jupyter-ICEWS/TrainTest.py'>

### Set-up Country-Level Predictions

In [16]:
import os

# Run parameters: w is passed to cf.GenerateFeatures, and T, w, offset, aug_T
# are passed to tt.TestClassLabels further down.
w, T, aug_T, offset = 36, 48, 12, 0

#country_predict_list = [country for country in os.listdir("CountryGraphData")]
#country_predict_list.remove('.ipynb_checkpoints')
#country_predict_list.sort()

# active_countries is an alias of country_predict_list, not a copy.
active_countries = country_predict_list
active_regions = [region for region in region_idx_dict]

# Positional index for each agent at every level.
country_agent_idx_dict = dict(zip(active_countries, range(len(active_countries))))
region_agent_idx_dict = dict(zip(active_regions, range(len(active_regions))))
global_agent_idx_dict = {'global':0}

# Every country belongs to the single "global" agent.
country_global_dict = dict.fromkeys(country_predict_list, "global")

In [30]:
# Global node embeddings for every active country, requested with shape
# (time steps, countries, 1). "GlobalNodeData" is presumably the directory the
# embeddings are read from -- confirm in CreateFeatures.GetNodeEmbedNDArr.
global_node_shape = (num_time_steps, len(active_countries), 1)
global_node_embed_ndarr = cf.GetNodeEmbedNDArr(
    "GlobalNodeData",
    global_node_shape,
    global_agent_idx_dict,
    global_idx_dict,
    country_agent_idx_dict,
    country_global_dict,
    country_predict_list,
)
global_node_embed_ndarr.shape

(264, 162, 1)

In [31]:
# Global graph embeddings for every active country, requested with shape
# (time steps, countries, 1). "GlobalGraphData" is presumably the directory the
# embeddings are read from -- confirm in CreateFeatures.GetGraphEmbedNDArr.
global_graph_shape = (num_time_steps, len(active_countries), 1)
global_graph_embed_ndarr = cf.GetGraphEmbedNDArr(
    "GlobalGraphData",
    global_graph_shape,
    global_agent_idx_dict,
    global_idx_dict,
    country_agent_idx_dict,
    country_global_dict,
    country_predict_list,
)
global_graph_embed_ndarr.shape

(264, 162, 1)

In [32]:
# Region node embeddings for every active country, requested with shape
# (time steps, countries, 1); country_region_dict links each country to its region.
region_node_shape = (num_time_steps, len(active_countries), 1)
region_node_embed_ndarr = cf.GetNodeEmbedNDArr(
    "RegionNodeData",
    region_node_shape,
    region_agent_idx_dict,
    region_idx_dict,
    country_agent_idx_dict,
    country_region_dict,
    country_predict_list,
)
region_node_embed_ndarr.shape

(264, 162, 1)

In [33]:
# Region graph embeddings for every active country, requested with shape
# (time steps, countries, 1).
# NOTE(review): the "Set up Region-level Predictions" section further down
# rebinds region_graph_embed_ndarr to a region-indexed array; run order decides
# which one the feature-generation cell sees.
region_graph_shape = (num_time_steps, len(active_countries), 1)
region_graph_embed_ndarr = cf.GetGraphEmbedNDArr(
    "RegionGraphData",
    region_graph_shape,
    region_agent_idx_dict,
    region_idx_dict,
    country_agent_idx_dict,
    country_region_dict,
    country_predict_list,
)
region_graph_embed_ndarr.shape

(264, 162, 1)

In [17]:
# Country graph embeddings, requested with shape (time steps, countries, 13).
country_graph_shape = (num_time_steps, len(active_countries), 13)
country_graph_embed_ndarr = cf.GetCountryGraphEmbedNDArr(
    "CountryGraphData",
    country_graph_shape,
    country_agent_idx_dict,
    country_predict_list,
)
country_graph_embed_ndarr.shape

(264, 162, 13)

### Set up Region-level Predictions

# Region-level variant of the prediction set-up: the regions themselves become
# the agents being predicted.
import os
# Same run parameters as the country-level set-up.
w=36
T=48
aug_T=12
offset=0

#country_predict_list = [country for country in os.listdir("CountryGraphData")]
#country_predict_list.remove('.ipynb_checkpoints')
#country_predict_list.sort()

region_predict_list = [region for region in region_idx_dict.keys()]
active_regions = list(region_idx_dict.keys())
region_agent_idx_dict = {region:idx for idx,region in enumerate(active_regions)}
global_agent_idx_dict = {'global':0}
# Each region maps to itself, mirroring the country -> region lookup used at country level.
region_region_dict = {region:region for region in active_regions}
# NOTE(review): this rebinds region_idx_dict (until now region -> {agent: idx})
# to a one-entry-per-region placeholder. Statement order matters: the three
# assignments above must read the original dict first. Any cell that is re-run
# after this one and expects the original agent index (e.g. the adjacency /
# embedding cells) will see the placeholder instead.
region_idx_dict = {region:{region:0} for region in active_regions}
region_global_dict = {region:"global" for region in region_predict_list}

# Region graph embeddings indexed by region, with 13 channels per agent.
# The agent dimension follows the region list instead of the literal 7, so the
# requested shape stays correct if the set of regions changes.
# NOTE(review): this rebinds region_graph_embed_ndarr, which the country-level
# section built with a country-indexed shape. The feature-generation cell pairs
# this name with country_agent_idx_dict, so run only one of the two set-ups per
# session -- confirm the intended workflow.
region_graph_embed_ndarr = cf.GetGraphEmbedNDArr(
    "RegionGraphData",
    (num_time_steps, len(region_predict_list), 13),
    region_agent_idx_dict,
    region_idx_dict,
    region_agent_idx_dict,
    region_region_dict,
    region_predict_list,
)
region_graph_embed_ndarr.shape

In [18]:
importlib.reload(cf)

# Feature sources, in the order they are handed to GenerateFeatures. Each entry
# is (agent index dict, embedding ndarray, level tag).
# The comma after the first tuple is required: without it Python parses
# `(...)(...)` as a call on the tuple and raises
# "TypeError: 'tuple' object is not callable".
dict_ndarr_triple_list = [
    (country_agent_idx_dict,country_graph_embed_ndarr,'country'),
    (country_agent_idx_dict,region_node_embed_ndarr,'region'),
    (country_agent_idx_dict,region_graph_embed_ndarr,'region'),
    (country_agent_idx_dict,global_node_embed_ndarr,'global'),
    (country_agent_idx_dict,global_graph_embed_ndarr,'global'),
]
agent_X_dict = cf.GenerateFeatures(dict_ndarr_triple_list,w,num_time_steps)

  lV = lV / lV.sum()


# Test

## Class Labels

In [108]:
# Reload TrainTest so the test cell below uses the latest code on disk.
importlib.reload(tt)

<module 'TrainTest' from '/Volumes/Big Vol/ICEWS/Jupyter-ICEWS/TrainTest.py'>

In [None]:
# Run the class-label test for the country-level features.
importlib.reload(tt)

# T is re-assigned here with the same value as in the set-up cells.
T=48
# test_type is a free-form tag for this run; it is printed and forwarded to TestClassLabels.
test_type="all_trim_Bag_100_global_region" #S_all_BagRandom" #_aug_best_S" #graphs accumulated from the gross, not scaled graphs
print(test_type)

# The return value is not captured -- results are presumably printed or written
# by TrainTest; confirm there.
tt.TestClassLabels(agent_X_dict,country_class_labels_dict,T,w,offset,aug_T,country_region_dict,test_type) # flat_d or inf_d

In [None]:
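# Feature-vector layout. Block widths: 13, 6, 13, 6, 13
# Country Graph S:  0:13
# Region Node S:    13:19
# Region Graph S:   19:31
# Global Node S:    31:37
# Global Graph S:   37:50
# NOTE(review): the widths above sum to 51, but the slices end at 50 and give
# Region Graph S only 12 columns (19:31) -- confirm which of the two is correct.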