In [1]:
import pandas as pd
import numpy as np
import json

from bz2file import open as bzopen

import pprint

# Data preparation

## Load the csv data files

In [2]:
print("loading resulting data sets from csv files")

# NOTE: boolean columns use the builtin `bool`. The `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24, so it either warns
# or raises AttributeError depending on the installed version.

# Declarant-to-company rows.
cdf = pd.read_csv('company_df.csv',
                  parse_dates=['created_datetime'],
                  dtype={'index': np.int32,
                         'year': np.int32,
                         'company_ua': bool
                        })

# Declarant-to-organization rows (same dtype overrides as above).
odf = pd.read_csv('organization_df.csv',
                  parse_dates=['created_datetime'],
                  dtype={'index': np.int32,
                         'year': np.int32,
                         'company_ua': bool
                        })

# Lookup tables; the name column is renamed so it does not clash with the
# raw `company_name` column of cdf when the tables are merged later.
cmap = pd.read_csv('company_map.csv',
                   dtype={'company_ua': bool, 'counts': np.int32})
cmap.rename(columns={'company_name': 'uni_company_name'}, inplace=True)

omap = pd.read_csv('organization_map.csv',
                   dtype={'company_ua': bool, 'counts': np.int32})
omap.rename(columns={'org_name': 'uni_org_name'}, inplace=True)

# Sanity counts: distinct company ids, distinct org ids, and distinct `id`
# values across both declaration frames.
print(len(set(cmap.company_id)))
print(len(set(omap.org_id)))
print(len(set(cdf.id) | set(odf.id)))

loading resulting data sets from csv files


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


46302
17526
113052


In [3]:
def shorten_name(full_name):
    """Abbreviate a full name to the first token plus initials.

    The name is split on whitespace. The first token is kept whole, the
    second is reduced to its first character followed by '. ', and, only
    when there are exactly three tokens, the third is reduced to its first
    character followed by '.'.

    Args:
        full_name: The name string to abbreviate.

    Returns:
        The abbreviated name, e.g. 'A B C' -> 'A B. C.'. A two-token name
        keeps the trailing space ('A B' -> 'A B. '). A name with fewer than
        two tokens is returned as its single token, or '' when empty,
        instead of raising IndexError.
    """
    namelist = full_name.split()

    # Nothing to abbreviate: guard against indexing past the end.
    if len(namelist) < 2:
        return namelist[0] if namelist else ''

    surname = namelist[0] + ' '
    name1 = namelist[1][:1] + '. '
    name2 = namelist[2][:1] + '.' if len(namelist) == 3 else ''

    return surname + name1 + name2


def extract_city(city_full, entity):
    """Pick one location component out of a '/'-separated location string.

    The string is split on '/'. With four parts they are read as
    city/rayon/oblast/country. With three parts the first part serves as
    both city and rayon. With two parts the first part serves as city,
    rayon and oblast. With any other number of parts the first part is
    used for every component.

    Args:
        city_full: The '/'-separated location string.
        entity: One of 'city', 'rayon', 'oblast' or 'country'.

    Returns:
        The requested component, or the literal string
        'wrong entity argument' when `entity` is not one of the four names.
    """
    parts = city_full.split('/')
    head = parts[0]
    n_parts = len(parts)

    if n_parts == 4:
        city, rayon, oblast, country = parts
    elif n_parts == 3:
        city, rayon, oblast, country = head, head, parts[1], parts[2]
    elif n_parts == 2:
        city, rayon, oblast, country = head, head, head, parts[1]
    else:
        city = rayon = oblast = country = head

    components = (('city', city),
                  ('rayon', rayon),
                  ('oblast', oblast),
                  ('country', country))
    for key, value in components:
        if entity == key:
            return value

    return 'wrong entity argument'

In [4]:
# Keep only rows whose org_id is not 0 (presumably 0 marks a missing
# organization — TODO confirm against the data preparation step).
odf = odf[odf.org_id != 0]

In [5]:
# Derive two name columns on both frames: the abbreviated declarant name
# and the first whitespace-separated token of the full name.
for frame in (cdf, odf):
    frame['decl_shortname'] = frame.full_name.apply(shorten_name)
    frame['surname'] = frame.full_name.apply(lambda name: name.split()[0])

In [6]:
# add descriptive field about declarant
cdf['decl_info'] = cdf.full_name + '\n' + cdf.position + '\n' + cdf.office
odf['decl_info'] = odf.full_name + '\n' + odf.position + '\n' + odf.office

cdf = cdf.merge(cmap[['company_id', "uni_company_name", "counts"]], how='left', on='company_id')
odf = odf.merge(omap[['org_id', "uni_org_name", "counts"]], how='left', on='org_id')

In [7]:
# Split the '/'-separated `city_type` string into one column per location
# component, on both frames.
for frame in (cdf, odf):
    for component in ('city', 'rayon', 'oblast', 'country'):
        frame[component] = frame.city_type.apply(
            lambda value, part=component: extract_city(value, part)
        )

## Build a Parliament mapper for the data

In [8]:
# load information about political party of the politician
party_dict = pd.read_csv('political_party_info.csv')
party_dict = party_dict[party_dict['Особа'].notna()]

party_dict.head()

Unnamed: 0,Партія,Особа,№ у списку,№ округу
0,Слуга народу,Пашковський Максим Ігорович,,11.0
1,Слуга народу,Драбовський Анатолій Григорович,,12.0
2,Самовисування,Юрчишин Петро Васильович,,13.0
3,Слуга народу,Борзова Ірина Наумівна,,14.0
4,Самовисування,Білозір Лариса Миколаївна,,15.0


In [9]:
# Number of distinct persons ('Особа') per raw party label ('Партія').
party_dict.groupby(['Партія'])[['Особа']].nunique()

Unnamed: 0_level_0,Особа
Партія,Unnamed: 1_level_1
ЄС,26
Єдиний центр,1
Біла Церква разом,1
ВО «Батьківщина»,27
ВО «Свобода»,1
Голос,21
Об'єднання «Самопоміч»,1
Опозиційна платформа — За життя,43
Опозиційний блок,6
Порядок,1


In [10]:
# Maps the raw party label from the source table to the label used in the
# graphs. Small parties are folded into 'Інші'; the key containing '\xa0'
# (non-breaking space) matches the label exactly as it appears in the
# source and maps it to a version with a regular space.
party_map = {'ЄС':'ЄС',
             'Єдиний центр': 'Інші',
             'Біла Церква разом': 'Інші',
             'ВО «Батьківщина»': 'ВО «Батьківщина»',
             'ВО «Свобода»': 'Інші',
             'Голос': 'Голос',
             "Об'єднання «Самопоміч»": 'Інші',
             'Опозиційна платформа\xa0— За життя': 'Опозиційна платформа — За життя',
             'Опозиційний блок':'Опозиційний блок',
             'Порядок': 'Інші',
             'Самовисування': 'Самовисування',
             'Слуга народу': 'Слуга народу',
             np.nan : 'Інші'
            }

def map_party(party):
    """Translate a raw party label into its graph label via `party_map`.

    Args:
        party: A raw party label, or a float NaN for a missing label.

    Returns:
        The mapped label.

    Raises:
        KeyError: If `party` is a label that `party_map` does not contain.
    """
    # NaN never compares equal to itself, so a dict lookup only succeeds
    # when the value is the very same object as the `np.nan` key. Route
    # every float NaN to that entry explicitly.
    if isinstance(party, float) and party != party:
        return party_map[np.nan]
    return party_map[party]

# Add the mapped party label and the first token of the person's name,
# then rename the person column so it can be joined on `full_name`.
party_dict['party'] = party_dict['Партія'].apply(map_party)
party_dict['surname'] = party_dict['Особа'].apply(lambda person: person.split()[0])
party_dict = party_dict.rename(columns={'Особа': 'full_name'})

# Distinct persons per mapped party label.
party_dict.groupby(['party'])[['full_name']].nunique()

Unnamed: 0_level_0,full_name
party,Unnamed: 1_level_1
ЄС,26
Інші,5
ВО «Батьківщина»,27
Голос,21
Опозиційна платформа — За життя,43
Опозиційний блок,6
Самовисування,47
Слуга народу,264


In [11]:
def map_parliament_party(parl_df, party_dict=party_dict):
    """Attach a `party` column to a declarations frame by exact name match.

    Prints how many distinct `full_name` values the two frames share, then
    left-joins the `party` column of `party_dict` onto `parl_df` using
    `full_name` as the key.

    Args:
        parl_df: Frame with a `full_name` column.
        party_dict: Frame with `full_name` and `party` columns. The default
            is the module-level `party_dict` as it exists when this
            function is defined.

    Returns:
        A new frame with every row of `parl_df` plus a `party` column;
        rows with no match get 'Не відомо'.
    """
    print('intersections with political parties')
    print(len(set(parl_df.full_name) & set(party_dict.full_name)))

    # Only exact full-name matches receive a party; everything else is
    # filled with the placeholder below.
    parl_df = parl_df.merge(party_dict[['full_name', 'party']], on='full_name', how='left')

    # Assign the filled column back explicitly. The previous
    # `parl_df.party.replace(..., inplace=True)` was a chained in-place
    # call, which is not guaranteed to write back to the frame.
    parl_df['party'] = parl_df['party'].fillna('Не відомо')
    return parl_df

## Data set overview

In [12]:
# Preview the first two rows of the company declarations frame.
cdf.head(2)

Unnamed: 0,id,user_declarant_id,created_datetime,index,document_type,year,full_name,office,position,position_type,...,paid,unique_guids_per_user,decl_shortname,surname,decl_info,uni_company_name,counts,rayon,oblast,country
0,nacp_4a9ca6c4-8792-45f0-a09d-3a5767f2415e,1106195.0,2019-03-25 17:02:04,5120403,Щорічна,2019,Рогович Олександра Степанівна,Коломийська ЦРЛ,"Начальник відділу кадрів ,депутат Раківчицької...",,...,Оплачувана,1.0,Рогович О. С.,Рогович,Рогович Олександра Степанівна\nНачальник відді...,"КЗ КРР ""КРЦ ПМСД""",1,Коломийський район,Івано-Франківська область,Україна
1,nacp_af1d2e52-44f1-4980-88d4-d2c2645a307b,969526.0,2019-03-27 15:39:58,1511346,Щорічна,2019,Дубинська Світлана Василівна,"Раківчицька сільськка рада, Будинок культури с...","Депутат сільської ради, директор Будинку культ...",,...,Оплачувана,1.0,Дубинська С. В.,Дубинська,Дубинська Світлана Василівна\nДепутат сільсько...,Відділ культури Коломийської районної державно...,2,Коломийський район,Івано-Франківська область,Україна


In [13]:
# List the columns of cdf, including the ones derived in the cells above.
cdf.columns

Index(['id', 'user_declarant_id', 'created_datetime', 'index', 'document_type',
       'year', 'full_name', 'office', 'position', 'position_type',
       'position_category', 'corruptionAffected', 'dnt_organization_group',
       'city', 'city_type', 'relation_type', 'person', 'persons', 'company_ua',
       'company_id', 'company_name', 'description', 'emitent_citizen', 'paid',
       'unique_guids_per_user', 'decl_shortname', 'surname', 'decl_info',
       'uni_company_name', 'counts', 'rayon', 'oblast', 'country'],
      dtype='object')

In [81]:
# Ten organization groups with the most distinct `id` values in odf.
odf.groupby('dnt_organization_group')[['id']].nunique().sort_values(['id'],ascending=False).head(10)

Unnamed: 0_level_0,id
dnt_organization_group,Unnamed: 1_level_1
"Кабмін, міністерства та підлеглі органи",41357
Без категорії,13207
Місцеві адміністрації та ради,9896
Прокуратура,2442
Суд,2322
"Інші державні служби, комісії, і т.п.",2000
Парламент,419
Пенсійний фонд,277
НАБУ,97
НБУ,78


In [82]:
# Ten organization groups with the most distinct `id` values in cdf.
cdf.groupby('dnt_organization_group')[['id']].nunique().sort_values(['id'],ascending=False).head(10)

Unnamed: 0_level_0,id
dnt_organization_group,Unnamed: 1_level_1
Місцеві адміністрації та ради,17814
Без категорії,15729
"Кабмін, міністерства та підлеглі органи",8482
Суд,2145
"Інші державні служби, комісії, і т.п.",1779
Прокуратура,677
Пенсійний фонд,673
Парламент,426
НБУ,240
Фонд державного майна,139


# Build Network Graphs

In [16]:
import networkx as nx
from networkx.algorithms import community 

import matplotlib.pyplot as plt
import seaborn as sns

from pyvis.network import Network

## Functions for graphs creation and viz

In [17]:
def draw_graph(networkx_graph,notebook=True,output_filename='graph.html',show_buttons=False,only_physics_buttons=False):
    """
    This function accepts a networkx graph object,
    converts it to a pyvis network object preserving its node and edge attributes,
    and both returns and saves a dynamic network visualization.

    The input graph is left unmodified.

    https://stackoverflow.com/questions/59598019/how-to-plot-large-networks-clearly

    Valid node attributes include:
        "size", "value", "title", "x", "y", "label", "color".

        (For more info: https://pyvis.readthedocs.io/en/latest/documentation.html#pyvis.network.Network.add_node)

    Valid edge attributes include:
        "arrowStrikethrough", "hidden", "physics", "title", "value", "width"

        (For more info: https://pyvis.readthedocs.io/en/latest/documentation.html#pyvis.network.Network.add_edge)


    Args:
        networkx_graph: The graph to convert and display
        notebook: Display in Jupyter?
        output_filename: Where to save the converted network
        show_buttons: Show buttons in saved version of network?
        only_physics_buttons: Show only buttons controlling physics of network?

    Returns:
        Whatever `pyvis_graph.show(output_filename)` returns.
    """

    # make a pyvis network
    pyvis_graph = Network(notebook=notebook)
    pyvis_graph.width = '1000px'

    # for each node and its attributes in the networkx graph
    for node, node_attrs in networkx_graph.nodes(data=True):
        pyvis_graph.add_node(node, **node_attrs)

    # for each edge and its attributes in the networkx graph
    for source, target, edge_attrs in networkx_graph.edges(data=True):
        # `edges(data=True)` yields the graph's own attribute dicts; work on a
        # copy so adding 'value' below does not leak into the caller's graph.
        edge_attrs = dict(edge_attrs)
        # if value/width not specified directly, and weight is specified, set 'value' to 'weight'
        if 'value' not in edge_attrs and 'width' not in edge_attrs and 'weight' in edge_attrs:
            edge_attrs['value'] = edge_attrs['weight']
        # add the edge
        pyvis_graph.add_edge(source, target, **edge_attrs)

    # turn buttons on
    if show_buttons:
        if only_physics_buttons:
            pyvis_graph.show_buttons(filter_=['physics'])
        else:
            pyvis_graph.show_buttons()

    # return and also save
    return pyvis_graph.show(output_filename)

In [18]:
def plot_degree_dist(G, kind='hist'):
    """Plot the distribution of node degrees of `G` with seaborn.

    Args:
        G: Graph whose node degrees are plotted.
        kind: Passed to `sns.displot`; one of 'hist', 'kde' or 'ecdf'.
    """
    node_degrees = []
    for node in G.nodes():
        node_degrees.append(G.degree(node))

    sns.displot(node_degrees, kind=kind, color='#ffaaa5')

    # Title and axis labels are set through the pyplot state machine.
    plt.title('Degree Distribution of the Nodes')
    plt.xlabel('Degree')
    plt.ylabel('Nodes')

In [19]:
def graph_info(g, vis=False):
    """Print a short summary and the density of graph `g`.

    Args:
        g: The networkx graph to describe.
        vis: When truthy, also plot the ECDF of the node degrees.
    """
    # `nx.info` was removed in NetworkX 3.0. Use it while it exists so the
    # output stays as before; otherwise print the graph itself, which is
    # the replacement the NetworkX migration guide recommends.
    if hasattr(nx, 'info'):
        print(nx.info(g))
    else:
        print(g)

    density = nx.density(g)
    print("\nNetwork density:", density)

    if vis:
        plot_degree_dist(g, kind='ecdf')

In [51]:
def _declarant_entity_graph(group_df, entity_map, entity_id_col, entity_name_col,
                            entity_title, entity_color):
    """Build an undirected graph linking declaration ids to entity ids.

    Edges are the (`id`, `entity_id_col`) pairs of `group_df`, excluding
    rows where either value is missing. Every node gets the attributes
    'title', 'label', 'party', 'dnt_organization_group' and 'color'.

    Args:
        group_df: Declarations frame with `id`, `decl_shortname`, `party`,
            `dnt_organization_group` and `entity_id_col` columns.
        entity_map: Lookup frame with `entity_id_col` and `entity_name_col`.
        entity_id_col: Name of the entity id column.
        entity_name_col: Name of the entity name column in `entity_map`.
        entity_title: Value of the 'title' attribute for entity nodes.
        entity_color: Value of the 'color' attribute for entity nodes.

    Returns:
        The resulting `nx.Graph`.
    """
    node_columns = ['id', 'label', 'party', 'dnt_organization_group', 'title', 'color']

    edges = group_df[['id', entity_id_col]].dropna()
    graph = nx.Graph([tuple(row) for row in edges.values])

    # Declarant nodes: one row per distinct (id, short name, party, group).
    declarant_cols = ['id', 'decl_shortname', 'party', 'dnt_organization_group']
    declarant_nodes = group_df[declarant_cols].drop_duplicates(subset=declarant_cols)
    declarant_nodes['title'] = 'declarant'
    declarant_nodes['color'] = '#a8e6cf'
    declarant_nodes.columns = node_columns

    # Entity nodes: only entities that occur in this group's rows.
    entity_cols = [entity_id_col, entity_name_col]
    entity_nodes = entity_map.merge(group_df[[entity_id_col]],
                                    on=entity_id_col,
                                    how='inner')[entity_cols].drop_duplicates(subset=entity_cols)
    entity_nodes['party'] = 'Не застосовується'
    entity_nodes['dnt_organization_group'] = 'Не застосовується'
    entity_nodes['title'] = entity_title
    entity_nodes['color'] = entity_color
    entity_nodes.columns = node_columns

    nodes = pd.concat([declarant_nodes, entity_nodes])
    node_ids = list(nodes['id'])

    # One id -> value mapping per attribute; when an id occurs in several
    # rows the last row wins.
    for attr in ('title', 'label', 'party', 'dnt_organization_group', 'color'):
        nx.set_node_attributes(graph, dict(zip(node_ids, list(nodes[attr]))), attr)

    return graph


def build_graphs_for_group(group_name, cdf=cdf, odf=odf):
    """Build the company, organization and combined graphs for one group.

    Selects the rows of `cdf` and `odf` whose `dnt_organization_group`
    equals `group_name` (or all rows when `group_name == 'all'`), adds the
    political party column, and builds one declarant-company graph, one
    declarant-organization graph and their composition. Progress and a
    summary of each graph are printed along the way.

    Entity names are looked up in the module-level `cmap` and `omap`.

    Args:
        group_name: A value of `dnt_organization_group`, or 'all'.
        cdf: Declarant-to-company frame. The default is the module-level
            `cdf` as it exists when this function is defined.
        odf: Declarant-to-organization frame; same remark as for `cdf`.

    Returns:
        Tuple `(g_comp_group, g_org_group, g_comp_org_group)`.
    """
    current_group = group_name

    print('current group: ', current_group)

    print('creating group datfames')

    if group_name == 'all':
        group_cdf = cdf
        group_odf = odf
    else:
        group_cdf = cdf.loc[(cdf.dnt_organization_group == current_group)]
        group_odf = odf.loc[(odf.dnt_organization_group == current_group)]

    # add political party info
    print('add political party info')
    group_cdf = map_parliament_party(group_cdf)
    group_odf = map_parliament_party(group_odf)

    print('shape of dataframes')
    print(group_cdf.shape, group_odf.shape)

    # companies Graph
    print('build companies gaph')
    g_comp_group = _declarant_entity_graph(group_cdf, cmap,
                                           'company_id', 'uni_company_name',
                                           'company', '#ff8b94')
    print('\n\nBuilt a graph of all ' + current_group + ' people and companies')
    graph_info(g_comp_group, vis=False)

    # organizations Graph
    g_org_group = _declarant_entity_graph(group_odf, omap,
                                          'org_id', 'uni_org_name',
                                          'organization', '#ffd3b6')
    print('\n\nBuilt a graph of all ' + current_group + ' people and organizations')
    graph_info(g_org_group, vis=False)

    # Orgs and companies Graph
    # NOTE(review): nx.compose merges nodes that have equal keys, so a
    # company_id equal to an org_id would collapse into a single node —
    # confirm the two id spaces are disjoint.
    g_comp_org_group = nx.compose(g_comp_group, g_org_group)
    print('\n\nBuilt a composed graph of all ' + current_group + ' people, companies, and organizations')
    graph_info(g_comp_org_group, vis=False)

    return g_comp_group, g_org_group, g_comp_org_group


In [52]:
def remove_low_deg_nodes(full_graph):
    """Return the subgraph of `full_graph` induced by nodes of degree > 1.

    Degrees are measured in `full_graph`, so nodes of the returned
    subgraph can still have a lower degree there. A summary of the result
    is printed.

    Args:
        full_graph: The networkx graph to filter.

    Returns:
        The subgraph returned by `full_graph.subgraph(...)`.
    """
    to_keep = []
    for node, degree in full_graph.degree():
        if degree > 1:
            to_keep.append(node)

    print('Removed nodes with degree 0 or 1 from the graph of people and comapnies')

    g_hideg = full_graph.subgraph(to_keep)
    graph_info(g_hideg)
    return g_hideg

In [53]:
# Create data for graph with companies > 1 owner
mask_muli_owners = (cdf['counts'] > 1) # mask over to df that filters companies that apper more than once
print(len(cdf[mask_muli_owners]))

33389


## General Graphs: people-company, people-organizations, people-company-organizations

In [54]:
# Build the three graphs over every row ('all' disables the group filter).
group = 'all'
all_g_comp, all_g_org, all_g_comp_org = build_graphs_for_group(group, cdf=cdf, odf=odf)

current group:  all
creating group datfames
add political party info
intersections with political parties
234
intersections with political parties
234
shape of dataframes
(71789, 34) (95311, 31)
build companies gaph


Built a graph of all all people and companies
Name: 
Type: Graph
Number of nodes: 94798
Number of edges: 71789
Average degree:   1.5146

Network density: 1.5976959375153783e-05


Built a graph of all all people and organizations
Name: 
Type: Graph
Number of nodes: 89902
Number of edges: 93361
Average degree:   2.0770

Network density: 2.310264020605408e-05


Built a composed graph of all all people, companies, and organizations
Name: 
Type: Graph
Number of nodes: 176855
Number of edges: 165150
Average degree:   1.8676

Network density: 1.0560302145648425e-05


In [55]:
# Keep only the nodes of the combined graph whose degree is above 1.
all_g_comp_org_hideg = remove_low_deg_nodes(all_g_comp_org)

Removed nodes with degree 0 or 1 from the graph of people and comapnies
Name: 
Type: Graph
Number of nodes: 41838
Number of edges: 50273
Average degree:   2.4032

Network density: 5.744250188436327e-05


In [56]:
# Export the four 'all' graphs as GEXF files; the gephi_graphs/ directory
# must already exist.
for graph, gexf_path in ((all_g_comp, "gephi_graphs/all_g_comp.gexf"),
                         (all_g_org, "gephi_graphs/all_g_org.gexf"),
                         (all_g_comp_org, "gephi_graphs/all_g_comp_org.gexf"),
                         (all_g_comp_org_hideg, "gephi_graphs/all_g_comp_org_hideg.gexf")):
    nx.write_gexf(graph, gexf_path)

## Graphs of filtered groups: people-company, people-organizations, people-company-organizations

## Prosecutors Graphs

In [57]:
# Build the three graphs for rows in the 'Прокуратура' group.
group = 'Прокуратура' 
pros_g_comp, pros_g_org, pros_g_comp_org = build_graphs_for_group(group, cdf=cdf, odf=odf)

current group:  Прокуратура
creating group datfames
add political party info
intersections with political parties
0
intersections with political parties
0
shape of dataframes
(895, 34) (2881, 31)
build companies gaph


Built a graph of all Прокуратура people and companies
Name: 
Type: Graph
Number of nodes: 1437
Number of edges: 895
Average degree:   1.2457

Network density: 0.0008674447500692986


Built a graph of all Прокуратура people and organizations
Name: 
Type: Graph
Number of nodes: 2751
Number of edges: 2651
Average degree:   1.9273

Network density: 0.0007008360596146856


Built a composed graph of all Прокуратура people, companies, and organizations
Name: 
Type: Graph
Number of nodes: 4003
Number of edges: 3546
Average degree:   1.7717

Network density: 0.00044269646340956427


In [58]:
# Keep only the nodes of the combined graph whose degree is above 1.
pros_g_comp_org_hideg = remove_low_deg_nodes(pros_g_comp_org)

Removed nodes with degree 0 or 1 from the graph of people and comapnies
Name: 
Type: Graph
Number of nodes: 587
Number of edges: 534
Average degree:   1.8194

Network density: 0.003104813623968696


In [59]:
# NOTE(review): this statement is identical to the one in the previous
# cell and recomputes the same result; one of the two cells can likely go.
pros_g_comp_org_hideg = remove_low_deg_nodes(pros_g_comp_org)

Removed nodes with degree 0 or 1 from the graph of people and comapnies
Name: 
Type: Graph
Number of nodes: 587
Number of edges: 534
Average degree:   1.8194

Network density: 0.003104813623968696


In [60]:
# Export this group's graphs as GEXF files. The file names are built from
# the current value of `group`, so this cell must run right after the
# cell that set it.
for graph, suffix in ((pros_g_comp, '_g_comp_.gexf'),
                      (pros_g_org, '_g_org.gexf'),
                      (pros_g_comp_org, '_g_comp_org.gexf'),
                      (pros_g_comp_org_hideg, '_g_comp_org_hideg.gexf')):
    nx.write_gexf(graph, 'gephi_graphs/' + group + suffix)

#draw_graph(all_g_comp, output_filename='G_comp_colored_' + group + '.html', notebook=False)

In [61]:
# Eigenvector centrality of every node in the combined prosecutors graph.
# NOTE(review): `centralities` is not used in the cells shown here —
# confirm it is consumed later or drop the cell.
centralities = nx.eigenvector_centrality_numpy(pros_g_comp_org)

## Parliament Graphs

In [62]:
# Build the three graphs for rows in the 'Парламент' group.
group = 'Парламент'
parl_g_comp, parl_g_org, parl_g_comp_org = build_graphs_for_group(group, cdf=cdf, odf=odf)

current group:  Парламент
creating group datfames
add political party info
intersections with political parties
222
intersections with political parties
214
shape of dataframes
(1987, 34) (759, 31)
build companies gaph


Built a graph of all Парламент people and companies
Name: 
Type: Graph
Number of nodes: 2337
Number of edges: 1987
Average degree:   1.7005

Network density: 0.0007279412195708115


Built a graph of all Парламент people and organizations
Name: 
Type: Graph
Number of nodes: 999
Number of edges: 759
Average degree:   1.5195

Network density: 0.0015225646488171538


Built a composed graph of all Парламент people, companies, and organizations
Name: 
Type: Graph
Number of nodes: 3142
Number of edges: 2746
Average degree:   1.7479

Network density: 0.0005564887787259974


In [63]:
# Keep only the nodes of the combined graph whose degree is above 1.
parl_g_comp_org_hideg = remove_low_deg_nodes(parl_g_comp_org)

Removed nodes with degree 0 or 1 from the graph of people and comapnies
Name: 
Type: Graph
Number of nodes: 460
Number of edges: 237
Average degree:   1.0304

Network density: 0.0022449559533958513


In [64]:
# Export this group's graphs as GEXF files. The file names are built from
# the current value of `group`, so this cell must run right after the
# cell that set it.
for graph, suffix in ((parl_g_comp, '_g_comp_.gexf'),
                      (parl_g_org, '_g_org.gexf'),
                      (parl_g_comp_org, '_g_comp_org.gexf'),
                      (parl_g_comp_org_hideg, '_g_comp_org_hideg.gexf')):
    nx.write_gexf(graph, 'gephi_graphs/' + group + suffix)

#draw_graph(all_g_comp, output_filename='G_comp_colored_' + group + '.html', notebook=False)

## Court Graphs

In [83]:
# Build the three graphs for rows in the 'Суд' group.
group = 'Суд' 
court_g_comp, court_g_org, court_g_comp_org = build_graphs_for_group(group, cdf=cdf, odf=odf)

current group:  Суд
creating group datfames
add political party info
intersections with political parties
0
intersections with political parties
2
shape of dataframes
(3092, 34) (2717, 31)
build companies gaph


Built a graph of all Суд people and companies
Name: 
Type: Graph
Number of nodes: 4600
Number of edges: 3092
Average degree:   1.3443

Network density: 0.00029231307373058413


Built a graph of all Суд people and organizations
Name: 
Type: Graph
Number of nodes: 2956
Number of edges: 2717
Average degree:   1.8383

Network density: 0.000622096444410863


Built a composed graph of all Суд people, companies, and organizations
Name: 
Type: Graph
Number of nodes: 7184
Number of edges: 5809
Average degree:   1.6172

Network density: 0.00022514338017225156


In [84]:
# Keep only the nodes of the combined graph whose degree is above 1.
court_g_comp_org_hideg = remove_low_deg_nodes(court_g_comp_org)

Removed nodes with degree 0 or 1 from the graph of people and comapnies
Name: 
Type: Graph
Number of nodes: 1294
Number of edges: 1089
Average degree:   1.6832

Network density: 0.001301742470154954


In [85]:
# Export this group's graphs as GEXF files. The file names are built from
# the current value of `group`, so this cell must run right after the
# cell that set it.
for graph, suffix in ((court_g_comp, '_g_comp_.gexf'),
                      (court_g_org, '_g_org.gexf'),
                      (court_g_comp_org, '_g_comp_org.gexf'),
                      (court_g_comp_org_hideg, '_g_comp_org_hideg.gexf')):
    nx.write_gexf(graph, 'gephi_graphs/' + group + suffix)

#draw_graph(all_g_comp, output_filename='G_comp_colored_' + group + '.html', notebook=False)

## KabMin Graphs

In [134]:
# Build the three graphs for rows in the
# 'Кабмін, міністерства та підлеглі органи' group.
group = 'Кабмін, міністерства та підлеглі органи' 
kabm_g_comp, kabm_g_org, kabm_g_comp_org = build_graphs_for_group(group, cdf=cdf, odf=odf)

current group:  Кабмін, міністерства та підлеглі органи
creating group datfames
add political party info
intersections with political parties
6
intersections with political parties
19
shape of dataframes
(10825, 34) (57446, 31)
build companies gaph


Built a graph of all Кабмін, міністерства та підлеглі органи people and companies
Name: 
Type: Graph
Number of nodes: 15816
Number of edges: 10825
Average degree:   1.3689

Network density: 8.65549775628709e-05


Built a graph of all Кабмін, міністерства та підлеглі органи people and organizations
Name: 
Type: Graph
Number of nodes: 45448
Number of edges: 55979
Average degree:   2.4634

Network density: 5.420447409126454e-05


Built a composed graph of all Кабмін, міністерства та підлеглі органи people, companies, and organizations
Name: 
Type: Graph
Number of nodes: 59702
Number of edges: 66804
Average degree:   2.2379

Network density: 3.748538513128238e-05


In [135]:
# Keep only the nodes of the combined graph whose degree is above 1.
kabm_g_comp_org_hideg = remove_low_deg_nodes(kabm_g_comp_org)

Removed nodes with degree 0 or 1 from the graph of people and comapnies
Name: 
Type: Graph
Number of nodes: 15146
Number of edges: 26771
Average degree:   3.5351

Network density: 0.000233414246377626


In [136]:
# Export this group's graphs as GEXF files. The file names are built from
# the current value of `group`, so this cell must run right after the
# cell that set it.
for graph, suffix in ((kabm_g_comp, '_g_comp_.gexf'),
                      (kabm_g_org, '_g_org.gexf'),
                      (kabm_g_comp_org, '_g_comp_org.gexf'),
                      (kabm_g_comp_org_hideg, '_g_comp_org_hideg.gexf')):
    nx.write_gexf(graph, 'gephi_graphs/' + group + suffix)

#draw_graph(all_g_comp, output_filename='G_comp_colored_' + group + '.html', notebook=False)

## Other Services Graphs

In [144]:
# Build the three graphs for rows in the
# 'Інші державні служби, комісії, і т.п.' group, then keep only the nodes
# of the combined graph whose degree is above 1.
group = 'Інші державні служби, комісії, і т.п.' 
other_g_comp, other_g_org, other_g_comp_org = build_graphs_for_group(group, cdf=cdf, odf=odf)
other_g_comp_org_hideg = remove_low_deg_nodes(other_g_comp_org)

current group:  Інші державні служби, комісії, і т.п.
creating group datfames
add political party info
intersections with political parties
1
intersections with political parties
0
shape of dataframes
(2286, 34) (2233, 31)
build companies gaph


Built a graph of all Інші державні служби, комісії, і т.п. people and companies
Name: 
Type: Graph
Number of nodes: 3429
Number of edges: 2286
Average degree:   1.3333

Network density: 0.00038895371450797355


Built a graph of all Інші державні служби, комісії, і т.п. people and organizations
Name: 
Type: Graph
Number of nodes: 2487
Number of edges: 2233
Average degree:   1.7957

Network density: 0.0007223402400446926


Built a composed graph of all Інші державні служби, комісії, і т.п. people, companies, and organizations
Name: 
Type: Graph
Number of nodes: 5735
Number of edges: 4519
Average degree:   1.5759

Network density: 0.0002748408140129283
Removed nodes with degree 0 or 1 from the graph of people and comapnies
Name: 
Type: Graph
Numbe

In [145]:
# Export this group's graphs as GEXF files. The file names are built from
# the current value of `group`, so this cell must run right after the
# cell that set it.
for graph, suffix in ((other_g_comp, '_g_comp_.gexf'),
                      (other_g_org, '_g_org.gexf'),
                      (other_g_comp_org, '_g_comp_org.gexf'),
                      (other_g_comp_org_hideg, '_g_comp_org_hideg.gexf')):
    nx.write_gexf(graph, 'gephi_graphs/' + group + suffix)

#draw_graph(all_g_comp, output_filename='G_comp_colored_' + group + '.html', notebook=False)

## PensFond Graphs

In [146]:
# Build the three graphs for rows in the 'Пенсійний фонд' group, then keep
# only the nodes of the combined graph whose degree is above 1.
group = 'Пенсійний фонд' 
pens_g_comp, pens_g_org, pens_g_comp_org = build_graphs_for_group(group, cdf=cdf, odf=odf)
pens_g_comp_org_hideg = remove_low_deg_nodes(pens_g_comp_org)

current group:  Пенсійний фонд
creating group datfames
add political party info
intersections with political parties
0
intersections with political parties
0
shape of dataframes
(840, 34) (301, 31)
build companies gaph


Built a graph of all Пенсійний фонд people and companies
Name: 
Type: Graph
Number of nodes: 1300
Number of edges: 840
Average degree:   1.2923

Network density: 0.0009948481080120804


Built a graph of all Пенсійний фонд people and organizations
Name: 
Type: Graph
Number of nodes: 419
Number of edges: 301
Average degree:   1.4368

Network density: 0.003437210948830092


Built a composed graph of all Пенсійний фонд people, companies, and organizations
Name: 
Type: Graph
Number of nodes: 1696
Number of edges: 1141
Average degree:   1.3455

Network density: 0.0007938164412534091
Removed nodes with degree 0 or 1 from the graph of people and comapnies
Name: 
Type: Graph
Number of nodes: 238
Number of edges: 92
Average degree:   0.7731

Network density: 0.003262064319398645

In [147]:
# Export this group's graphs as GEXF files. The file names are built from
# the current value of `group`, so this cell must run right after the
# cell that set it.
for graph, suffix in ((pens_g_comp, '_g_comp_.gexf'),
                      (pens_g_org, '_g_org.gexf'),
                      (pens_g_comp_org, '_g_comp_org.gexf'),
                      (pens_g_comp_org_hideg, '_g_comp_org_hideg.gexf')):
    nx.write_gexf(graph, 'gephi_graphs/' + group + suffix)

#draw_graph(all_g_comp, output_filename='G_comp_colored_' + group + '.html', notebook=False)

## Person-to-Person graphs: Prosecutors and Parliament

In [41]:
# TO USE IF WORKING WITH NO SUBSET BUT WITH ALL DATA

# final_cdf = cdf.loc[(cdf.dnt_organization_group == 'Парламент') |
#                     (cdf.dnt_organization_group == 'Прокуратура') |
#                     #(cdf.dnt_organization_group == 'Адміністрація / Секретаріат Президента') |
#                     #(cdf.dnt_organization_group == 'Кабмін, міністерства та підлеглі органи') |
#                     #(cdf.position.str.lower().str.contains('офіс президента')) |
#                     (cdf.office.str.lower().str.contains('президент україни'))][['id',
#                                                                            'dnt_organization_group',
#                                                                            'full_name',
#                                                                            'office',
#                                                                            'position',
#                                                                            'uni_company_name', 
#                                                                            'company_id']]

# final_odf = odf.loc[(odf.dnt_organization_group == 'Парламент') |
#                     (odf.dnt_organization_group == 'Прокуратура') |
#                     #(odf.dnt_organization_group == 'Адміністрація / Секретаріат Президента') |
#                     #(odf.dnt_organization_group == 'Кабмін, міністерства та підлеглі органи') |
#                     #(odf.position.str.lower().str.contains('офіс президента')) |
#                     (odf.office.str.lower().str.contains('президент україни'))][['id',
#                                                                            'dnt_organization_group',
#                                                                            'full_name',
#                                                                            'office',
#                                                                            'position',
#                                                                            'uni_org_name', 
#                                                                            'org_id']]

# # add the info about political party that was matched to the main dataframes
# final_cdf = final_cdf.merge(party_dict[['full_name', 'party']], on='full_name', how='left')
# final_odf = final_odf.merge(party_dict[['full_name', 'party']], on='full_name', how='left')

# final_cdf.party.replace(np.nan, 'Не відомо', inplace=True)
# final_odf.party.replace(np.nan, 'Не відомо', inplace=True)

# # create a subgraph from the overall graph with the nodes of g_c_all

# id_nodes_people = final_cdf.id + final_odf.id
# id_nodes_people = set(id_nodes_people)

# print(len(id_nodes_people))

# #g_final = g_comp_org_all.subgraph(id_nodes_people).copy()
# g_final = g_comp_org_all.subgraph(id_nodes_people).copy()
# print(nx.info(g_final))

In [124]:
def remove_high_deg_nodes(full_graph, max_degree=30):
    """Drop high-degree company and organization nodes from a graph.

    Nodes are classified by their 'title' node attribute:

    * 'declarant' nodes are always kept, whatever their degree;
    * 'company' and 'organization' nodes are kept only when their degree
      is at most ``max_degree``;
    * nodes with any other title (or without a 'title' attribute) are dropped.

    Parameters
    ----------
    full_graph : networkx graph
        Graph whose nodes carry a 'title' attribute.
    max_degree : int, optional
        Highest degree a company/organization node may have and still be
        kept. Defaults to 30, the value that used to be hard-coded.

    Returns
    -------
    The result of ``full_graph.subgraph(...)`` restricted to the kept nodes.
    A short summary is printed and ``graph_info`` is called on the result.
    """
    titles = nx.get_node_attributes(full_graph, 'title')

    # A set gives O(1) membership tests; the filters below run once per node,
    # so a list here would make the whole function quadratic in the node count.
    low_degree_nodes = {n for (n, deg) in full_graph.degree() if deg <= max_degree}

    to_keep_people = [n for (n, t) in titles.items() if t == 'declarant']  # all declarants
    to_keep_companies = [n for (n, t) in titles.items()
                         if t == 'company' and n in low_degree_nodes]
    to_keep_orgs = [n for (n, t) in titles.items()
                    if t == 'organization' and n in low_degree_nodes]

    to_keep_all = to_keep_people + to_keep_companies + to_keep_orgs

    print('people to keep: ', len(set(to_keep_people)))
    g_lowdeg = full_graph.subgraph(to_keep_all)
    # Message text is kept as-is so the default call prints exactly what it did before.
    print(f'Removed comp and org nods with degree > {max_degree} from the graph')

    graph_info(g_lowdeg)

    return g_lowdeg


def count_nodes(full_graph, ntype):
    """Return how many nodes of ``full_graph`` have 'title' equal to ``ntype``.

    Nodes without a 'title' attribute are not counted.
    """
    node_titles = nx.get_node_attributes(full_graph, 'title').values()
    return sum(1 for title in node_titles if title == ntype)


def count_group(full_graph, gr):
    """Return how many nodes of ``full_graph`` belong to organization group ``gr``.

    A node is counted when its 'dnt_organization_group' attribute equals
    ``gr``; nodes without that attribute are ignored.

    The graph passed as ``full_graph`` is the one inspected. Earlier the body
    read the notebook-level ``composed_groups_comp_org`` variable instead, so
    the argument was silently ignored.
    """
    groups = nx.get_node_attributes(full_graph, 'dnt_organization_group').values()
    return sum(1 for group in groups if group == gr)

In [148]:
#composed_groups_comp_org = nx.compose(pros_g_comp_org, parl_g_comp_org)
#composed_groups_comp_org = nx.compose(court_g_comp_org, parl_g_comp_org)

### Merge the per-group graphs into one graph.
# Start from the parliament graph and fold the remaining groups in, one
# nx.compose call per group, in this order.
composed_groups_comp_org = parl_g_comp_org
for group_graph in (
    pros_g_comp_org,
    court_g_comp_org,
    pens_g_comp_org,
    other_g_comp_org,
    # kabm_g_comp_org,  # cabinet / ministries group is currently left out
):
    composed_groups_comp_org = nx.compose(composed_groups_comp_org, group_graph)
###

In [149]:
# Sanity check on the composed graph before any pruning: print the value
# returned by count_declarants, then the summary produced by graph_info.
print('declarants ', count_declarants(composed_groups_comp_org))
graph_info(composed_groups_comp_org)

declarants  12205
Name: 
Type: Graph
Number of nodes: 20932
Number of edges: 17761
Average degree:   1.6970

Network density: 8.107681995138532e-05


In [150]:
# Prune company/organization nodes whose degree exceeds the threshold used by
# remove_high_deg_nodes (30); declarant nodes are kept regardless of degree.
# NOTE: this rebinds composed_groups_comp_org, so the unpruned graph is no
# longer available under this name once the cell has run.
composed_groups_comp_org = remove_high_deg_nodes(composed_groups_comp_org)
# The declarant count is expected to match the one printed before pruning.
print('declarants ', count_declarants(composed_groups_comp_org))

people to keep:  12205
Removed comp and org nods with degree > 30 from the graph
Name: 
Type: Graph
Number of nodes: 20887
Number of edges: 11816
Average degree:   1.1314

Network density: 5.417128499106382e-05
declarants  12205


In [151]:
# Print the number of nodes in each organization group, one count per line,
# in the order the groups are listed here.
for group_name in (
    'Парламент',
    'Прокуратура',
    'Суд',
    'Пенсійний фонд',
    'Інші державні служби, комісії, і т.п.',
    'Кабмін, міністерства та підлеглі органи',
):
    print(count_group(composed_groups_comp_org, group_name))


651
2934
4095
927
3598
0


In [152]:
# Restrict the composed graph to its declarant nodes.
parent_graph = composed_groups_comp_org

# (node, title) pairs for every node that carries a 'title' attribute.
titles = nx.get_node_attributes(parent_graph, 'title').items()
to_keep = [node for node, node_title in titles if node_title == 'declarant']

# Subgraph induced by the declarant nodes only, followed by its summary.
people_nodes_graph = parent_graph.subgraph(to_keep)
graph_info(people_nodes_graph)

Name: 
Type: Graph
Number of nodes: 12205
Number of edges: 0
Average degree:   0.0000

Network density: 0


In [153]:
# Build the declarant-only graph: two declarants are linked when they are at
# most 2 hops apart in parent_graph. Each edge stores that distance in its
# 'path_length' attribute.
final_graph = people_nodes_graph.copy()
declarant_nodes = set(people_nodes_graph.nodes())

# One cutoff-limited breadth-first search per declarant replaces the previous
# all-pairs loop, which ran has_path + shortest_path_length (two graph
# searches) for every ordered pair of declarants. The resulting edge set and
# 'path_length' values are the same.
for node1 in people_nodes_graph.nodes():
    nearby = nx.single_source_shortest_path_length(parent_graph, node1, cutoff=2)
    for node2, shortest_path_length in nearby.items():
        # `nearby` also contains node1 itself (distance 0) and non-declarant nodes.
        if node2 != node1 and node2 in declarant_nodes:
            final_graph.add_edge(node1, node2, path_length=shortest_path_length)

# NOTE(review): the message below predates the extra groups composed into the
# graph; the text is left unchanged so it matches the recorded cell output.
print("Built a sub-graph of parliament members and prossecutors")
graph_info(final_graph)

Built a sub-graph of parliament members and prossecutors
Name: 
Type: Graph
Number of nodes: 12205
Number of edges: 14847
Average degree:   2.4329

Network density: 0.0001993557293322006


In [154]:
# Collect the attribute rows of every declarant that is a node of final_graph,
# taking them from both the company (cdf) and organization (odf) data frames.
node_columns = ['id', 'decl_shortname', 'full_name',
                'office', 'position', 'position_type',
                'position_category', 'corruptionAffected',
                'dnt_organization_group', 'city_type',
                'city', 'rayon', 'oblast', 'country']
final_node_ids = list(final_graph.nodes())

c_final = cdf.loc[cdf.id.isin(final_node_ids), node_columns]
o_final = odf.loc[odf.id.isin(final_node_ids), node_columns]

# Sanity check: number of graph nodes vs. number of distinct ids found in the
# data frames; the two numbers should agree.
print(len(final_node_ids))
print(len(set(c_final['id']) | set(o_final['id'])))

# pd.concat replaces DataFrame.append, which was deprecated and then removed in
# pandas 2.0. A declarant present in both frames keeps its first (cdf) row.
nodes_df_final = pd.concat([c_final, o_final], ignore_index=True).drop_duplicates('id')
len(nodes_df_final)

12205
12205


12205

In [155]:
# Tag every row as a declarant, then assign the export column names by
# position ('dnt_organization_group' becomes 'organization_group').
nodes_df_final['label'] = 'declarant'
export_columns = ['id', 'decl_shortname', 'full_name',
                  'office', 'position', 'position_type',
                  'position_category', 'corruptionAffected',
                  'organization_group', 'city_type',
                  'city', 'rayon', 'oblast', 'country', 'label']
nodes_df_final.columns = export_columns

# Lower-case the free-text columns.
for text_column in ('office', 'position', 'position_type', 'position_category',
                    'city_type', 'city', 'rayon', 'oblast', 'country'):
    nodes_df_final[text_column] = nodes_df_final[text_column].str.lower()

# Copy every column except 'id' onto the graph as a node attribute of the
# same name, keyed by the declarant id.
node_ids = list(nodes_df_final['id'])
for attribute in export_columns[1:]:
    attribute_by_node = dict(zip(node_ids, list(nodes_df_final[attribute])))
    nx.set_node_attributes(final_graph, attribute_by_node, attribute)

In [156]:
# Save the final declarant graph, with its node attributes, to a GEXF file.
# NOTE(review): the file name appears to encode the groups composed into the
# graph and the degree threshold (30) — keep it in sync if either changes.
# The 'gephi_graphs/' directory is presumably expected to exist — TODO confirm.
nx.write_gexf(final_graph, 'gephi_graphs/Парламент_Прокуратура_Суд_Інше_ПенФонд_30_final_graph.gexf')
#nx.write_gexf(final_graph, 'gephi_graphs/Парламент_Суд_final_graph_hideg.gexf')