In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import networkx as nx

In [7]:
file = 'GLO3k1p119p_i.csv'
df = pd.read_csv(file, low_memory=False)

In [8]:
df

Unnamed: 0,vote_id,Nrleg,NazwiskoImie,Glos,Klub
0,3001001,1,Adamczyk Franciszek,Za,AWS
1,3001001,2,Adamska-Wedler Elżbieta,Za,AWS
2,3001001,3,Adamski Władysław,Za,SLD
3,3001001,4,Ajchler Romuald,Za,SLD
4,3001001,5,Anusz Andrzej,Za,AWS
...,...,...,...,...,...
5427086,3119055,456,Żak Piotr,Za,niez.
5427087,3119055,457,Żelazowski Andrzej,Za,SLD
5427088,3119055,458,Żelichowski Stanisław,Nieobecny,PSL
5427089,3119055,460,Żyliński Adam Jacek,Nieobecny,UW


## Simple data validation

In [27]:
assert np.all(df.dtypes.values == [np.dtype('int64'), np.dtype('int64'), np.dtype('O'), np.dtype('O'), np.dtype('O')])

In [17]:
assert df.isnull().sum().sum() == 0

In [18]:
assert df[df.duplicated()].size == 0

## Assign node ID 
### Handling deputies swaps

In [37]:
# helper functions

def create_deputy_df(df):
    df_deputies = df[['Nrleg', 'NazwiskoImie', 'Klub']]
    df_deputies = df_deputies.groupby(['Nrleg', 'NazwiskoImie'])['Klub'].agg(['unique']).reset_index().rename(columns={"unique": "Klub"})
    return df_deputies

def make_queues(df_deputies, vote_ids, deputy_ids_per_vote):
    removed_ids = []
    new_ids = []

    for i in range(1, len(vote_ids)):
        ids_0 = deputy_ids_per_vote[i-1]
        ids_1 = deputy_ids_per_vote[i]

        removed = set(ids_0).difference(ids_1)
        new = set(ids_1).difference(ids_0)

        if len(removed) > 0:
            for idd in removed:
                r = (idd, list(df_deputies[df_deputies['Nrleg'] == idd]['Klub'].values[0]), vote_ids[i])
                removed_ids.append(r)
        if len(new) > 0:
            for idd in new:
                n = (idd, list(df_deputies[df_deputies['Nrleg'] == idd]['Klub'].values[0]), vote_ids[i])
                new_ids.append(n)
    return removed_ids, new_ids

def find_pairs(removed_ids, new_ids):
    ids_pairs = []
    node_id_dict = {i:i for i in range(1, 461)}

    while len(removed_ids) > 0:
        r_id, r_c, r_v_id = removed_ids.pop(0)
        search = 1
        i = 0
        while search and i < len(new_ids):
            n_id, n_c, n_v_id = new_ids[i]
            if len(set(r_c).intersection(set(n_c))) > 0 and r_v_id <= n_v_id:
                ids_pairs.append((r_id, n_id))
                node_id_dict[n_id] = r_id
                new_ids.pop(i)
                search = 0
            i += 1
    return node_id_dict

In [38]:
def assign_node_ids(df):
    df_deputies = create_deputy_df(df)
    
    vote_ids = sorted(df['vote_id'].unique())
    deputy_ids_per_vote = [df[df['vote_id'] == vote_id]['Nrleg'].unique() for vote_id in vote_ids]
    
    removed_ids, new_ids = make_queues(df_deputies, vote_ids, deputy_ids_per_vote)
    node_id_dict = find_pairs(removed_ids, new_ids)
    
    node_id_dict_func = lambda x: node_id_dict[x]
    node_id_dict_func = np.vectorize(node_id_dict_func)
    
    df_deputies['node_id'] = node_id_dict_func(df_deputies['Nrleg'])
    
    df_node_id = pd.merge(df[['vote_id', 'Nrleg', 'NazwiskoImie', 'Glos']], df_deputies, on=['Nrleg', 'NazwiskoImie'], how='left')
    df_node_id['node_id'] = df_node_id['node_id'] - 1
    return df_node_id

In [41]:
df = assign_node_ids(df)

In [42]:
df

Unnamed: 0,vote_id,Nrleg,NazwiskoImie,Glos,Klub,node_id
0,3001001,1,Adamczyk Franciszek,Za,[AWS],1
1,3001001,2,Adamska-Wedler Elżbieta,Za,"[AWS, KPN-OP]",2
2,3001001,3,Adamski Władysław,Za,[SLD],3
3,3001001,4,Ajchler Romuald,Za,[SLD],4
4,3001001,5,Anusz Andrzej,Za,[AWS],5
...,...,...,...,...,...,...
5427086,3119055,456,Żak Piotr,Za,"[AWS, niez.]",456
5427087,3119055,457,Żelazowski Andrzej,Za,[SLD],457
5427088,3119055,458,Żelichowski Stanisław,Nieobecny,[PSL],458
5427089,3119055,460,Żyliński Adam Jacek,Nieobecny,[UW],460


## Calulating edge weights

"As the first step in our methodology we construct a graph where **each node represents one of the deputies** and **edges are drawn every time two deputies display the same voting behavior (i.e. both vote in favor, against or abstain from vote. No edges are drawn for absent deputies)**. We then normalize edges by the total number of votes in the reference period in order to obtain a weighted graph where weights are . Full weight is given to two deputies  if they participated in all sessions and voted exactly the same way in all of sessions. When a deputy quits the parliament, because of incompatibility, resignation etc., and his or her seat is taken by a new person, we consider the two deputies as being just one node (we check whether this transition leads to some votes in which none of the two deputies had their chairs without finding any discontinuity)."

In [85]:
# czym się różni 'Nie oddał głosu' od 'Wstrzymał się' ?!
df['Glos'].unique()

array(['Za', 'Nieobecny', 'Nie oddał głosu', 'Wstrzymał się', 'Przeciw'],
      dtype=object)

In [125]:
def egde_weights(df):
    """Returns matrix with edge values."""
    common_votes = np.zeros((460, 460))
    vote_ids = df['vote_id'].unique()
    
    for vote_id in tqdm(vote_ids):
        dep_yes = df[(df['vote_id'] == vote_id) & (df['Glos'] == 'Za')]['node_id'].values
        dep_no = df[(df['vote_id'] == vote_id) & (df['Glos'] == 'Przeciw')]['node_id'].values
        dep_abstain = df[(df['vote_id'] == vote_id) & ((df['Glos'] == 'Wstrzymał się') | (df['Glos'] == 'Nie oddał głosu'))]['node_id'].values
        # dep_absent = df[(df['vote_id'] == vote_id) & (df['Glos'] == 'Nieobecny')]['node_id'].values

        # votes in favor
        for i in range(len(dep_yes)):
            for j in range(i+1, len(dep_yes)):
                common_votes[dep_yes[i], dep_yes[j]] += 1
                # common_votes[dep_yes[j], dep_yes[i]] += 1

        # votes in against
        for i in range(len(dep_no)):
            for j in range(i+1, len(dep_no)):
                common_votes[dep_no[i], dep_no[j]] += 1
                # common_votes[dep_no[j], dep_no[i]] += 1

        # abstain from vote
        for i in range(len(dep_abstain)):
            for j in range(i+1, len(dep_abstain)):
                common_votes[dep_abstain[i], dep_abstain[j]] += 1
                # common_votes[dep_abstain[j], dep_abstain[i]] += 1

    return common_votes / len(vote_ids)

In [None]:
edge_matrix = egde_weights(df)

 14%|██████████▊                                                                | 1704/11801 [27:00<2:37:31,  1.07it/s]

In [126]:
np.allclose(edge_matrix, np.triu(edge_matrix)) # upper triangular matrix

## Create graph

In [None]:
G = nx.from_numpy_matrix(A, create_using=nx.Graph())
G.edges()

TO DO:
- save matrix
- add date to df
- add atributes to the graph
- deputy changing party? detection? calculate graph each time? take last party?
- split data by month
- calculate monthly matices