# Analyzing the European Parliament

In [None]:
import json

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import umap
import umap.plot

from tqdm.auto import tqdm

In [None]:
# Select seaborn's 'talk' plotting context for the figures in this notebook.
sns.set_context('talk')

## Download data

In [None]:
# %%bash

# wget --no-clobber https://parltrack.org/dumps/ep_votes.json.lz
# lzip -d ep_votes.json.lz

# wget --no-clobber https://parltrack.org/dumps/ep_meps.json.lz
# lzip -d ep_meps.json.lz

## Transform JSON to dataframes

### MEPs

In [None]:
fname = 'ep_meps.json'

# One flat record per MEP; turned into the `df_meps` DataFrame in a later cell.
tmp = []
# Explicit encoding so parsing does not depend on the platform default.
with open(fname, encoding='utf-8') as fd:
    # Iterate over the file object instead of fd.readlines() so the whole dump
    # is never held in memory at once. The progress bar then shows a running
    # count instead of a percentage.
    for line in tqdm(fd):
        # Each line is expected to carry one JSON object of an enclosing array
        # (assumption derived from the stripping below -- TODO confirm against
        # the dump format). Remove surrounding whitespace, including the
        # trailing newline, and the array punctuation in front of the object.
        line = line.strip().lstrip('[,]')
        if not line:
            # Nothing left to parse, e.g. a line holding only '[' or ']'.
            continue

        data = json.loads(line)

        # Inactive MEPs are kept on purpose; later cells filter on 'active'.

        # A missing or empty 'Groups' list yields NaN instead of an error.
        # assumption: last group is latest one. Is this true?
        groups = data.get('Groups') or [{'groupid': np.nan}]

        tmp.append(
            {
                'UserID': data['UserID'],
                'name': data['Name']['full'],
                'birthday': data['Birth']['date'] if 'Birth' in data else np.nan,
                'active': data['active'],
                'group': groups[-1]['groupid'],
            }
        )

In [None]:
# Build the MEP table from the parsed records and index it by the MEP's UserID.
df_meps = pd.DataFrame(tmp)
df_meps['birthday'] = pd.to_datetime(df_meps['birthday'])

df_meps = df_meps.set_index('UserID')

# Assign the result back to the column. Calling replace(..., inplace=True) on
# the selection df_meps['group'] is a chained in-place update, which does not
# modify df_meps once pandas Copy-on-Write is active.
# Assumption: both labels denote the same group -- is there a difference?
df_meps['group'] = df_meps['group'].replace(
    {'Group of the European United Left - Nordic Green Left': 'GUE/NGL'}
)

df_meps.head()

### Votes

In [None]:
fname = 'ep_votes.json'

# tmp: one metadata record (date, voteid, title) per vote.
# tmp_matrix: voteid -> {mepid: '+' | '-' | '0'} for votes that list ballots.
tmp = []
tmp_matrix = {}
# Explicit encoding so parsing does not depend on the platform default.
with open(fname, encoding='utf-8') as fd:
    # Iterate over the file object instead of fd.readlines() so the whole dump
    # is never held in memory at once. The progress bar then shows a running
    # count instead of a percentage.
    for line in tqdm(fd):
        # Remove surrounding whitespace (including the trailing newline) and
        # the array punctuation in front of the JSON object on this line.
        line = line.strip().lstrip('[,]')
        if not line:
            # Nothing left to parse, e.g. a line holding only '[' or ']'.
            continue

        data = json.loads(line)
        tmp.append(
            {'date': data['ts'], 'voteid': data['voteid'], 'title': data['title']}
        )

        if 'votes' in data:
            ballots = {}
            # Processed in this order so that, should an MEP be listed under
            # several choices, the later choice overwrites the earlier one.
            for choice in ('+', '-', '0'):
                # A choice nobody picked (or one without 'groups') adds nothing.
                by_group = data['votes'].get(choice, {}).get('groups', {})
                for mep_list in by_group.values():
                    for mep in mep_list:
                        # Entries without an MEP id cannot be attributed.
                        if 'mepid' in mep:
                            ballots[mep['mepid']] = choice
            tmp_matrix[data['voteid']] = ballots

In [None]:
# Vote matrix: one row per vote id, one column per MEP id; each cell holds the
# ballot symbol, or NaN where the MEP is not listed for that vote.
df_votematrix = pd.DataFrame.from_dict(tmp_matrix, orient='index').rename_axis(
    index='voteid', columns='mepid'
)

# Rows are left unsorted; only the columns are ordered, by their 'mepid' labels.
df_votematrix = df_votematrix.sort_values('mepid', axis=1)

df_votematrix.head()

In [None]:
# Vote metadata table (date, title) indexed by vote id, with parsed timestamps.
df_votes = (
    pd.DataFrame(tmp)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('voteid')
)

df_votes.tail()

## Exploration

### MEP party distribution

In [None]:
# Number of MEPs flagged as active (assumes 'active' is boolean, so True counts as 1).
df_meps['active'].sum()

In [None]:
# Count the active MEPs per political group.
is_active = df_meps['active']
group_counts = df_meps.loc[is_active, 'group'].value_counts()

# Wedge labels of the form "<group> (<number of MEPs>)".
labels = pd.Series(
    [f'{group} ({count})' for group, count in group_counts.items()],
    index=group_counts.index,
)

# wedgeprops width < 1 turns the pie into a ring.
ax = group_counts.plot.pie(
    figsize=(8, 6),
    labels=labels,
    wedgeprops={'width': 0.5},
)
ax.axis('equal')

### MEP age distribution

In [None]:
# Age in (fractional) years as of today. Divide by a fixed-length year of
# 365.2425 days -- the length numpy uses for np.timedelta64(1, 'Y') -- because
# recent pandas versions reject the ambiguous 'Y' unit in timedelta arithmetic.
df_meps['age'] = (pd.Timestamp.today() - df_meps['birthday']) / pd.Timedelta(
    days=365.2425
)

In [None]:
# Age histograms of the active MEPs, one panel per group, three panels per row.
active_meps = df_meps[df_meps['active']]
panel_layout = {'col': 'group', 'col_wrap': 3, 'height': 3, 'aspect': 4 / 3}

g = sns.displot(data=active_meps, x='age', **panel_layout)

g.set_xlabels('MEP age [years]')

## Voting patterns

In [None]:
# Preview the vote matrix (rows: vote ids, columns: MEP ids).
df_votematrix.head()

### Zoomed-out voting data overview

In [None]:
# Votes dated after 2020-07-21. df_votes also lists votes for which no ballots
# were recorded, so keep only the ids that actually have a row in the vote
# matrix; asking .loc for a missing label raises a KeyError.
recent_votes = df_votes[df_votes['date'] > '20200721'].index
recent_votes = recent_votes[recent_votes.isin(df_votematrix.index)]

# Likewise keep only the active MEPs that have a column in the vote matrix.
# NOTE(review): these stay pandas Index objects on purpose -- the chain below
# relies on the 'UserID' level name, which presumably comes from this Index.
active_ids = df_meps[df_meps['active']].index
active_ids = active_ids[active_ids.isin(df_votematrix.columns)]

# Heatmap input: one row per (group, MEP), one column per recent vote.
hm_sub = (
    df_votematrix.loc[recent_votes, active_ids]
    # Keep only MEPs with a recorded ballot in every selected vote.
    .dropna(axis=1)
    # Numeric encoding: for = 1, against = 0, abstention = -1.
    .replace({'+': 1, '-': 0, '0': -1})
    # Make the integer dtype explicit instead of relying on replace()'s
    # implicit downcasting of object columns.
    .astype('int64')
    # Transpose to MEPs-as-rows and attach each MEP's group.
    .T.merge(df_meps[['group']], how='inner', left_index=True, right_index=True)
    .set_index('group', append=True)
    .reorder_levels(['group', 'UserID'])
    .sort_index()
)

In [None]:
# Preview the per-MEP encoded votes, indexed by (group, UserID).
hm_sub.head()

In [None]:
def _most_common(values):
    """Return the value that occurs most often in *values*."""
    return values.value_counts().index[0]


# Collapse each group to its most frequent ballot per vote.
hm_sub_grpd = hm_sub.groupby('group').agg(_most_common)
hm_sub_grpd.head()

In [None]:
# Clustered heatmap of the groups' most frequent ballots; transposed so that
# votes are rows and groups are columns.
sns.clustermap(hm_sub_grpd.T)

### Who is the most active MEP?

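Here we equate "most active" with "has cast the most recorded votes", i.e. we simply count the votes in which a ballot was recorded for each MEP. This is most likely quite misleading. Note that this is a different notion from the `active` flag in the MEP data, which is only used here to restrict the ranking to MEPs flagged as active.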

In [None]:
# Boolean matrix (votes x active MEPs): True where a ballot was recorded.
# reindex() instead of plain column selection: an active MEP without any
# recorded ballot has no column in df_votematrix, which would raise a KeyError.
# With reindex such an MEP gets an all-missing column, i.e. zero votes.
df_hasvoted = df_votematrix.reindex(columns=df_meps[df_meps['active']].index).notna()

In [None]:
# Recorded ballots per MEP, highest first.
vote_counts = df_hasvoted.sum(axis=0).sort_values(ascending=False)

# Attach the MEP details and show the ten MEPs with the most recorded ballots.
vote_counts.to_frame('vote_count').merge(
    df_meps, how='left', left_index=True, right_index=True
).head(10)

### Cluster MEPs by votes

#### Prepare data

In [None]:
# TODO: how to handle NaN values
# Clustering input: the last 1000 rows of the vote matrix, restricted to the
# active MEPs and encoded numerically (for = 1, against = 0; abstention and
# "no ballot recorded" both become -1).
matrix = (
    # Cut down to the last 1000 votes first so the steps below do not have to
    # process the full matrix.
    df_votematrix.tail(1000)
    # reindex() instead of plain column selection: an active MEP without a
    # column would raise a KeyError; here they get an all-missing column.
    .reindex(columns=df_meps[df_meps['active']].index)
    .replace({'+': 1, '-': 0, '0': -1, np.nan: -1})
    # Make the integer dtype explicit instead of relying on replace()'s
    # implicit downcasting of object columns.
    .astype('int64')
)
matrix

#### Do clustering

Possible metrics:

- hamming
- jaccard
- dice
- russellrao
- kulsinski
- rogerstanimoto
- sokalmichener
- sokalsneath
- yule

In [None]:
# UMAP reducer comparing samples with the 'hamming' metric; all other
# parameters are left at their defaults.
reducer = umap.UMAP(metric='hamming')

In [None]:
# `matrix` has votes as rows and MEPs as columns; transpose it so that every
# MEP is one sample to embed.
embedding = reducer.fit_transform(matrix.T)
# Show the shape of the resulting embedding.
embedding.shape

In [None]:
# Wrap the embedding in a DataFrame: one row per MEP (same order as the
# columns of `matrix`), one 'UMAP_<i>' column per embedding dimension.
n_dims = embedding.shape[1]
umap_columns = [f'UMAP_{i}' for i in range(n_dims)]
df_umap = pd.DataFrame(embedding, index=matrix.columns, columns=umap_columns)

# Attach each MEP's group; the assignment aligns on the index.
df_umap['group'] = df_meps['group']

df_umap.head()

#### Static visualization

In [None]:
# Scatter plot of the first two embedding dimensions, coloured by group.
_, umap_ax = plt.subplots(figsize=(8, 6))

sns.scatterplot(data=df_umap, x='UMAP_0', y='UMAP_1', hue='group', ax=umap_ax)

# Place the legend to the right of the axes, without a title.
umap_ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), ncol=2, title=None)
umap_ax.set_title('MEPs according to vote patterns')

# umap.plot.points(reducer, labels=df_umap['group'], theme='fire', width=1000, height=1000)

#### Interactive visualization

In [None]:
def _format_birthday(value):
    """Return *value* as 'YYYY-MM-DD', or 'undef' when it is missing."""
    if pd.isnull(value):
        return 'undef'
    return value.strftime("%Y-%m-%d")


def _whole_years(value):
    """Return *value* truncated to an int, or -1 when it is missing."""
    if pd.isnull(value):
        return -1
    return int(value)


# MEP details in the same row order as the embedding, for the hover tooltips.
hover_data = df_meps.loc[df_umap.index].reset_index()

# Replace missing values by explicit placeholders ('undef' / -1).
hover_data['birthday'] = hover_data['birthday'].apply(_format_birthday)
hover_data['age'] = hover_data['age'].apply(
    _whole_years
)  # hover_data['age'].round().astype(pd.Int64Dtype())

hover_data.head()

In [None]:
# bokeh's output_notebook() is called directly here; the umap.plot variant is
# left commented out, see:
# https://github.com/lmcinnes/umap/issues/422
# umap.plot.output_notebook()

from bokeh.plotting import output_notebook

# Direct bokeh output to the notebook.
output_notebook()

In [None]:
# Interactive plot of the fitted reducer: points are labelled by group and
# show the MEP details from `hover_data` on hover.
interactive_options = {
    'labels': df_umap['group'].tolist(),
    'hover_data': hover_data,
    'point_size': 5,
    'theme': 'fire',
}
p = umap.plot.interactive(reducer, **interactive_options)
umap.plot.show(p)