In [None]:
# First: pip install wheel
# Next: download the NumPy and SciPy wheels from Gohlke's repo
# Then:
# pip install numpy_package.whl
# pip install scipy_package.whl
# https://towardsdatascience.com/6-tips-to-optimize-an-nlp-topic-model-for-interpretability-20742f3047e2

In [3]:
# hide
from pathlib import Path
import pickle

import altair as alt
import numpy as np
import pandas as pd

In [6]:
# hide
# Directory (relative to the working directory) that holds the pickled data files.
DATA_DIR = Path('data')


def load(filename):
    """Unpickle and return the object stored in ``DATA_DIR / filename``.

    Args:
        filename: name of the pickle file, resolved against ``DATA_DIR``.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        FileNotFoundError: if the file does not exist.

    Note:
        ``pickle.load`` can execute arbitrary code while deserialising, so
        only use this on files that come from a trusted source.
    """
    # The context manager closes the handle even if unpickling raises.
    with open(DATA_DIR / filename, "rb") as handle:
        return pickle.load(handle)
    
def save(data, filename):
    """Pickle ``data`` into ``DATA_DIR / filename`` using the highest protocol.

    Args:
        data: any picklable object.
        filename: name of the target file, resolved against ``DATA_DIR``.
    """
    target = DATA_DIR / filename
    with target.open('wb') as out:
        pickle.dump(data, out, protocol=pickle.HIGHEST_PROTOCOL)


# Fixed colour per party name, so every chart draws a party in the same colour.
party_colors = {
    "CDA": "#5cb957",
    "ChristenUnie": "#00a5e8",
    "D66": "#04a438",
    "GroenLinks": "#48a641",
    "PVV": "#002759",
    "PvdA": "#df111a",
    "PvdD": "#006b2d",
    "SGP": "#d86120",
    "SP": "#e3001b",
    "VVD": "#ff7f0e",
    "DENK": "#17becf",
    "FVD": "#800000",
    "Groep Krol/vKA": "pink",
}

In [8]:
# hide
# Load the pickled frame that every later cell reads, and report its row count.
df = load("df_including_topics_full.pickle")
print(df.shape[0])

29484


In [34]:
# hide
def get_largest_parties_chamber(chamber='Rutte III', top=False, data=None):
    """Rank parties by their mean vote count within one chamber period.

    Args:
        chamber: value of the 'Kamer' column whose rows are used.
        top: when truthy, keep only the first ``top`` parties; otherwise
            return every party that has a mean.
        data: frame to analyse. Defaults to the notebook-level ``df`` so
            existing calls keep working.

    Returns:
        A pandas Index of party names (the vote-column names with the
        'Aantal_stemmen_' prefix stripped), ordered by descending mean.
    """
    prefix = 'Aantal_stemmen_'
    frame = df if data is None else data
    in_chamber = frame[frame['Kamer'] == chamber]
    # The first matching column is skipped, as before; presumably it is a
    # non-party column -- TODO confirm against the data.
    vote_columns = [col for col in in_chamber.columns if 'Aantal_stemmen' in col][1:]
    ranked = in_chamber[vote_columns].mean().sort_values(ascending=False)
    # Columns whose mean is NaN for this chamber are dropped.
    ranked = ranked[ranked.notna()]
    # Strip the prefix by its actual length instead of a hard-coded 15.
    ranked.index = ranked.index.str[len(prefix):]
    if top:
        return ranked.iloc[:top].index
    return ranked.index
# The twelve highest-ranked parties for the default chamber, as a plain list.
parties = get_largest_parties_chamber(top=12).tolist()
parties

['VVD',
 'PVV',
 'CDA',
 'D66',
 'GroenLinks',
 'SP',
 'PvdA',
 'ChristenUnie',
 'PvdD',
 'DENK',
 'SGP',
 'FVD']

## Over welke onderwerpen worden de meeste moties ingediend?

In [16]:
# hide_input
# Count the motions per (year, topic) pair; `reset_index(name=...)` names the
# count column directly instead of renaming the default column `0` afterwards.
source = (
    df.groupby(['Jaar', 'Topic'])
    .size()
    .reset_index(name='Aantal moties')
)

# Overview of topic distribution over all years: one bar per year, with the
# topic counts normalised so that every bar adds up to 100 %.
alt.Chart(source).mark_bar().encode(
    x=alt.X('Aantal moties:Q', stack='normalize', axis=alt.Axis(format='%')),
    y='Jaar:O',
    color=alt.Color(
        'Topic:N',
        sort=alt.EncodingSortField('Aantal moties', order='descending'),
    ),
)

## Meest actieve partijen per onderwerp

In [36]:
# Overview of the topic 'owner' throughout the years: for one selected topic,
# count the motions per year and submitting party.
selected_topic = 'b'
source = (
    df.loc[df['Topic'] == selected_topic, ['Jaar', 'Indienende_partij', 'Titel']]
    .groupby(['Jaar', 'Indienende_partij'])
    .count()
    .reset_index()
    .rename(columns={'Titel': 'Aantal'})
)

# Normalised stacked bar per year, coloured with the fixed party palette.
alt.Chart(source).mark_bar().encode(
    x=alt.X(
        'Aantal:Q',
        stack='normalize',
        sort=alt.SortField(field="Aantal", order='ascending'),
        axis=alt.Axis(format='%'),
    ),
    y='Jaar:O',
    color=alt.Color(
        "Indienende_partij",
        scale=alt.Scale(
            domain=parties,
            range=[party_colors[name] for name in parties],
        ),
    ),
    order=alt.Order('Aantal:Q', sort='descending'),
).transform_filter(
    # Only year/party rows with more than 10 motions are drawn.
    alt.datum.Aantal > 10
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=1
).properties(width=300)

## Hoofdonderwerpen per partij tijdens Rutte III

In [38]:
# Topic mix per submitting party, restricted to the 'Rutte III' rows:
# count the motions per (party, topic) pair.
source = (
    df.loc[df['Kamer'] == 'Rutte III', ['Indienende_partij', 'Topic', 'Titel']]
    .groupby(['Indienende_partij', 'Topic'])
    .count()
    .reset_index()
    .rename(columns={'Titel': 'Aantal'})
)
print(source.head())

# Normalised stacked bar per party, one colour per topic.
alt.Chart(source).mark_bar().encode(
    x=alt.X(
        'Aantal:Q',
        stack='normalize',
        sort=alt.SortField(field="Aantal", order='ascending'),
        axis=alt.Axis(format='%'),
    ),
    y='Indienende_partij:N',
    color=alt.Color("Topic:N"),
    order=alt.Order('Aantal:Q', sort='descending'),
).transform_filter(
    # Only party/topic rows with more than 4 motions are drawn.
    alt.datum.Aantal > 4
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=1
).properties(width=300)

  Indienende_partij Topic  Aantal
0            50PLUS     a       1
1            50PLUS     b      29
2            50PLUS     c     129
3            50PLUS     d      18
4            50PLUS     e      16


## Onderzoek naar klimaat
Omdat ik denk dat klimaat en natuur op de lange termijn het belangrijkst zijn voor de mens, heb ik deze moties nog eens handmatig doorgenomen. Ik heb alle moties doorgelezen die over natuur en klimaat gaan en gelabeld of de motie 'voor' of 'tegen' het klimaat is. Op deze manier kunnen we kijken wat het stemgedrag van de partijen is.

In [None]:
!pip install pandas-alive
# https://medium.com/dunder-data/create-a-bar-chart-race-animation-in-python-with-matplotlib-477ed1590096
# https://towardsdatascience.com/creating-bar-chart-race-animation-with-python-cdb01144074e

In [None]:
import pandas_alive
import pandas as pd

# Long-run life expectancy per country; the CSV is fetched over the network.
data_raw = pd.read_csv(
    "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/Long%20run%20life%20expectancy%20-%20Gapminder%2C%20UN/Long%20run%20life%20expectancy%20-%20Gapminder%2C%20UN.csv"
)

list_G7 = [
    "Canada",
    "France",
    "Germany",
    "Italy",
    "Japan",
    "United Kingdom",
    "United States",
]

# Wide layout: one row per year, one column per country.
data_raw = data_raw.pivot(
    index="Year", columns="Entity", values="Life expectancy (Gapminder, UN)"
)

# Keep only the G7 columns, with the year as an ordinary column for now.
data = pd.DataFrame()
data["Year"] = data_raw.reset_index()["Year"]
for country in list_G7:
    data[country] = data_raw[country].values

# Carry the last known value forward (`ffill` replaces the deprecated
# `fillna(method="pad")`); values still missing at the start become 0.
data = data.ffill()
data = data.fillna(0)
data = data.set_index("Year").loc[1900:].reset_index()

# pandas_alive is given a datetime index here, so each year becomes a timestamp.
data["Year"] = pd.to_datetime(data["Year"].astype(str), format="%Y")

data = data.set_index("Year")

# The period label uses "%Y": the previous "%Y%M" appended the minute field
# (always "00" for this data) to the year.
animated_bar_chart = data.plot_animated(
    period_fmt="%Y", perpendicular_bar_func="mean", period_length=200, fixed_max=True
)

animated_line_chart = data.plot_animated(
    kind="line", period_fmt="%Y", period_length=200, fixed_max=True
)

# Combine both animations into one GIF written to the working directory.
pandas_alive.animate_multiple_plots(
    "life-expectancy.gif",
    plots=[animated_bar_chart, animated_line_chart],
    title="Life expectancy in G7 countries up to 2015",
    adjust_subplot_left=0.2, adjust_subplot_top=0.9
)

Generating BarChartRace, plotting ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom', 'United States']
Generating LineChart, plotting ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom', 'United States']
