In [None]:
# First: pip install wheel
# Next: download the NumPy and SciPy wheels from Gohlke's repository
# Then:
# pip install numpy_package.whl
# pip install scipy_package.whl
# https://towardsdatascience.com/6-tips-to-optimize-an-nlp-topic-model-for-interpretability-20742f3047e2

In [2]:
# hide
from pathlib import Path
import numpy as np
import pandas as pd
import pickle

In [3]:
# hide
DATA_DIR = Path('data')  # folder that holds every pickle read or written by this notebook


def load(filename):
    """Unpickle and return the object stored in ``DATA_DIR / filename``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    unpickling finishes or fails, instead of being left to the garbage
    collector.

    NOTE(security): ``pickle.load`` can execute arbitrary code. Only use this
    on files that were produced locally (e.g. with ``save``).
    """
    with open(DATA_DIR / filename, "rb") as handle:
        return pickle.load(handle)
    
def save(data, filename):
    """Pickle ``data`` into ``DATA_DIR / filename`` with the highest pickle protocol.

    An existing file with the same name is overwritten.
    """
    target = DATA_DIR / filename
    with open(target, 'wb') as out_file:
        pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)


# Colour (hex code or CSS colour name) used for each party in the charts below.
party_colors = dict([
    ('CDA', '#5cb957'),
    ('ChristenUnie', '#00a5e8'),
    ('D66', '#04a438'),
    ('GroenLinks', '#48a641'),
    ('PVV', '#002759'),
    ('PvdA', '#df111a'),
    ('PvdD', '#006b2d'),
    ('SGP', '#d86120'),
    ('SP', '#e3001b'),
    ('VVD', '#ff7f0e'),
    ('DENK', '#17becf'),
    ('FVD', '#800000'),
    ('Groep Krol/vKA', 'pink'),
])

In [4]:
# hide
# Load the pickled frame that every later cell reads as `df`, then report its row count.
df = load("df_including_topics_full.pickle")
print(df.shape[0])

29484


In [5]:
# hide
def get_largest_parties_chamber(chamber='Rutte III',top=False):
    """Rank parties by their mean ``Aantal_stemmen_<party>`` value within one chamber.

    Reads the notebook-level ``df``.

    Parameters
    ----------
    chamber : str
        Value of the ``Kamer`` column to select.
    top : int or False
        When truthy, only the first ``top`` parties are returned.

    Returns
    -------
    pandas.Index
        Party names, ordered from highest to lowest mean; parties whose mean
        is NaN for this chamber are left out.
    """
    in_chamber = df[df['Kamer'] == chamber]
    # The first matching column is skipped (presumably a non-party total — TODO confirm).
    vote_columns = [col for col in in_chamber.columns if 'Aantal_stemmen' in col][1:]
    mean_votes = in_chamber[vote_columns].mean().sort_values(ascending=False)
    mean_votes = mean_votes.dropna()
    # Strip the first 15 characters, i.e. the length of the 'Aantal_stemmen_' prefix.
    mean_votes.index = mean_votes.index.str[15:]
    if top:
        return mean_votes[:top].index
    return mean_votes.index


parties = list(get_largest_parties_chamber(top=12))
parties

['VVD',
 'PVV',
 'CDA',
 'D66',
 'GroenLinks',
 'SP',
 'PvdA',
 'ChristenUnie',
 'PvdD',
 'DENK',
 'SGP',
 'FVD']

In [46]:
# Hand-written party order. This reassigns `parties`, so every later cell uses
# this order (e.g. for colour-scale domains) rather than the computed ranking.
parties = [
    'VVD', 'CDA', 'ChristenUnie', 'D66',
    'SGP', 'FVD', 'PVV', 'PvdA',
    'DENK', 'GroenLinks', 'SP', 'PvdD',
]

## Over welke onderwerpen worden het meeste moties ingediend?

In [115]:
# hide_input
# df.groupby(['Jaar', 'Topic']).size().unstack(fill_value=0)
# Number of motions for every (year, topic) combination.
source = df.groupby(['Jaar', 'Topic']).size().reset_index(name='Aantal moties')

import altair as alt

# Overview of the topic distribution over all years: one bar per year,
# stacked by topic and normalised so each bar shows shares in percent.
alt.Chart(source).mark_bar().encode(
    x='Jaar:O',
    y=alt.Y(
        'Aantal moties:Q',
        stack='normalize',
        axis=alt.Axis(format='%'),
    ),
    color=alt.Color('Topic:N'),
    tooltip='Topic',
)

## Meest actieve partijen per onderwerp

In [338]:
# Overview of which party submits the motions on one topic throughout the years.
selected_topic = 'Onderwijs'
source = (
    df.loc[df['Topic'] == selected_topic, ['Jaar', 'Indienende_partij', 'Titel']]
    .groupby(['Jaar', 'Indienende_partij'])
    .count()
    .reset_index()
    .rename(columns={'Titel': 'Aantal'})
)

alt.Chart(source).mark_bar().encode(
    x='Jaar:O',
    y=alt.Y(
        'Aantal:Q',
        stack='normalize',
        sort=alt.SortField(field="Aantal", order='ascending'),
        axis=alt.Axis(format='%'),
    ),
    color=alt.Color(
        "Indienende_partij",
        scale=alt.Scale(domain=parties, range=[party_colors[p] for p in parties]),
    ),
    tooltip='Indienende_partij',
).transform_filter(
    # Only (year, party) rows with more than 5 motions are drawn.
    alt.datum.Aantal > 5
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=1
).properties(width=300)

In [300]:
import pprint
# Print the full text of every motion labelled 'Dierenwelzijn' that was submitted by D66.
pprint.pprint(
    df.loc[
        (df['Topic'] == 'Dierenwelzijn') & (df['Indienende_partij'] == 'D66'),
        'Text',
    ].tolist()
)

   MOTIE VAN HET LID SCHOUW \n'
 'Voorgesteld 4 maart 2014 \n'
 'De Kamer, \n'
 'gehoord de beraadslaging, \n'
 'constaterende dat na de ingestelde verhoging boetes bij overtreding van \n'
 'de Warenwet voor grote bedrijven slechts € 2.100,– zullen bedragen; \n'
 'overwegende dat hogere boetes een afschrikkende werking kunnen \n'
 'hebben en dat kosten voor inspectie meer bij overtreders gelegd kunnen \n'
 'worden; \n'
 'verzoekt de regering, een pilot te starten met hogere boetes bij \n'
 'overtreding van de Warenwet en de Kamer hierover te informeren, \n'
 'en gaat over tot de orde van de dag. \n'
 'Schouw\n'
 ' \n'
 ' \n'
 ' \n'
 ' \n'
 'kst-31532-129\n'
 'ISSN 0921 - 7371\n'
 '’s-Gravenhage 2014 Tweede Kamer, vergaderjaar 2013–2014, 31 532, nr. 129',
 '2\n'
 'Tweede Kamer der Staten-Generaal\n'
 'Vergaderjaar 2013–2014 \n'
 '31 839 Jeugdzorg \n'
 'Nr. 399   GEWIJZIGDE MOTIE VAN HET LID BERGKAMP C.S. TER \n'
 'VERVANGING VAN DIE GEDRUKT ONDER NR. 398 \n'
 'De Kamer, \n'
 'gehoord de

## Hoofdonderwerpen per partij tijdens Rutte III

In [112]:
# Number of motions per (party, topic) during Rutte III, restricted to the
# parties in `parties`. The frame is built here so the chart does not depend
# on whichever `source` an earlier cell happened to leave behind.
party_topic_counts = (
    df[(df['Kamer'] == 'Rutte III') & (df['Indienende_partij'].isin(parties))]
    .groupby(['Indienende_partij', 'Topic'])
    .size()
    .reset_index(name='Aantal')
)

# One bubble per (topic, party); bubble area scales with the number of motions.
alt.Chart(party_topic_counts).mark_circle().encode(
    alt.X('Topic:N'),
    alt.Y('Indienende_partij:N'),
    size=alt.Size('sum(Aantal)', scale=alt.Scale(range=[0,500])),
    color=alt.Color("Indienende_partij",
                    scale=alt.Scale(domain=parties, range=[party_colors[p] for p in parties])),
)

In [105]:
# Normalised version: each party's motions per topic as a fraction of that
# party's total number of motions during Rutte III.
rutte3_selected = df.loc[
    (df['Kamer'] == 'Rutte III') & (df['Indienende_partij'].isin(parties))
]
source = (
    rutte3_selected
    .groupby(['Indienende_partij', 'Topic'])
    .size()
    .reset_index(name='Aantal')
)
party_totals = source.groupby('Indienende_partij')['Aantal'].transform('sum')
source['Aantal'] = source['Aantal'].div(party_totals)


In [106]:
# Bubble chart of `source`: one circle per (topic, party), sized by the summed `Aantal`.
alt.Chart(source).mark_circle().encode(
    alt.X('Topic:N'),
    alt.Y('Indienende_partij:N'),
    size=alt.Size(
        'sum(Aantal)',
        scale=alt.Scale(range=[0, 500]),
    ),
    color=alt.Color(
        "Indienende_partij",
        scale=alt.Scale(
            domain=parties,
            range=[party_colors[name] for name in parties],
        ),
    ),
)

In [324]:
# Motions per (chamber, submitting party, topic): all motions ...
aantal_moties = (
    df.groupby(['Kamer','Indienende_partij', 'Topic']).count()['Titel']
    .reset_index()
    .rename(columns={'Titel':'aantal'})
)
# ... and only those with BesluitSoort == 1 (treated as "succesvol" here).
aantal_succesvol = (
    df[df['BesluitSoort']==1]
    .groupby(['Kamer','Indienende_partij','Topic']).count()['Titel']
    .reset_index()
    .rename(columns={'Titel':'aantal_succesvol'})
)

# 95th percentile of each party's `Aantal_stemmen_<party>` column per chamber,
# used below as the divisor (presumably a proxy for seats — TODO confirm).
vote_columns = ['Aantal_stemmen_'+p for p in get_largest_parties_chamber('Rutte III',12)]
max_votes_per_party = df[vote_columns + ['Kamer']].groupby('Kamer').quantile(.95)
max_votes_per_party = (
    pd.melt(max_votes_per_party, ignore_index=False)
    .reset_index()
    .rename(columns={'variable':'Indienende_partij', 'value':'num_votes'})
)
# Strip the 15-character 'Aantal_stemmen_' prefix so the column holds party names.
max_votes_per_party['Indienende_partij'] = max_votes_per_party['Indienende_partij'].str[15:]

# Both merges use the pandas default (inner join on shared columns), so rows
# without a match on either side are dropped.
stats = aantal_moties.merge(max_votes_per_party).merge(aantal_succesvol)

stats['aantal_zetel'] = stats['aantal'] / stats['num_votes']
stats['aantal_succesvol_zetel'] = stats['aantal_succesvol'] / stats['num_votes']
stats.head()

Unnamed: 0,Kamer,Indienende_partij,Topic,aantal,num_votes,aantal_succesvol,aantal_zetel,aantal_succesvol_zetel
0,Balkenende IV,CDA,Banken,5,41.0,5,0.121951,0.121951
1,Balkenende IV,CDA,Bedrijven,12,41.0,12,0.292683,0.292683
2,Balkenende IV,CDA,Belastingen,6,41.0,6,0.146341,0.146341
3,Balkenende IV,CDA,Buitenlands conflict,9,41.0,9,0.219512,0.219512
4,Balkenende IV,CDA,Buitenlandse zaken,11,41.0,9,0.268293,0.219512


In [325]:
# Rutte III only: bubble size shows `aantal_succesvol_zetel` summed per (topic, party).
source = stats.loc[stats['Kamer'] == 'Rutte III']
alt.Chart(source).mark_circle().encode(
    alt.X('Topic:N'),
    alt.Y('Indienende_partij:N'),
    size=alt.Size(
        'sum(aantal_succesvol_zetel)',
        scale=alt.Scale(range=[0, 500]),
    ),
    color=alt.Color(
        "Indienende_partij",
        scale=alt.Scale(
            domain=parties,
            range=[party_colors[name] for name in parties],
        ),
    ),
)

## Ontwikkeling per partij

In [333]:
# hide_input
# NOTE: these are raw counts per year; they are not normalised yet.
partij = 'PVV'
source = (
    df[df['Indienende_partij'] == partij]
    .groupby(['Jaar', 'Topic'])
    .size()
    .reset_index(name='Aantal moties')
)

# Topic distribution of this party over all years: one circle per (year, topic),
# sized by the number of motions.
import altair as alt
alt.Chart(source).mark_circle().encode(
    y=alt.Y('Topic:N'),
    x='Jaar:O',
    size=alt.Size(
        'Aantal moties:Q',
        scale=alt.Scale(range=[0, 500]),
    ),
    tooltip='Topic',
)

## Onderzoek naar klimaat
Omdat ik denk dat klimaat en natuur op de lange termijn het belangrijkst zijn voor de mens, heb ik deze moties nog eens handmatig doorgenomen. Ik heb alle moties die over natuur en klimaat gaan doorgelezen en gelabeld of de motie 'voor' of 'tegen' het klimaat is. Op deze manier kunnen we kijken wat het stemgedrag van de partijen is.

In [14]:
# Select the motions on one topic that were submitted by one party.
partij, topic = 'VVD', 'Klimaat'
source = df.loc[(df['Topic'] == topic) & (df['Indienende_partij'] == partij)]


In [18]:
# Full texts of the selected motions; show the first ten.
moties = source['Text'].tolist()
moties[:10]

['2\nTweede Kamer der Staten-Generaal\nVergaderjaar 2009–2010\n31 963 Wijziging van de Wet milieubeheer, de Wet\nluchtvaart en de Wet op de economische\ndelicten ten behoeve van de implementatie van\nrichtlijn nr. 2008/101/EG van het Europees\nParlement en de Raad van de Europese Unie van\n19 november 2008 (PbEU 2009, L 8) tot\nwijziging van Richtlijn 2003/87/EG teneinde ook\nluchtvaartactiviteiten op te nemen in de\nregeling voor de handel in\nbroeikasgasemissierechten binnen de\nGemeenschap (handel in emissierechten\nluchtvaart)\nNr. 9 MOTIE VAN HET LID NEPPÉRUS\nVoorgesteld 9 december 2009\nDe Kamer,\ngehoord de beraadslaging,\nconstaterende, dat emissiehandel (ETS) alleen wordt ingevoerd in de\nEuropese Unie;\noverwegende, dat dit een nadelig effect zal hebben op de Nederlandse\nluchthavens, met name voor de netwerkfunctie die Schiphol vervult;\noverwegende, dat de uitstoot van broeikasgassen onverminderd hoog zal\nblijven en dat er sprake zal zijn van carbon leakage;\nconstaterend

In [119]:
# Mean of `Stem_PVV` per submitting party for 'Islam' motions during Rutte III.
source = df[(df['Topic'] == 'Islam') & (df['Kamer'] == 'Rutte III')]
# Select the column *before* aggregating: averaging the whole frame first
# raises on non-numeric columns in pandas >= 2.0 and does needless work.
source.groupby('Indienende_partij')['Stem_PVV'].mean().sort_values()

Indienende_partij
PvdA            0.000000
DENK            0.083333
D66             0.250000
CDA             0.285714
GroenLinks      0.333333
ChristenUnie    0.500000
PvdD            0.500000
SP              0.500000
50PLUS          0.666667
VVD             0.684211
Van Haga        0.750000
FVD             0.909091
Krol            1.000000
PVV             1.000000
SGP             1.000000
Name: Stem_PVV, dtype: float64

In [144]:
def get_stem_column(largest, columns=None):
    """Return the ``Stem_<party>`` column names for the parties in ``largest``.

    Parameters
    ----------
    largest : collection of str
        Party names to keep.
    columns : iterable of str, optional
        Column names to search. When omitted, the columns of the
        notebook-level ``df`` are used.

    Returns
    -------
    list of str
        Matching column names, in the order they appear in ``columns``.
        ``'Stem_persoon'`` is always excluded.
    """
    if columns is None:
        columns = df.columns
    return [
        c for c in columns
        # c[5:] is the part after the 5-character 'Stem_' prefix.
        if c.startswith('Stem_') and c != 'Stem_persoon' and c[5:] in largest
    ]


def get_pca(df, n_components=1, num_largest=None, return_ratio=False):
    """Run a PCA over the ``Stem_<party>`` columns of ``df`` with one sample per party.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are the motions to analyse.
    n_components : int
        Number of principal components to keep.
    num_largest : unused
        Kept for backward compatibility; the notebook-level ``parties`` list
        determines which parties are included.
    return_ratio : bool
        When true, also return the explained-variance ratios.

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, numpy.ndarray)
        One row per party with the component scores (first component renamed
        to ``x``, second to ``y``) and a ``partij`` column, sorted by ``x``
        descending. With ``return_ratio`` the ratios are returned as well.

    Side effect: prints the explained-variance ratios and their sum.
    """
    largest = parties
    # Search the columns of the frame that was actually passed in.
    stem_column = get_stem_column(largest, df.columns)
    # Drop parties without any value, then transpose: rows = parties, columns = motions.
    source_year = df[stem_column].dropna(axis=1, how='all').T
    # Remaining gaps are filled with the most frequent value of that motion's column.
    X_year = SimpleImputer(strategy='most_frequent').fit_transform(source_year)
    pca = PCA(n_components=n_components).fit(X_year)
    print('explained variance by factors', pca.explained_variance_ratio_,pca.explained_variance_ratio_.sum())
    res_year = pca.transform(X_year)
    source = pd.DataFrame(res_year)
    # Row labels of `source_year` are the 'Stem_<party>' names; strip the prefix.
    source['partij'] = source_year.index.str[5:]
    source = source.rename(index=str, columns={0: "x", 1: "y"}).sort_values('x',ascending=False)
    return (source, pca.explained_variance_ratio_) if return_ratio else source

In [262]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
size = 800  # base edge length of the 2-D chart; each axis is scaled by its explained variance


def pca_topic(df, topic, kamer, twodim=False):
    """Build an Altair chart of the party PCA for one topic in one chamber.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the ``Topic`` and ``Kamer`` columns plus whatever
        ``get_pca`` needs.
    topic : str
        Value of ``Topic`` to select.
    kamer : str
        Value of ``Kamer`` to select.
    twodim : bool
        ``False`` (default): small bar chart of the first component per party.
        ``True``: scatter plot of the first two components with party labels.

    Returns
    -------
    altair chart
        A layered chart when ``twodim`` is true, otherwise a bar chart.
    """
    source = df[(df['Topic'] == topic) & (df['Kamer'] == kamer)]
    num_moties = len(source)
    source, explained_variance_ratio_ = get_pca(
        source, n_components=2 if twodim else 1, return_ratio=True
    )

    # Fix the orientation of the first component with VVD as the reference:
    # when VVD lies above the median, mirror every score around the midpoint
    # of the range. Nothing is mirrored when VVD is not part of the result.
    mid = (source['x'].max() + source['x'].min())/2
    median = source['x'].median()
    vvd_x = source.loc[source['partij'] == 'VVD', 'x']
    if not vvd_x.empty and vvd_x.iloc[0] > median:
        source['x'] += 2 * (mid - source['x'])

    party_scale = alt.Scale(domain=parties, range=[party_colors[p] for p in parties])

    if twodim:
        points = alt.Chart(
            source,
            width=size * explained_variance_ratio_[0],
            height=size * explained_variance_ratio_[1],
        ).mark_point().encode(
            x=alt.X('x:Q', axis=None),
            y=alt.Y('y:Q', axis=None),
            color=alt.Color("partij", scale=party_scale, legend=None),
            tooltip=['partij:N'],
        )

        # A fixed label offset keeps the chart identical between runs; the
        # offset applies to all labels at once, so randomising it added nothing.
        text = points.mark_text(
            align='left',
            baseline='middle',
            dx=5,
            dy=5,
        ).encode(
            text='partij:N'
        ).properties(
            title='Welke partijen lijken op elkaar'
        )

        return (points + text).configure_axis(
            grid=False).configure_view(
            strokeWidth=1)

    # One-dimensional case: bars ordered by score; the axis title carries the
    # topic, the number of motions and the explained variance in percent.
    return alt.Chart(source, width=200, height=50).mark_bar().encode(
        x=alt.X(
            'partij:N',
            sort=alt.SortField(field="x", order='descending'),
            title=f'{topic} #{num_moties} {round(explained_variance_ratio_[0]*100)}%',
        ),
        y=alt.Y('x:Q', axis=None),
        color=alt.Color("partij", scale=party_scale, legend=None),
    )



In [273]:
# Topics of the Rutte III motions, ordered from most to fewest motions.
rutte3_moties = df[df['Kamer'] == 'Rutte III']
topics = rutte3_moties.groupby('Topic').size().sort_values(ascending=False).index

In [270]:

# Run the 2-D PCA for every topic. The returned charts are discarded, so the
# only visible result is the printed topic name followed by the
# explained-variance line that get_pca prints.
for topic in topics:
    print(topic, end=' ')
    pca_topic(df, topic, 'Rutte III', twodim=True)

Werk explained variance by factors [0.47641636 0.21911353] 0.6955298917226885
Justitie explained variance by factors [0.35813327 0.23986011] 0.5979933827773828
Milieu explained variance by factors [0.49430625 0.17464729] 0.668953542558133
Dierenwelzijn explained variance by factors [0.54674332 0.12710069] 0.6738440064781093
Langdurige zorg explained variance by factors [0.55098669 0.10906146] 0.6600481454280163
Landbouw explained variance by factors [0.47449532 0.18884546] 0.6633407839020595
Ziekenhuizen explained variance by factors [0.53725429 0.14910264] 0.6863569360886003
Buitenlandse zaken explained variance by factors [0.34694015 0.24635504] 0.5932951915427274
Studenten explained variance by factors [0.40832259 0.18727276] 0.5955953459063944
Inlichtingendiensten explained variance by factors [0.32695102 0.21540861] 0.5423596287213484
Buitenlands conflict explained variance by factors [0.43775091 0.22199992] 0.6597508211674632
Jeugdzorg explained variance by factors [0.45539249 0.

In [271]:
# One 1-D PCA bar chart per topic, in the same order as `topics`.
charts = [
    pca_topic(df, topic_name, 'Rutte III')
    for topic_name in topics
]

explained variance by factors [0.47641636] 0.4764163611278579
explained variance by factors [0.35813327] 0.35813327321890737
explained variance by factors [0.49430625] 0.49430625323615546
explained variance by factors [0.54674332] 0.5467433184927001
explained variance by factors [0.55098669] 0.550986685389407
explained variance by factors [0.47449532] 0.4744953229986375
explained variance by factors [0.53725429] 0.5372542914513683
explained variance by factors [0.34694015] 0.3469401539543283
explained variance by factors [0.40832259] 0.4083225901487602
explained variance by factors [0.32695102] 0.32695102069313564
explained variance by factors [0.43775091] 0.4377509054131931
explained variance by factors [0.45539249] 0.4553924937436109
explained variance by factors [0.50234756] 0.50234755707879
explained variance by factors [0.44490758] 0.4449075797806835
explained variance by factors [0.38851322] 0.3885132212010862
explained variance by factors [0.33207388] 0.3320738756213485
explaine

In [282]:
import math
# Lay the per-topic charts out in a grid: every strip holds up to `n_rows`
# charts side by side, and the strips are stacked vertically.
n_rows = 8
column_charts = []
for start in range(0, len(charts), n_rows):
    # Slicing never runs past the end, so the last strip may be shorter.
    column_charts.append(alt.hconcat(*charts[start:start + n_rows]))
alt.vconcat(*column_charts).configure_axis(
        grid=False).configure_view(
        strokeWidth=0)