# Set up environment

In [None]:
import sys
# Show the installation prefix of the Python environment running this notebook.
print(sys.prefix)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import urllib, os,sys, pdfplumber, glob, requests, wordcloud, re, dateparser, scipy

# Set up working dir

In [None]:
# Project root: two directory levels above the current working directory.
base_dir = os.path.realpath('../..')
print(base_dir)
# Build the path with os.path.join instead of a hand-written '\Data' suffix:
# '\D' is an invalid escape sequence and a backslash is only a separator on
# Windows.
data_dir = os.path.join(base_dir, 'Data')

In [None]:
# Folder holding the downloaded committee data; joined with os.path.join so
# the separator matches the operating system.
in_dir = os.path.join(data_dir, 'TK_commissieVWS', 'auto_download_20230118')

## Choose lexicon setting

In [None]:
# Both lexicon variants are loaded eagerly. Each entry pairs a lexicon table
# with the speaking-turn table that was coded with it. Paths are built with
# os.path.join so they do not depend on Windows-style separators.
lexicon_dir = os.path.join(data_dir, 'lexicons')
lex_dict = {
    'original': {
        'lexicon': pd.read_csv(os.path.join(lexicon_dir, 'lexicon.csv'), index_col=0),
        'data': pd.read_csv(os.path.join(in_dir, 'speaking_turns_coded_labeled-original.csv'),
                            index_col=0),
    },
    'sens_noSEGV': {
        'lexicon': pd.read_csv(os.path.join(lexicon_dir, 'lexicon_no_SEGV.csv'), index_col=0),
        'data': pd.read_csv(os.path.join(in_dir, 'speaking_turns_coded_labeled-sens_noSEGV.csv'),
                            index_col=0),
    },
}

In [None]:
# Lexicon variant to analyse; used as a key into lex_dict and in the name of
# the results folder.
lex_name = 'original'

In [None]:
# Output folder for this lexicon variant, built with os.path.join so the
# separator matches the operating system.
results_dir = os.path.join(base_dir, 'Results', 'dataset_20230118', 'lexicon-%s' % lex_name)

## Load speaking turns data

In [None]:
# Speaking-turn table for the selected lexicon. This is the same DataFrame
# object that is stored in lex_dict (no copy is made), so columns added to
# vws_data later also appear in lex_dict[lex_name]['data'].
vws_data = lex_dict[lex_name]['data']

In [None]:
# Preview the first and last three rows. DataFrame.append was removed in
# pandas 2.0; pd.concat gives the same stacked result on old and new versions.
pd.concat([vws_data.head(3), vws_data.tail(3)])

In [None]:
# (number of rows, number of columns) of the speaking-turn table.
vws_data.shape

In [None]:
# Sum of the LEEF column (presumably a 0/1 flag, which would make this the
# number of lifestyle-coded turns — TODO confirm).
vws_data['LEEF'].sum()

In [None]:
# Text of every speaking turn whose LEEF value equals 1.
vws_data.loc[vws_data['LEEF']==1,'text']

In [None]:
# Text of the row whose index label is 2007 (label-based lookup, not position).
vws_data.loc[2007,'text']

In [None]:
# Distinct values of the 'cabinet' column, in order of first appearance.
cabinets = vws_data['cabinet'].unique().tolist()
print(cabinets)

In [None]:
# (number of rows, number of columns) of the speaking-turn table.
vws_data.shape

In [None]:
# Total number of space-separated tokens over all speaking turns: each text
# is split on single spaces and the per-row list lengths are added up.
sum(vws_data['text'].str.split(' ').str.len())

In [None]:
# Per-column mean of the two theme columns. np.mean(DataFrame) forwards
# axis=None to DataFrame.mean, which on pandas >= 2.0 pools both columns into
# one scalar; calling .mean() directly always returns one value per column.
vws_data[['LEEF', 'SDOH']].mean()

In [None]:
# np.mean(vws_data[['LEEF','SDOH']])

## Assign class

In [None]:
# Label every speaking turn with exactly one class:
#   'both'  - LEEF and SDOH are both 1
#   'LEEF'  - only LEEF is 1
#   'SDOH'  - only SDOH is 1
#   'other' - neither is 1
is_leef = vws_data['LEEF'].eq(1)
is_sdoh = vws_data['SDOH'].eq(1)

vws_data['class'] = 'other'
vws_data.loc[is_leef & ~is_sdoh, 'class'] = 'LEEF'
vws_data.loc[is_sdoh & ~is_leef, 'class'] = 'SDOH'
vws_data.loc[is_leef & is_sdoh, 'class'] = 'both'

# Question 1. Testing the influence of political leaning

##### Plot LR by speaking turn category

In [None]:
from scipy.stats import mannwhitneyu as mwu

In [None]:
# Classes that are analysed (the 'both' class is left out).
cats = ['LEEF', 'SDOH', 'other']

# Pairwise tests to run: (first class, second class, alternative hypothesis
# passed to the Mann-Whitney U test).
comparisons = pd.DataFrame.from_records(
    [
        ('LEEF', 'other', 'greater'),
        ('SDOH', 'other', 'less'),
        ('LEEF', 'SDOH', 'greater'),
    ],
    columns=['catA', 'catB', 'direction'],
)
comparisons

In [None]:
# Outcome column to analyse, plus the labels for the low and high end of its
# axis. x_poles is only (re)assigned when out_var is one of the known columns.
out_var = 'LR'
_pole_labels = {
    'LR': ['Left', 'Right'],
    'TANGAL': ['Conservative', 'Progressive'],
}
if out_var in _pole_labels:
    x_poles = _pole_labels[out_var]

In [None]:
# Descriptives: mean and standard error of `out_var` within each class.
for cat in cats:
    dat = vws_data.loc[vws_data['class']==cat,out_var]
    # Label the output with the variable actually analysed rather than a
    # hard-coded "LR", so the printout stays correct when out_var changes.
    print('%s: mean %s = %.3f, sem = %.3f'%(cat, out_var, dat.mean(), dat.sem()))
print('')
# Pairwise Mann-Whitney U tests; the alternative hypothesis for each pair
# comes from the 'direction' column of `comparisons`.
for ri,row in comparisons.iterrows():
    cat1 = row['catA']
    cat2 = row['catB']
    print(cat1, ' vs ', cat2)
    dat1 = vws_data.loc[vws_data['class']==cat1,out_var]
    dat2 = vws_data.loc[vws_data['class']==cat2,out_var]
    print('delta = %.3f'%(dat1.mean()-dat2.mean()))
    print(mwu(dat1,dat2, alternative = row['direction']))
    print('')

In [None]:
# Strip plot of every speaking turn's `out_var` score, one row per class,
# overlaid with the class mean (black dot) and an error bar.
fig, ax = plt.subplots(1,1,figsize=[12,4])
cats = ['LEEF','SDOH','other']
sns.stripplot(data = vws_data.loc[vws_data['class'].isin(cats),:],
              y = 'class', x = out_var, alpha = .05, jitter = True, order = cats)
for ci,cat in enumerate(cats):
    mean_lr = vws_data.loc[vws_data['class']==cat, out_var].mean()
    ax.scatter([mean_lr],[ci],40,color = 'k', zorder = 10)
# alpha = 0 makes the bars themselves transparent; presumably only the error
# bars with caps are meant to remain visible.
sns.barplot(data = vws_data, y = 'class', x = out_var, order = cats,
             capsize = .1, alpha = 0, zorder = 10)
# Dashed reference line at the mean of the 'other' class. Use `out_var` here
# (not a hard-coded 'LR') so the line matches the plotted variable.
vline_x = vws_data.loc[vws_data['class']=='other',out_var].mean()
plt.plot([vline_x,vline_x],[-.2,2.2],'k--', lw = 1)
ax.set(ylabel = 'Category', yticklabels = ['Lifestyle','SDOH','Other'],
      title = 'Political orientation of MPs per contribution category',
      xlabel = '<-- %s            Political orientation          %s -->'%(x_poles[0],x_poles[1]),
      xlim = [-2.2,2.2]);
plt.tight_layout()
# fig.savefig(results_dir + '//Theme_LR_orientation.pdf', bbox_inches = 'tight', transparent = True)

##### Sensitivity analysis: big parties only

In [None]:
# Sensitivity subset: keep only the speaking turns of the listed parties.
major_parties = ['pvda','pvv','cda','vvd','d66','gl','sp']
is_major = vws_data['party'].isin(major_parties)
big_parties_only = vws_data[is_major].copy()
print(big_parties_only.shape)

In [None]:
# Descriptives for the big-parties subset: mean and standard error of
# `out_var` within each class.
for cat in cats:
    dat = big_parties_only.loc[big_parties_only['class']==cat,out_var]
    # Print the analysed variable's name instead of a hard-coded "LR".
    print('%s: mean %s = %.3f, sem = %.3f'%(cat, out_var, dat.mean(), dat.sem()))
print('')
# Pairwise Mann-Whitney U tests with the alternative taken from `comparisons`.
for ri,row in comparisons.iterrows():
    cat1 = row['catA']
    cat2 = row['catB']
    print(cat1, ' vs ', cat2)
    dat1 = big_parties_only.loc[big_parties_only['class']==cat1,out_var]
    dat2 = big_parties_only.loc[big_parties_only['class']==cat2,out_var]
    print('delta = %.3f'%(dat1.mean()-dat2.mean()))
    print(mwu(dat1,dat2, alternative = row['direction']))
    print('')

In [None]:
# Same strip plot as for the full data set, restricted to big_parties_only.
fig, ax = plt.subplots(1,1,figsize=[12,4])
cats = ['LEEF','SDOH','other']
sns.stripplot(data = big_parties_only.loc[big_parties_only['class'].isin(cats),:],
              y = 'class', x = out_var, alpha = .05, jitter = True, order = cats)
for ci,cat in enumerate(cats):
    mean_lr = big_parties_only.loc[big_parties_only['class']==cat, out_var].mean()
    ax.scatter([mean_lr],[ci],40,color = 'k', zorder = 10)
# alpha = 0 makes the bars themselves transparent; presumably only the error
# bars with caps are meant to remain visible.
sns.barplot(data = big_parties_only, y = 'class', x = out_var, order = cats,
             capsize = .1, alpha = 0, zorder = 10)
# Dashed reference line at the mean of the 'other' class, computed on the
# plotted variable (`out_var`) rather than a hard-coded 'LR'.
vline_x = big_parties_only.loc[big_parties_only['class']=='other',out_var].mean()
plt.plot([vline_x,vline_x],[-.2,2.2],'k--', lw = 1)
ax.set(ylabel = 'Category', yticklabels = ['Lifestyle','SDOH','Other'],
      title = 'Political orientation of MPs per contribution category',
      xlabel = '<-- %s            Political orientation          %s -->'%(x_poles[0],x_poles[1]),
      xlim = [-2.2,2.2]);
plt.tight_layout()
# fig.savefig(results_dir + '//Theme_LR_orientation_big_parties_only.pdf', bbox_inches = 'tight', transparent = True)

##### Exclude D66

In [None]:
# Sensitivity subset: every speaking turn except those of party 'd66'.
not_d66 = vws_data['party'].ne('d66')
exclude_d66 = vws_data[not_d66].copy()
print(exclude_d66.shape)

In [None]:
# Descriptives with D66 excluded: mean and standard error of `out_var` within
# each class.
for cat in cats:
    dat = exclude_d66.loc[exclude_d66['class']==cat,out_var]
    # Print the analysed variable's name instead of a hard-coded "LR".
    print('%s: mean %s = %.3f, sem = %.3f'%(cat, out_var, dat.mean(), dat.sem()))
print('')
# Pairwise Mann-Whitney U tests with the alternative taken from `comparisons`.
for ri,row in comparisons.iterrows():
    cat1 = row['catA']
    cat2 = row['catB']
    print(cat1, ' vs ', cat2)
    dat1 = exclude_d66.loc[exclude_d66['class']==cat1,out_var]
    dat2 = exclude_d66.loc[exclude_d66['class']==cat2,out_var]
    print('delta = %.3f'%(dat1.mean()-dat2.mean()))
    print(mwu(dat1,dat2, alternative = row['direction']))
    print('')

##### Exclude FVD

In [None]:
# Sensitivity subset: every speaking turn except those of party 'fvd'.
not_fvd = vws_data['party'].ne('fvd')
exclude_fvd = vws_data[not_fvd].copy()
print(exclude_fvd.shape)

In [None]:
# Descriptives with FVD excluded: mean and standard error of `out_var` within
# each class.
for cat in cats:
    dat = exclude_fvd.loc[exclude_fvd['class']==cat,out_var]
    # Print the analysed variable's name instead of a hard-coded "LR".
    print('%s: mean %s = %.3f, sem = %.3f'%(cat, out_var, dat.mean(), dat.sem()))
print('')
# Pairwise Mann-Whitney U tests with the alternative taken from `comparisons`.
for ri,row in comparisons.iterrows():
    cat1 = row['catA']
    cat2 = row['catB']
    print(cat1, ' vs ', cat2)
    dat1 = exclude_fvd.loc[exclude_fvd['class']==cat1,out_var]
    dat2 = exclude_fvd.loc[exclude_fvd['class']==cat2,out_var]
    print('delta = %.3f'%(dat1.mean()-dat2.mean()))
    print(mwu(dat1,dat2, alternative = row['direction']))
    print('')

##### Test by GAL-TAN orientation (column `TANGAL`)

In [None]:
# Repeat the class comparison on the TANGAL column. Unlike the LR analysis,
# every test here is two-sided; the 'direction' column of `comparisons` is not
# used.
out_var = 'TANGAL'
for cat in cats:
    dat = vws_data.loc[vws_data['class']==cat,out_var]
    # The label previously read "mean LR" although the values are TANGAL
    # means; print the name of the analysed variable instead.
    print('%s: mean %s = %.3f, sem = %.3f'%(cat, out_var, dat.mean(), dat.sem()))
print('')
for ri,row in comparisons.iterrows():
    cat1 = row['catA']
    cat2 = row['catB']
    print(cat1, ' vs ', cat2)
    dat1 = vws_data.loc[vws_data['class']==cat1,out_var]
    dat2 = vws_data.loc[vws_data['class']==cat2,out_var]
    print('delta = %.3f'%(dat1.mean()-dat2.mean()))
    print(mwu(dat1,dat2, alternative = 'two-sided'))
    print('')

## Plot average contributions of parties by themes

In [None]:
# One row per party: number of speaking turns, mean LR and TANGAL scores, and
# the mean of the LEEF and SDOH columns expressed as percentages.
# groupby(..., as_index=False) already returns 'party' as a regular column
# with a fresh integer index, so the former trailing reset_index() only added
# a meaningless 'index' column and has been dropped.
party_dat = (
    vws_data[['party','LR','TANGAL','text','LEEF','SDOH']]
    .groupby(['party'], as_index=False)
    .agg({'text':'size', 'LR':'mean', 'TANGAL':'mean', 'LEEF':'mean', 'SDOH':'mean'})
    .rename(columns={'text':'Nr. of speaking turns'})
)
party_dat[['LEEF','SDOH']] = party_dat[['LEEF','SDOH']]*100
party_dat['Nr. of speaking turns'] = party_dat['Nr. of speaking turns'].astype(int)
party_dat = party_dat.rename(columns = {'LR':'Left-Right ideology'})
party_dat.head(5)

In [None]:
# Keep only parties with more than 500 speaking turns for the scatter plot.
has_enough_turns = party_dat['Nr. of speaking turns'].gt(500)
plot_dat = party_dat[has_enough_turns]
plot_dat.shape

In [None]:
# Scatter plot of parties: x = % of contributions with LEEF, y = % with SDOH,
# marker size = number of speaking turns, colour = left-right score.
fig,ax = plt.subplots(nrows = 1, ncols = 1, figsize=[10,6])
sns.scatterplot(data = plot_dat, x = 'LEEF', y = 'SDOH', size = 'Nr. of speaking turns', edgecolor = 'grey',
                hue = 'Left-Right ideology', ax = ax, palette = 'RdBu', sizes = (100,300), hue_norm = (-2,2))

# Put each party name next to its marker with a small random offset. A seeded
# local generator keeps the label positions identical between runs, so the
# saved figure is reproducible.
rng = np.random.default_rng(0)
randfact = 0.1
for ri,row in plot_dat.iterrows():
    randdist = 0.01 + rng.random()*randfact
    textloc = [row['LEEF'] + randdist,row['SDOH'] + randdist]
    ax.text(*textloc,row['party'], fontdict = {'color':'k', 'size':12, 'weight':'normal'})

# Stand-alone mappable used only to draw a continuous colour bar for the hue.
norm = plt.Normalize(-2, 2)
sm = plt.cm.ScalarMappable(cmap="RdBu", norm=norm)
sm.set_array([])

ax.set(xlabel = 'Lifestyle (% of contributions)',
       ylabel = 'Social determinants of health (% of contributions)',
       ylim = [0, 4], xlim = [0, 6], aspect = 1,
       title = 'All parties with > 500 contributions in VWS committee',
       xticks = np.arange(0,6.1,.5), yticks = np.arange(0,4.1,.5))

# Keep only the legend entries from position 6 onwards.
# NOTE(review): presumably this skips the hue entries and leaves the size
# entries; the offset depends on how seaborn builds the legend — confirm.
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles[6:], labels[6:], loc = [.7, .17])
# Pass ax explicitly: the mappable is not attached to any Axes, and recent
# matplotlib versions refuse to guess which Axes to take space from.
ax.figure.colorbar(sm, ax = ax, orientation = 'horizontal', shrink = .2, anchor = [0.823, 0], pad = -.15,
                   drawedges = False, label = 'Left-Right ideology')

plt.tight_layout()
# Make sure the output folder exists and build the file path portably.
os.makedirs(results_dir, exist_ok = True)
fig.savefig(os.path.join(results_dir, 'Parties_themes.pdf'), bbox_inches = 'tight', transparent = True)