In [1]:
import word_tokenizer
import pandas as pd
from gensim import models

### Load climate-related FT meetings

In [2]:
df = pd.read_pickle("../climate_classifier/scored_data/ft_meetings_total_2012-2022_2.1.pkl")
df_c = df[df["y_pred"]=="climate"].reset_index(drop=True)
df_c.head(1)

Unnamed: 0,speaker,politician,party,text,date,y_pred
0,Transportministeren Henrik Dam Kristensen,Henrik Dam Kristensen,(S),"Jeg synes, det er lidt ærgerligt, at hr. Lars ...",2012-10-03,climate


In [3]:
## Add a comma + space in end of all texts
df_c["text"] = df_c["text"]+"<> "
df_c["politician"] = df_c["politician"].str.strip()
pol_c = df_c.groupby(['politician'])['text'].sum()
pol_c

politician
Aaja Chemnitz Larsen     Altså, den er jo inspireret lidt af tanken med...
Aki-Matilda Høegh-Dam    Tak for ordet. I rigsfællesskabet går vi en sp...
Aleqa Hammond            Arktis er jo så meget mere end bare forsvar, b...
Alex Ahrendtsen          Tak for det. Jeg må indrømme, at det er et sne...
Alex Vanopslagh          Tak til ordføreren for en passioneret tale om ...
                                               ...                        
Victoria Velasquez       Tak for spørgsmålet. Jeg synes egentlig ikke, ...
Villum Christensen       Tak for det. Liberal Alliance synes, at intent...
Zenia Stampe             Tak for det. Jeg kunne forstå, at SF's formand...
Özlem Sara Cekic         Tak, og tak til forespørgerne fra Dansk Folkep...
Øjvind Vilsholm          Tak til SF for at fremsætte et rigtig godt for...
Name: text, Length: 314, dtype: object

In [4]:
df_pol_c = pd.DataFrame(pol_c)
df_pol_c["politician"] = df_pol_c.index
df_pol_c = df_pol_c[["politician", "text"]].reset_index(drop=True)
print(len(df_pol_c.loc[4]["text"]))
df_pol_c

34753


Unnamed: 0,politician,text
0,Aaja Chemnitz Larsen,"Altså, den er jo inspireret lidt af tanken med..."
1,Aki-Matilda Høegh-Dam,Tak for ordet. I rigsfællesskabet går vi en sp...
2,Aleqa Hammond,"Arktis er jo så meget mere end bare forsvar, b..."
3,Alex Ahrendtsen,"Tak for det. Jeg må indrømme, at det er et sne..."
4,Alex Vanopslagh,Tak til ordføreren for en passioneret tale om ...
...,...,...
309,Victoria Velasquez,"Tak for spørgsmålet. Jeg synes egentlig ikke, ..."
310,Villum Christensen,"Tak for det. Liberal Alliance synes, at intent..."
311,Zenia Stampe,"Tak for det. Jeg kunne forstå, at SF's formand..."
312,Özlem Sara Cekic,"Tak, og tak til forespørgerne fra Dansk Folkep..."


### Vectorize texts

In [5]:
MODEL_FILE = '../../../Models/word2vec/dsl_skipgram_2020_m5_f500_epoch2_w5.model/dsl_skipgram_2020_m5_f500_epoch2_w5.model'
model = models.Word2Vec.load(MODEL_FILE)

        ## Takes 2 min to load on Jonathan PC 2

## Used for vectorizing the description text (filtered)
words = set(model.wv.index_to_key)

In [6]:
## Remove stopwords and create lists containing words
df_pol_c['text_clean'] = df_pol_c['text'].apply(lambda x: word_tokenizer.tokenize(x))
df_pol_c.head(3)

Unnamed: 0,politician,text,text_clean
0,Aaja Chemnitz Larsen,"Altså, den er jo inspireret lidt af tanken med...","[altså, inspirere, lidt, tank, klimafond, hand..."
1,Aki-Matilda Høegh-Dam,Tak for ordet. I rigsfællesskabet går vi en sp...,"[tak, ord, rigsfællesskab, går, spændende, tid..."
2,Aleqa Hammond,"Arktis er jo så meget mere end bare forsvar, b...","[arktis, se, mere, bare, forsvare, beredskab, ..."


In [7]:
import numpy as np
pol_vect = np.array([np.array([model.wv[i] for i in ls if i in words])
                         for ls in df_pol_c["text_clean"]])
                         ## Takes 32 sec on Jonathan PC 2

  pol_vect = np.array([np.array([model.wv[i] for i in ls if i in words])


In [8]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(pol_vect):
    print(len(df_pol_c["text_clean"].iloc[i]), len(v))

2822 2681
2110 2005
715 668
2500 2373
2995 2773
1427 1322
1182 1114
1601 1518
26 22
5804 5387
673 630
143 131
308 296
517 481
6623 6073
2585 2401
346 313
1282 1212
304 289
768 716
677 650
1411 1360
194 177
9446 8869
1060 999
1220 1136
57 49
103 97
1341 1256
2425 2229
2888 2714
119 110
660 597
598 567
218 211
461 425
3724 3440
2486 2333
6872 6322
1345 1277
2863 2707
338 311
345 327
1308 1279
538 493
2716 2544
389 366
18147 16884
2921 2721
1774 1686
481 449
1142 1073
366 325
747 706
12201 11382
61 55
162 151
16119 15293
1238 1156
485 457
573 546
6171 5892
77 72
103 93
2674 2519
1018 932
1220 1151
204 179
174 162
4636 4438
101 90
1381 1315
268 250
1907 1804
259 232
932 869
233 221
1144 1067
1280 1193
4983 4663
1390 1304
727 681
399 371
1087 1012
22121 20738
4844 4600
2025 1916
192 179
1069 1025
2255 2175
25 22
2551 2424
19 16
6465 6081
319 295
2359 2261
7660 7209
2161 2057
2880 2674
1172 1102
1869 1783
7228 6849
503 484
1445 1368
851 806
13161 12243
3199 2968
3871 3700
369 346
2793 2578
2

In [9]:
pol_vec_avg = []
for v in pol_vect:
    if v.size:
        pol_vec_avg.append(v.mean(axis=0))
    else:
        pol_vec_avg.append(np.zeros(100, dtype=float))

        ## Takes 0.4 sec Jonathan PC 2

In [10]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(pol_vec_avg):
    print(len(df_pol_c["text_clean"].iloc[i]), len(v))

2822 500
2110 500
715 500
2500 500
2995 500
1427 500
1182 500
1601 500
26 500
5804 500
673 500
143 500
308 500
517 500
6623 500
2585 500
346 500
1282 500
304 500
768 500
677 500
1411 500
194 500
9446 500
1060 500
1220 500
57 500
103 500
1341 500
2425 500
2888 500
119 500
660 500
598 500
218 500
461 500
3724 500
2486 500
6872 500
1345 500
2863 500
338 500
345 500
1308 500
538 500
2716 500
389 500
18147 500
2921 500
1774 500
481 500
1142 500
366 500
747 500
12201 500
61 500
162 500
16119 500
1238 500
485 500
573 500
6171 500
77 500
103 500
2674 500
1018 500
1220 500
204 500
174 500
4636 500
101 500
1381 500
268 500
1907 500
259 500
932 500
233 500
1144 500
1280 500
4983 500
1390 500
727 500
399 500
1087 500
22121 500
4844 500
2025 500
192 500
1069 500
2255 500
25 500
2551 500
19 500
6465 500
319 500
2359 500
7660 500
2161 500
2880 500
1172 500
1869 500
7228 500
503 500
1445 500
851 500
13161 500
3199 500
3871 500
369 500
2793 500
214 500
239 500
4571 500
245 500
2636 500
1373 500
464 500

In [11]:
df_pol_c["vec"] = pol_vec_avg
df_pol_c.head(1)

Unnamed: 0,politician,text,text_clean,vec
0,Aaja Chemnitz Larsen,"Altså, den er jo inspireret lidt af tanken med...","[altså, inspirere, lidt, tank, klimafond, hand...","[-0.099937215, 0.020752087, 0.070243455, -0.15..."


In [29]:
### Test politicians 
## Upper bound: Lars Christian Lilleholt, Mai Villadsen
## Not so much: Mette Gjerskov, Karen Hækkerup

up1 = df_pol_c[df_pol_c["politician"]=="Lars Christian Lilleholt"].reset_index(drop=True)
up2 = df_pol_c[df_pol_c["politician"]=="Mai Villadsen"].reset_index(drop=True)
bot1 = df_pol_c[df_pol_c["politician"]=="Mette Gjerskov"].reset_index(drop=True)
bot2 = df_pol_c[df_pol_c["politician"]=="Karen Hækkerup"].reset_index(drop=True)

print(model.wv.most_similar(positive=[up1["vec"][0]])[:5])
print(model.wv.most_similar(positive=[up2["vec"][0]])[:5])
print(model.wv.most_similar(positive=[bot1["vec"][0]])[:5])
print(model.wv.most_similar(positive=[bot2["vec"][0]])[:5])


[('fossiltfrit', 0.8480125069618225), ('retsundtagelse', 0.8469846248626709), ('libyenstrategi', 0.8469712734222412), ('2025aftale', 0.8466503024101257), ('parismålene', 0.8458715081214905)]
[('alenejeg', 0.8413612842559814), ('licensmuligheder', 0.8398793339729309), ('supertrist', 0.838851273059845), ('dobbeltstraffe', 0.83835768699646), ('allerallerhelst', 0.8383135199546814)]
[('upatruljeret', 0.8371339440345764), ('alenejeg', 0.835919201374054), ('dobbeltstraffe', 0.8354094624519348), ('allerallerhelst', 0.8345998525619507), ('0skabeloner', 0.8344907760620117)]
[('0skabeloner', 0.8473332524299622), ('upatruljeret', 0.8432225584983826), ('retssituation', 0.8423854112625122), ('barnebrudene', 0.8411011099815369), ('pensionsrettighed', 0.8409388661384583)]


In [36]:
print(model.wv.cosine_similarities(up1["vec"][0], [up2["vec"][0]])) ## Mai
print(model.wv.cosine_similarities(up1["vec"][0], [bot1["vec"][0]])) ## Mette G
print(model.wv.cosine_similarities(up1["vec"][0], [bot2["vec"][0]]) )## Karen Hæk

[0.97670025]
[0.968652]
[0.9421233]


In [39]:
df_votes_c = pd.read_pickle("./scikitv2/c_all_trans.pkl")
df_votes_c.head(1)

Unnamed: 0,party,politician,vote,description,date,vote_caller,y_pred,set,vote_id,Socialdemokratiet_weight_for,Socialdemokratiet_weight_against,Venstre_weight_for,Venstre_weight_against,vec,vec2,w_for,w_against,party2,vote_caller2
1,Dansk Folkeparti (DF),Alex Ahrendtsen,For,"Folketinget konstaterer, at den hidtil førte e...",2015-12-17,['Mikkel Dencker (DF)'],climate,train,"Afstemning nr. 183, 2015-16",0.570332,0.429668,0.079625,0.920375,[-1.04358420e-01 -1.38448412e-02 7.39112943e-...,"[-0.10435842, -0.0138448412, 0.0739112943, -0....",0.079625,0.920375,(DF),(DF)


In [45]:
vote1 = df_votes_c.loc[1]["vec2"]
print(model.wv.cosine_similarities(vote1, [up1["vec"][0]]))
print(model.wv.cosine_similarities(vote1, [up2["vec"][0]]))
print(model.wv.cosine_similarities(vote1, [bot1["vec"][0]]))
print(model.wv.cosine_similarities(vote1, [bot2["vec"][0]]))

[0.93099058]
[0.87828027]
[0.86871]
[0.84866758]


In [47]:
df_pol_c
df_pol_c.to_pickle("pol2vec.pkl")