In [1]:
import json
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize
import re
import fasttext
import pickle

In [2]:
# IPython setting: turn off Jedi-backed tab completion for this session.
%config Completer.use_jedi = False

# Config

In [3]:
 # This analysis was done only for EP8 and the new_edit task.
 # Both values are spliced into the fastText model file names loaded below;
 # `legislature` also selects the canonical data file.

legislature = '8'
task = 'new_edit'

In [4]:
# Input locations. All three are relative paths, so they are resolved against
# the current working directory of the kernel.

# Directory containing the embedding models trained on full data
model_dir = '../data/text-embeddings'

# Directory containing the canonical data
data_dir = '../data/canonical'

# Directory containing the dossier to title mapping
titles_dir = '../data/helpers'

In [5]:
# Loading saved fastText model for the edit and title to get the word and bigram embeddings
# The edit model and the title model share one file-name stem built from the
# configured directory, legislature and task; only the suffix differs.
model_prefix = model_dir + '/ep' + legislature + '-' + task + '-full'
model_edit = fasttext.load_model(model_prefix + '-edit.bin')
model_title = fasttext.load_model(model_prefix + '-title.bin')



In [6]:
# Loading the learned parameters corresponding to the edit and title text embedding
# Unlike the inputs configured above, these two files are looked up directly
# in the current working directory. Each array is later dotted with embedding
# vectors in get_topk_word / get_topk_bigram.
weights_edit = np.loadtxt('edit-parameters.txt')
weights_title = np.loadtxt('title-parameters.txt')

In [7]:
# Loading data

# The file holds one JSON document per line; each parsed line becomes one
# entry of `data` (iterated as a conflict, i.e. a list of edits, further down).
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding, and blank lines are skipped instead of crashing
# json.loads.
data_path = data_dir + '/war-of-words-2-ep' + legislature + '.txt'
with open(data_path, 'r', encoding='utf-8') as json_file:
    data = [json.loads(line) for line in json_file if line.strip()]


In [8]:
# Loading mapping from dossier references to titles

# Read the whole mapping in one go. UTF-8 is requested explicitly so the
# result does not depend on the platform's default locale encoding.
with open(titles_dir + '/dossier-titles.json', 'r', encoding='utf-8') as json_file:
    ref2title = json.load(json_file)

In [9]:
# Adding dossier title to the edits and keeping track of missing dossiers

# NOTE: this loop writes 'dossier_title' into the edit dicts of `data` in place.
missing_refs = set()

for conflict in data:
    for edit in conflict:
        dossier_ref = edit['dossier_ref']
        if dossier_ref not in ref2title:
            # add() updates the set in place; union() would copy it on every miss.
            missing_refs.add(dossier_ref)
        # Edits whose dossier has no known title get an empty title.
        edit['dossier_title'] = ref2title.get(dossier_ref, "")

if missing_refs:
    print('Warning !', len(missing_refs),'references do not have an associated title!')

In [10]:
def _filter_dossiers(dataset, thr):
    # Count occurence of each dossiers.
    dossiers = list()
    for data in dataset:
        for datum in data:
            dossiers.append(datum['dossier_ref'])
    counter = Counter(dossiers)
    # Define list of dossiers to keep.
    keep = set([d for d, c in counter.items() if c > thr])
    k, d = len(keep), len(set(dossiers))
    print(f'Removed {d-k} ({(d-k)/d*100:.2f}%) dossiers.')
    return keep
def _filter_meps(dataset, thr):
    # Count occurence of each dossiers.
    meps = list()
    for data in dataset:
        for datum in data:
            for at in datum['authors']:
                meps.append(at['id'])
    counter = Counter(meps)
    # Define list of dossiers to keep.
    keep = set([d for d, c in counter.items() if c > thr])
    k, m = len(keep), len(set(meps))
    print(f'Removed {m-k} ({(m-k)/m*100:.2f}%) MEPs.')
    return keep
def filter_dataset(dataset, thr=10):
    """Drop conflicts that involve a rare dossier or a rare MEP.

    A dossier / MEP is "rare" when it occurs in `thr` or fewer edits, i.e. only
    those with strictly more than `thr` occurrences are kept. A conflict
    survives only if every one of its edits belongs to a kept dossier and all
    authors of every edit are kept MEPs. Summary lines are printed.

    Args:
        dataset: sized collection of conflicts (each an iterable of edit dicts
            with 'dossier_ref' and 'authors' keys).
        thr: occurrence threshold shared by the dossier and MEP filters.

    Returns:
        New list with the surviving conflicts, in their original order.
    """
    total = len(dataset)
    if total == 0:
        # Nothing to count or filter; avoid computing percentages of zero.
        print('Removed 0 (0.00%) conflicts.')
        print('Number of data points: 0')
        return list()

    keep_doss = _filter_dossiers(dataset, thr)
    keep_mep = _filter_meps(dataset, thr)

    filtered_dataset = [
        data for data in dataset
        if all(
            datum['dossier_ref'] in keep_doss
            and all(at['id'] in keep_mep for at in datum['authors'])
            for datum in data
        )
    ]

    kept = len(filtered_dataset)
    print(f'Removed {total-kept} ({(total-kept)/total*100:.2f}%) conflicts.')
    print('Number of data points:', kept)
    return filtered_dataset

def unroll(dataset):
    """Flatten a sequence of conflicts into a single list of edits, in order."""
    return [edit for conflict in dataset for edit in conflict]

In [11]:
# Apply the frequency filter with its default threshold (thr=10).
data_filtered = filter_dataset(data)

Removed 51 (6.38%) dossiers.
Removed 14 (1.77%) MEPs.
Removed 271 (0.19%) conflicts.
Number of data points: 140763


In [12]:
# Flatten the filtered conflicts into one list of edits.
data_filtered_unrolled = unroll(data_filtered)

In [13]:
def extract_bigrams(unrolled_data):
    """Collect the unique bigrams occurring in the edits and their dossier titles.

    For each edit, four spans are cut out using `edit_indices`:
    the deleted span `text_original[i1:i2]` (tag '<del>'), the inserted span
    `text_amended[j1:j2]` (tag '<ins>'), and the left / right context
    `text_original[:i1]` and `text_original[i2:]` (both tagged '<con>').
    Every token is prefixed with its tag and adjacent tagged tokens are joined
    with a space. The two context sides are processed separately, so no bigram
    spans across the edit. Title bigrams are untagged: the title is tokenized
    with `word_tokenize`, lower-cased, and every digit is replaced by 'D'.

    Assumes `text_original` / `text_amended` are token sequences (the code
    iterates over slices of them) -- TODO confirm against the data format.

    Progress (the edit index) is printed every 10000 edits.

    Args:
        unrolled_data: iterable of edit dicts with keys 'edit_indices'
            ('i1', 'i2', 'j1', 'j2'), 'text_original', 'text_amended' and
            'dossier_title'.

    Returns:
        List of unique bigram strings; order is arbitrary (built from a set).
    """

    def _adjacent_pairs(tokens, tag=''):
        # Bigrams of consecutive tokens, each token prefixed with `tag`.
        # Sequences shorter than two tokens yield nothing.
        tagged = [tag + w for w in tokens]
        return (' '.join(pair) for pair in zip(tagged, tagged[1:]))

    bigrams = set()
    for j, datum in enumerate(unrolled_data):
        indices = datum['edit_indices']
        i1, i2 = indices['i1'], indices['i2']
        j1, j2 = indices['j1'], indices['j2']

        text_del = datum['text_original'][i1:i2]
        text_ins = datum['text_amended'][j1:j2]
        text_context_l = datum['text_original'][:i1]
        text_context_r = datum['text_original'][i2:]

        if j % 10000 == 0:
            print(j)

        bigrams.update(_adjacent_pairs(text_del, '<del>'))
        bigrams.update(_adjacent_pairs(text_ins, '<ins>'))
        bigrams.update(_adjacent_pairs(text_context_l, '<con>'))
        bigrams.update(_adjacent_pairs(text_context_r, '<con>'))

        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        title_word_list = [
            re.sub(r'\d', 'D', w.lower())
            for w in word_tokenize(datum['dossier_title'])
        ]
        bigrams.update(_adjacent_pairs(title_word_list))

    return list(bigrams)

In [14]:
def get_denominator(bigram,model_obj):
    """Count the terms assumed to be averaged into the sentence vector of `bigram`.

    Following the averaging scheme this notebook relies on for a sentence
    'w1 w2 EOS', the count is made of:
      * the words of `bigram` that are in the model vocabulary
        (`get_word_id` returns a non-negative id),
      * one EOS token,
      * one word-bigram per word, e.g. (w1, w2) and (w2, EOS) for two words.
    """
    tokens = bigram.split()
    known_words = sum(1 for tok in tokens if model_obj.get_word_id(tok) > -1)
    eos_terms = 1
    bigram_terms = len(tokens)
    return known_words + eos_terms + bigram_terms
    

def get_bigram_vector(bigram,model_obj):
    """Recover the embedding of the word-bigram (w1, w2) from sentence vectors.

    The model object is not asked for the bigram embedding directly. Instead:
      1. the sentence vector of 'w1 w2' is multiplied by `get_denominator`
         to turn the average back into a sum;
      2. the un-averaged sentence vector of 'w2' alone is subtracted -- its
         average is taken over 3 terms (w2, EOS, w2+EOS) when w2 is in the
         vocabulary and over 2 terms otherwise;
      3. the word vector of w1 is subtracted when w1 is in the vocabulary.
    What remains is returned as the bigram vector.

    `bigram` must contain at least two whitespace-separated tokens; only the
    first two are used for steps 2 and 3.
    """
    tokens = bigram.split()

    # Step 1: reverse the averaging of the full sentence vector.
    total = model_obj.get_sentence_vector(bigram) * get_denominator(bigram, model_obj)

    # Step 2: un-averaged contribution of everything that involves only w2 / EOS.
    second_in_vocab = model_obj.get_word_id(tokens[1]) > -1
    tail_terms = 3 if second_in_vocab else 2
    tail_total = model_obj.get_sentence_vector(tokens[1]) * tail_terms

    remainder = total - tail_total

    # Step 3: w1 only contributes a word vector when it is in the vocabulary.
    if model_obj.get_word_id(tokens[0]) > -1:
        remainder = remainder - model_obj.get_word_vector(tokens[0])
    return remainder
    
def get_topk_bigram(bigrams,model_obj,weights,ftype='added',outcome='accept',k=50):
    """Return the `k` bigrams of one feature type most predictive of `outcome`.

    Bigrams are selected by their first five characters ('<del>', '<ins>',
    '<con>'; anything else counts as a title bigram), embedded with
    `get_bigram_vector`, and scored by the dot product with `weights`.
    For edit features, 'accept' lists the highest scores first and 'reject'
    the lowest. For title features the order is flipped, mirroring
    get_topk_word, where the title term is described as being in the
    denominator of the edit acceptance probability (eq. (7) of the paper).

    Args:
        bigrams: sequence of bigram strings as produced by `extract_bigrams`.
        model_obj: fastText model used to embed the bigrams.
        weights: 1-D parameter vector matching the embedding dimension.
        ftype: 'removed', 'added', 'context' or 'title'.
        outcome: 'accept' or 'reject'.
        k: number of bigrams to return.

    Returns:
        For edit feature types, a list of bigrams with the tags stripped; for
        'title', a NumPy array of bigram strings. An empty list is returned
        (after printing a message) for an invalid `ftype` / `outcome`, and
        (silently) when no bigram matches `ftype`.
    """
    tag_by_ftype = {'removed': '<del>', 'added': '<ins>', 'context': '<con>'}

    # Filter on the plain Python list; converting every bigram to a NumPy
    # string array first is unnecessary and memory hungry.
    if ftype in tag_by_ftype:
        tag = tag_by_ftype[ftype]
        selected = [bw for bw in bigrams if bw[:5] == tag]
    elif ftype == 'title':
        edit_tags = tuple(tag_by_ftype.values())
        selected = [bw for bw in bigrams if bw[:5] not in edit_tags]
    else:
        print('Invalid feature type')
        return []

    # Validate before the expensive embedding step rather than after it.
    if outcome not in ('accept', 'reject'):
        print('Invalid outcome')
        return []

    # Nothing to rank: avoid indexing / dot products on empty arrays.
    if not selected:
        return []

    bigram_vectors = np.array([get_bigram_vector(bw, model_obj) for bw in selected])
    dotprods = bigram_vectors.dot(weights)

    # argsort is ascending; decide once whether the ranking must be reversed.
    order = np.argsort(dotprods)
    descending = outcome == 'accept'
    if ftype == 'title':
        descending = not descending
    if descending:
        order = order[::-1]

    top = [selected[i] for i in order[:k]]

    if ftype == 'title':
        return np.array(top)
    return remove_tags(top)


def get_topk_word(model_obj,weights,ftype='added',outcome='accept',k=50):
    """Return the `k` vocabulary words of one feature type most predictive of `outcome`.

    Vocabulary entries are selected by their first five characters ('<del>',
    '<ins>', '<con>'); for 'title' the whole vocabulary is used. Row i of the
    model's input matrix is taken as the embedding of vocabulary entry i, and
    each selected word is scored by the dot product of its row with `weights`.
    For edit features, 'accept' lists the highest scores first and 'reject'
    the lowest. The ordering is reversed for the title as it is in the
    denominator of the edit acceptance probability (see eq. (7) in the paper).

    Args:
        model_obj: fastText model providing `get_words()` and
            `get_input_matrix()`.
        weights: 1-D parameter vector matching the embedding dimension.
        ftype: 'removed', 'added', 'context' or 'title'.
        outcome: 'accept' or 'reject'.
        k: number of words to return.

    Returns:
        For edit feature types, a list of words with the tag stripped; for
        'title', a NumPy array of words. An empty list is returned (after
        printing a message) for an invalid `ftype` / `outcome`, and (silently)
        when no vocabulary entry matches `ftype`.
    """
    tag_by_ftype = {'removed': '<del>', 'added': '<ins>', 'context': '<con>'}

    # Get vocabulary
    vocab = model_obj.get_words()

    if ftype in tag_by_ftype:
        tag = tag_by_ftype[ftype]
        word_indices = [i for i, w in enumerate(vocab) if w[:5] == tag]
    elif ftype == 'title':
        word_indices = list(range(len(vocab)))
    else:
        print('Invalid ftype')
        return []

    if outcome not in ('accept', 'reject'):
        print('Invalid outcome')
        return []

    # An empty selection would become a float index array and fail below.
    if not word_indices:
        return []

    word_indices = np.array(word_indices)

    # Rows of the input matrix (the word embeddings) for the selected words.
    word_vectors = model_obj.get_input_matrix()[word_indices, :]
    dotprods = word_vectors.dot(weights)

    # argsort is ascending; decide once whether the ranking must be reversed.
    order = np.argsort(dotprods)
    descending = outcome == 'accept'
    if ftype == 'title':
        descending = not descending
    if descending:
        order = order[::-1]

    word_list = np.array(vocab)[word_indices[order]][:k]

    if ftype == 'title':
        return word_list
    return remove_tags(word_list)
    

In [15]:
def remove_tags(l):
    """Strip the 5-character tag prefix from every token of every entry.

    Each element of `l` (a word or a space-separated bigram) is split on
    whitespace, the first five characters of each token are dropped, and the
    remainders are joined back with single spaces. Returns a new list.
    """
    return [' '.join(token[5:] for token in entry.split()) for entry in l]

In [16]:
# Collect the unique bigrams; extract_bigrams prints the edit index every 10000 edits.
bigrams = extract_bigrams(data_filtered_unrolled)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000


# Words and Bigrams Predictive of Acceptance

## Words

In [17]:
# Words Added

' | '.join(get_topk_word(model_edit,weights_edit,'added','accept',50))

'berec | fishing | should | office | registered | 2018 | inserted | equipment | actions | transparency | important | advisory | fisheries | x | bargaining | best | processes | fuel | financial | regulators | ” | communication | pension | agricultural | supervisory | positive | gender | creative | reduce | plan | impact | withdrawal | external | eets | second | investigation | procurement | ppe | improves | blue | skills | common | hubs | toll | therefore | indicators | contributions | 20 | circular | lisa'

In [18]:
# Words Removed

' | '.join(get_topk_word(model_edit,weights_edit,'removed','accept',50))

'safety | berec | consumers | eurojust | area | breeding | council | surveillance | powers | 2 | human | authorised | animals | bodies | hosting | ; | conditions | articles | annexes | if | added | medium | provision | origin | fisheries | representative |  | benefitting | manufacturer | conformity | fitting | derogation | 29 | plant | virtual | action | recommendation | sex | breed | chapter | amending | current | processing | specific | point | during | implementation | 2025 | covered | financing'

In [19]:
# Context Words

' | '.join(get_topk_word(model_edit,weights_edit,'context','accept',50))

'” | appliance | prima | rco | harmonised | appliances | threats | breeding | controls | voice | alternative | eurojust | safety | egf | published | processed | outside | fitting | instructions | 63 | accounts | associated | institutions | destination | observations | cash | recipients | engines | creditors | berec | manufacturer | expenditure | customs | guidelines | appeal | alcohol | cableway | name | secretariat | multi | iccat | instrument | number | positive | audit | document | uniform | operating | notified | stock'

In [20]:
# Title Words

' | '.join(get_topk_word(model_title,weights_title,'title','accept',50))

"customs | community | mediterranean | DDDD-DDDD | supervision | service | control | installations | 'customs | parliament | cableway | equipment | pollutants | anti-fraud | annex | competition | multiannual | council | statistics | temporary | recovery | documents | field | area | fuels | gaseous | DDDD/DDD | appliances | policy | drinks | plan | ukraine | genealogical | animals | germinal | financial | burning | fisheries | spirit | context | laws | investigations | it | management | other | ensure | medicines | DD/DDDD | agency | insolvency"

## Bigrams

In [21]:
# Bigrams Added

' | '.join(get_topk_bigram(bigrams,model_edit,weights_edit,'added','accept',50))

'their sector | opposition , | the berec | of meeting | , humification | this regulation | berec office | this expert | avoid social | way behind | a . | in easy | transmission of | were neither | eu´s greenhouse | violence is | applicable the | one of | within the | . 2 | economic operators | relative deviation | where applicable | risk premia | positive impact | further amended | accept , | is inserted | not properly | multinationals at | institution , | by sub | the third | regions , | or federal | family associations | intelligent mobility | - carrier | carrier economic | signal processing | acoustic signals | residency or | complaint was | , raising | , 51 | people and | assessment of | board of | care on | the commission'

In [22]:
# Bigrams Removed

' | '.join(get_topk_bigram(bigrams,model_edit,weights_edit,'removed','accept',50))

'2 . | . 2 | . . | international efforts | . where | , member | the case | the member | hosting service | the following | 5 . | human rights | ; the | market surveillance | data protection | in such | 4 . | to be | requirements of | the hosting | which are | : the | subject to | whether the | to that | and related | service provider | of that | provided for | evaluations ; | plan ; | covered by | . 3 | eurojust shall | relevant for | conditions , | sharing and | the council | 6 . | notified as | the implementing | the development | directive . | take a | application for | the efsd | article 11 | the data | in hormonal | of consumers'

In [23]:
# Context Bigrams

' | '.join(get_topk_bigram(bigrams,model_edit,weights_edit,'context','accept',50))

". ' | : ' | . those | . 3 | 2 , | notified body | . 2 | . ” | is in | authority to | under other | management board | requirements of | year . | their citizenship | annual work | renewable energy | public sector | ' the | supervisory authorities | promoter . | within a | be deferred | investment firms | economic operators | the egf | 3 . | of new | voice communications | shall : | and shall | authorities should | / 22 | in other | 4 . | quantified , | , storage | this paragraph | plan . | financing types | regulation . | consumers ' | commission in | the market | of participants | resident or | monitor the | ' interests | article 38 | states to"

In [24]:
# Title Bigrams

' | '.join(get_topk_bigram(bigrams,model_title,weights_title,'title','accept',50))

"council on | to regulation | cableway installations | supervision of | , ( | ' programme | 'customs ' | of customs | recovery plan | european parliament | multiannual recovery | and of | general budget | the mediterranean | annex a | the 'customs | the field | and establishing | parliament and | on insolvency | replacing annex | insolvency proceedings | a to | and administrative | regulation of | rules applicable | budget of | customs control | control equipment | burning gaseous | field of | the reform | gaseous fuels | DDDD/DDD on | financial rules | for cooperation | the use | zootechnical and | plan for | and supervision | for trade | the council | appliances burning | procedures for | in and | no DD/DDDD | DDDD/DDDD , | imports into | union of | to ensure"

# Words and Bigrams Predictive of Rejection

## Words

In [25]:
# Words Added

' | '.join(get_topk_word(model_edit,weights_edit,'added','reject',50))

'these | cabotage | deleted | ; | eu | except | societal | &#160 | territorial | illegal | – | payment | mercury | must | asylum | hydrogen | e | benchmark | commercial | include | according | service | benefits | determined | operational | solidarity | negative | binding | circumstances | professionals | firearms | consent | case | participants | interest | ) | ten | days | settlement | after | basic | children | s | if | defined | additionality | agreements | amended | deputy | roma'

In [26]:
# Words Removed

' | '.join(get_topk_word(model_edit,weights_edit,'removed','reject',50))

'energy | should | migration | additional | public | corps | workers | competitiveness | irregular | different | product | % | joint | systems | forest | worker | remuneration | international | research | eib | efsi | before | growth | economic | passenger | electronic | cultural | solidarity | therefore | matter | months | online | impact | works | concerted | waste | through | can | reporting | value | building | eurodac | europe | allowances | identity | more | account | return | objectives | facial'

In [27]:
# Context Words

' | '.join(get_topk_word(model_edit,weights_edit,'context','reject',50))

'_________________ | allocation | resettlement | posting | benchmark | allowances | rightholders | allocations | firearms | reserve | foreign | hosting | free | driver | pnr | ) | educational | core | verification | labels | works | ancillary | forest | collective | advanced | broadcast | terrorist | investments | fine | excellence | 25 | parental | condition | preservation | mercury | 2030 | million | renovation | remote | employees | fingerprints | settings | redress | exception | travel | target | solidarity | sectors | 00 | aims'

In [28]:
# Title Words

' | '.join(get_topk_word(model_title,weights_title,'title','reject',50))

'and | directive | market | DDDD | framework | services | DDDD/DDD/ec | </s> | agricultural | requirements | energy | protection | a | for | as | gas | greenhouse | contracts | decision | online | name | operation | digital | regulation | strategic | view | development | of | at | structural | emission | instruments | establishment | trading | the | record | from | in | regards | supplementary | specific | plans | private | posting | screening | DDD/DDDD | pnr | certificate | under | investments'

## Bigrams

In [29]:
# Bigrams Added

' | '.join(get_topk_bigram(bigrams,model_edit,weights_edit,'added','reject',50))

"“10a . | communication , | welfare regulations | normalisation process | become apparent | that activity | is deleted | certificates were | general production | &#160 ; | may propose | and in | separation of | ' s | with the | and logistics | hatred . | . in | or morality | as jointly | different generators | fuels for | engine replacement | procedures overcoming | according to | as authors | is amended | directly awarding | annex , | european union | lifting a | parties to | - contributions | ii may | , except | made explicitly | place of | value cases | in any | employer shall | judges each | or new | operations of | are non | between solid | , point | , for | 32a is | new genetic | the annex"

In [30]:
# Bigrams Removed

' | '.join(get_topk_bigram(bigrams,model_edit,weights_edit,'removed','reject',50))

"no reason | and now | as the | terrorism - | contribute to | , possessed | under the | digital content | other subject | guidance , | electronic monitoring | legal body | . this | the digital | , shall | which the | the cir | remote electronic | 0 . | than ten | . member | agricultural guarantee | solidarity corps | in case | the passenger | authorised periods | intention or | and other | the supplier | information on | of directive | 1 . | policy objectives | . in | least likely | the product | discussion . | and of | investigations or | - sharing | ' association | state to | - matter | - user | carbon impact | the forest | 2 of | same shall | purpose of | ] and"

In [31]:
# Context Bigrams

' | '.join(get_topk_bigram(bigrams,model_edit,weights_edit,'context','reject',50))

"contents of | a sub | therefore , | hosting service | . _________________ | of directive | the funds | pnr data | article 4 | investment board | parental leave | regional operational | report within | state which | commission may | paragraph 3 | scientific evaluation | works or | main third | produced from | the driver | procedure , | 000 for | their common | they shall | have given | states may | or other | ' shall | free allocation | deemed to | 27 . | states introduce | commission should | which establishes | - and | their rights | for free | data for | programme’s research | finance may | - use | down rules | consumers , | countries in | . member | be deemed | article 2 | / 123 | 123 /"

In [32]:
# Title Bigrams

' | '.join(get_topk_bigram(bigrams,model_title,weights_title,'title','reject',50))

'and regulation | ) and | directive DDDD/DDD/ec | greenhouse gas | rules for | european union | corps programme | services in | , regulation | the eu | DDDD/DDD and | eu pnr | by member | passenger name | record data | data ( | structural reform | the structural | of passenger | name record | strategic plans | to georgia | european agricultural | the framework | pnr ) | DDDD/DDDD with | DDD/DDDD as | and weekly | daily and | DDDD/DDD/ec and | efficiency labelling | screening of | developments in | concerning the | , and | directive DDDD/DD/eu | direct investments | internal market | , laying | rules on | council amending | of a | gas emission | as regards | of energy | to adapting | for medicinal | supplementary protection | protection certificate | of water'