In [None]:
import numpy as np
import pandas as pd
import re

import requests
from bs4 import BeautifulSoup


import string
import unidecode


import transformers
import spacy


import torch

In [None]:
# Publication date (YYYY-MM-DD) that is appended to both URLs below.
DATE='2022-06-09'

# Summary page listing the publications of one date, and the per-article
# detail page (the date and a `&numac=` parameter are appended to the latter).
MAINLINK='https://www.ejustice.just.fgov.be/cgi/summary_body.pl?language=nl&pub_date='
DETAILLINK='https://www.ejustice.just.fgov.be/cgi/article_body.pl?language=nl&caller=summary&pub_date='

# Download the summary page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops an HTTP error page from
# being parsed as if it were a real summary.
res = requests.get(MAINLINK + DATE, timeout=30)
res.raise_for_status()


In [None]:

def get_numac_numbers(res:str):
    """Extract the numac identifiers from the HTML of a summary page.

    Args:
        res: HTML source to parse.

    Returns:
        list of str: the whitespace-stripped ``value`` attribute of every
        ``<input name="numac">`` element, in document order. The list is
        empty when ``res`` is empty or contains no such element.
    """
    # Nothing to parse for empty input.
    if not res:
        return []

    soup = BeautifulSoup(res, 'html.parser')

    # Parsing errors are allowed to propagate: the previous try/except only
    # printed the error and then failed with a NameError on the unbound result.
    # Inputs lacking a ``value`` attribute are skipped instead of raising KeyError.
    return [
        field['value'].strip()
        for field in soup.find_all('input', {'name': 'numac'})
        if field.has_attr('value')
    ]



In [None]:
def create_numac_links(_numacs:list):
    """Build one detail-page URL per numac.

    Every URL is ``DETAILLINK`` followed by ``DATE`` and a
    ``&numac=<numac>`` query parameter. The returned list keeps the order
    of ``_numacs``.
    """
    return [f"{DETAILLINK}{DATE}&numac={numac}" for numac in _numacs]

In [None]:
# Pull the numac identifiers out of the downloaded summary page.
numacs=get_numac_numbers(res.text)
# Build the matching detail-page URL for every numac.
numac_links=create_numac_links(numacs)
# Show the generated links as the cell output.
numac_links

In [None]:
#Cleaning the scraped text
def clean(_a:str):
    """Normalise a scraped text before it is summarised.

    Steps, in order:
      1. Insert a space after ``.``, ``,``, ``;`` or ``:`` when it is
         directly followed by a non-whitespace character.
      2. Transliterate the text with ``unidecode``.
      3. Replace newlines (real and escaped), tabs and backslashes with
         spaces, and undo the ``". com"`` split that step 1 introduces.
      4. Collapse every whitespace run to a single space.
      5. Pad ``?`` with spaces and add a space after ``)`` so that
         neighbouring words are not glued into one token.
      6. Remove ``@mentions``.
      7. Replace every run of characters outside the allowed set with a
         single space.

    Args:
        _a: raw text. An empty or ``None`` value yields ``''``.

    Returns:
        str: the cleaned text.
    """
    if not _a:
        return ''

    spaced = re.sub(r'(?<=[.,;,:])(?=[^\s])', r' ', _a)

    ascii_text = unidecode.unidecode(spaced)
    ascii_text = (
        ascii_text.replace('\\n', ' ')
        .replace('\n', ' ')
        .replace('\t', ' ')
        .replace('\\', ' ')
        .replace('. com', '.com')
    )

    collapsed = re.sub(r'\s+', ' ', ascii_text)
    # There are some instances where there is no space after '?' & ')';
    # pad them so that two words are not treated as one token.
    padded = collapsed.replace('?', ' ? ').replace(')', ') ')

    # Remove mentions BEFORE the character filter below: the filter turns
    # every '@' into a space, so running this afterwards could never match.
    # The lookbehind leaves an '@' inside a word (e.g. an e-mail address) to
    # the filter, which handles it exactly as before.
    without_mentions = re.sub(r'(?<!\w)@\w+', '', padded)

    # Inside the character class, `$-,` is a range ($ % & ' ( ) * + ,) and not
    # a literal hyphen, so '-' itself is replaced by a space.
    return re.sub(r"[^a-zA-Z0-9:$-,%.?!]+", ' ', without_mentions)

In [None]:
#Scrape the article

def scrape_numac(_numac_links:list):
    """Download every detail page and return the cleaned article texts.

    For each URL the page is fetched, ``<sup>`` tags are unwrapped, newlines
    are removed and the article body is cut out of the page text using the
    ``'Numac :'`` and ``'begin eerste woord laatste'`` markers. The result is
    passed through ``clean``. A running counter is printed after each page.

    Args:
        _numac_links: detail-page URLs.

    Returns:
        list of str: one cleaned article per URL, in input order.

    Raises:
        requests.HTTPError: when a page answers with an HTTP error status.
        IndexError: when a page does not contain the expected markers.
    """
    nl_list=[]
    for _count, link in enumerate(_numac_links, start=1):  # _count: which page we are at
        # Timeout so one stalled request cannot block the whole run; the
        # status check keeps error pages from being parsed as articles.
        res = requests.get(link, timeout=30)
        res.raise_for_status()

        soup = BeautifulSoup(res.text, 'html.parser')
        for sup in soup.find_all('sup'):
            sup.unwrap()

        text = soup.text.replace('\n', "")

        # Split once and reuse the pieces. The text after the second marker
        # is used as the separator inside the text between the first and
        # second marker; the article is what follows that separator.
        # NOTE(review): this relies on the current page layout - confirm.
        parts = text.split('Numac :')
        lst = parts[1].split(parts[2])

        article = lst[1].split('begin eerste woord laatste')[0].strip()
        nl_list.append(clean(article))
        print(_count)
    return nl_list
    

In [None]:
# Download and clean every article; issues one HTTP request per link.
nl_list=scrape_numac(numac_links)

In [None]:
#SUMMARY
# Load the configuration of the fine-tuned summarisation checkpoint and ask
# for hidden states to be included in the model outputs.
from transformers import MBartConfig
config = MBartConfig.from_pretrained("ml6team/mbart-large-cc25-cnn-dailymail-nl-finetune", output_hidden_states=True)

# Load the model weights from the same checkpoint using that configuration.
undisputed_best_model = transformers.MBartForConditionalGeneration.from_pretrained(
    "ml6team/mbart-large-cc25-cnn-dailymail-nl-finetune",config=config
)




# The tokenizer is loaded from the "facebook/mbart-large-cc25" checkpoint,
# not from the fine-tuned one.
tokenizer = transformers.MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
# Wrap model and tokenizer in a summarization pipeline.
summarization_pipeline = transformers.pipeline(
    task="summarization",
    model=undisputed_best_model,
    tokenizer=tokenizer,
)
# Use the tokenizer's id for the "nl_XX" language code as the decoder start token.
summarization_pipeline.model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
    "nl_XX"
]

In [None]:

# Create one summary per article with the pretrained summarization pipeline.

def summarise(_nl_list:list):
    """Summarise every text in ``_nl_list``.

    Each text is passed on its own to ``summarization_pipeline`` with
    sampling enabled, and the ``"summary_text"`` of the first result is
    kept. A running counter (1, 2, ...) is printed after each text.

    Returns:
        list of str: the summaries, in input order.
    """
    generation_options = dict(
        do_sample=True,
        top_p=0.75,
        top_k=50,
        num_beams=4,
        min_length=50,
        early_stopping=True,
        truncation=True,
    )

    summary=[]
    for counter, text in enumerate(_nl_list, start=1):
        result = summarization_pipeline(text, **generation_options)
        summary.append(result[0]["summary_text"])
        print(counter)
    return summary

In [None]:
# Summarise every scraped article; runs the model once per article.
summary=summarise(nl_list)

### Tagging

In [None]:
summary

In [None]:
# Minimum similarity score a summary token needs to be linked to a tag.
SIMILARITY=0.80

# Space-separated reference tags. The lines are joined with an explicit space:
# plain adjacent string literals would glue the last tag of one line to the
# first tag of the next (e.g. 'belastingverhogingbelastingvermindering').
tags=' '.join([
    'aanslagjaar arbeidsongeschiktheidsuitkeringen bedrijfsinkomsten bedrijfskosten bedrijfstoeslag bedrijfsvoorheffing belasting belastingverdragen belastingverhoging',
    'belastingvermindering belastingvoet belastingvoordeel belastingvrije beroepsinkomsten beroepskosten bezoldiging btw derdebetalersregeling dienstverplichtingen erfbelasting',
    'financieringskosten heffing inkomsten inkomstenderving investeringsaftrek kapitaalaflossingen kapitaalvermindering kostenvermindering omzetbelasting personenbelasting',
    'prestatievergoeding rechtspersonenbelasting registratierechten schenkbelasting socialezekerheidsbijdragen solidariteitsbijdrage uitbetalingsinstelling vennootschapsbelasting',
    'verminderingen vervangingsinkomsten voorafbetalingen voorbelasting voorheffing vrijstellingsregeling waardevermindering werkgeversbijdrage werkingskosten zekerheidsbijdragen',
    'invaliditeit verzekering',
])

In [None]:
# Split the space-separated tag string into individual tags.
_tags=tags.split(' ')

# Run every tag through the model and keep the output logits.
# Gradients are not needed here; no_grad() prevents each stored tensor from
# keeping a full autograd graph alive.
real_tag_tensors=[]
with torch.no_grad():
    for a in _tags:
        input_ids = torch.tensor(tokenizer.encode(a)).unsqueeze(0)  # Batch size 1
        outputs = undisputed_best_model(input_ids)
        real_tag_tensors.append(outputs.logits)
    
    

In [102]:


# Encode a single word and run it through the model.
input_ids = torch.tensor(tokenizer.encode('belangrijk')).unsqueeze(0)  # Batch size 1
outputs = undisputed_best_model(input_ids)
# NOTE(review): this takes index 2 of the output, not its first element.
# Presumably that is the tuple of per-layer hidden states (the config requests
# output_hidden_states=True) rather than one "last hidden state" - confirm.
last_hidden_states = outputs[2]

# Same steps for a second word, to compare against the first.
input_ids2 = torch.tensor(tokenizer.encode('dagelijk')).unsqueeze(0)  # Batch size 1
outputs2 = undisputed_best_model(input_ids2)
# NOTE(review): index 2 again - see the note on `last_hidden_states`.
last_hidden_states2 = outputs2[2]



In [105]:
outputs


Seq2SeqLMOutput(loss=None, logits=tensor([[[ 42.1872,  20.6635,  54.8743,  ...,  32.3599,  33.2639,  24.5037],
         [-14.4125,   9.7157,  21.9094,  ...,  -3.2535,  -1.1117,   6.3129],
         [  4.9049,  11.5012,  15.6374,  ...,  19.9058,  18.1002,   7.0776]]],
       grad_fn=<AddBackward0>), past_key_values=((tensor([[[[-1.1173, -0.1579, -0.5422,  ..., -0.7314, -0.7746,  0.3829],
          [ 0.6704, -0.1339, -0.2589,  ...,  0.2384, -0.0977, -0.5655],
          [ 0.6349, -0.2383, -0.5577,  ...,  0.3275,  1.0944,  0.1245]],

         [[-0.3125, -0.1694,  1.1855,  ..., -1.5813, -1.2669,  2.0109],
          [ 0.3140,  1.0704,  2.3796,  ..., -2.6772, -0.4662, -1.1391],
          [ 0.3703,  0.0513,  1.6098,  ..., -0.4212, -0.2218, -0.1952]],

         [[ 0.6365,  0.6843, -0.3397,  ...,  0.2008, -1.8343,  0.3040],
          [-1.5459, -0.0324,  0.5153,  ...,  0.3349, -1.4821, -0.3019],
          [ 0.8988, -0.4903, -1.4935,  ...,  5.4315, -0.4303, -1.6672]],

         ...,

         [[-3.

In [104]:
input_ids2

tensor([[    48,  67097,      2, 250004]])

In [82]:

# Keep element 0 of each `outputs[2]` value.
# NOTE(review): presumably the embedding-layer output, as the names suggest - confirm.
embedding_output_1 = last_hidden_states[0]
embedding_output_2 = last_hidden_states2[0]



In [None]:
outputs.logits.size()

In [None]:
outputs2.logits.size()

In [86]:


# Average each tensor over dimension 1.
# NOTE(review): presumably (batch, tokens, hidden) -> (batch, hidden) - confirm.
hs1=embedding_output_1.mean(1)
hs2=embedding_output_2.mean(1)

# Cosine similarity computed along dimension 1.
cos = torch.nn.CosineSimilarity(dim=1)

q=cos(hs1, hs2)
# Show the first similarity value as a plain Python number.
q.tolist()[0]

0.8246103525161743

In [None]:
# NOTE(review): `l1` and `l2` are not defined in any visible cell, so this
# raises NameError on a fresh kernel - define them or remove this cell.
cos(l1, l2)

In [None]:
import spacy
# Load the "nl_core_news_lg" spaCy pipeline.
nlp = spacy.load("nl_core_news_lg")
# Process the whole tag string at once; its tokens are the reference tags.
real_tags=nlp(tags)


In [None]:
def tagging(real_tags,summary_tags,threshold=None):
    """Find the reference tags that are similar to tokens of a summary.

    Every token of ``real_tags`` is compared with every token of
    ``summary_tags`` via ``token.similarity(...)``, rounded to 3 decimals.

    Args:
        real_tags: iterable of reference tag tokens (objects with a
            ``similarity`` method).
        summary_tags: iterable of summary tokens.
        threshold: minimum score (exclusive) for a match. Defaults to the
            module-level ``SIMILARITY``.

    Returns:
        dict: maps each matching tag token to the best score it reached
        against any summary token. Previously a later, lower score could
        overwrite an earlier, higher one.
    """
    if threshold is None:
        threshold = SIMILARITY

    summary_tag_list={}

    for _a in summary_tags:
        for token in real_tags:
            q=round(token.similarity(_a),3)

            # An unseen tag must beat the threshold; an already stored tag
            # must beat its current best score.
            if q > summary_tag_list.get(token, threshold):
                summary_tag_list[token]=q
    return summary_tag_list

In [None]:
# For every summary, keep the five best-scoring tags (highest score first).
keys=[]
for position, summary_text in enumerate(summary):
    print(position)
    summary_tags=nlp(summary_text.lower())

    summary_tag_list=tagging(real_tags,summary_tags)
    # Stable sort by score, descending; ties keep their insertion order.
    ranked_pairs = sorted(
        summary_tag_list.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )

    keys.append(dict(ranked_pairs[:5]))

In [None]:
# One row per article; the single DATE value is repeated on every row.
data = pd.DataFrame({
    'date': DATE,
    'numac': numacs,
    'nltext': nl_list,
    'nllink': numac_links,
    'summary': summary,
    'nltags': keys,
})


In [None]:
keys

In [None]:
import numpy as np
# Render every tag dict as text and drop the surrounding braces; an empty
# dict becomes the empty string.
nltags_text = data['nltags'].astype(str).str.strip('{}')
# Only empty (or whitespace-only) entries become NaN. The previous pattern ''
# with regex=True matches every string, which turned all rows into NaN.
data['nltags'] = nltags_text.replace(r'^\s*$', np.nan, regex=True)
# Drop the articles that received no tag; rebinding instead of inplace=True.
data = data.dropna(subset=["nltags"])
data

In [None]:
#from btax.taxtag.models import Article
# Print the date and numac of every remaining row.
for date_value, numac_value in zip(data['date'], data['numac']):
    print(date_value, numac_value)

 


# class Article(models.Model):
#     date = models.DateField()
#     numac = models.CharField(max_length=15)
#     link=models.CharField(max_length=150)
#     nl_text=models.TextField()
#     nl_sum=models.TextField()
#     nl_tags=models.TextField()
#     created_at = models.DateTimeField(auto_now_add=True)
#     updated_at = models.DateTimeField(auto_now=True)


In [None]:
list(data['nltags'])

In [None]:
# Pair every tag dict with its summary.
# NOTE(review): the name `all` shadows the builtin all() for the rest of the
# session - consider renaming once no later cell depends on it.
all=list(zip(keys,summary))