In [39]:
# import basic packages
import os 
import json
import lzma

import pandas as pd
import numpy as np

#------------------------------------------------------------------------------------------

import nltk
nltk.download('stopwords')

from string import punctuation
from nltk.corpus import stopwords

# define what contain punctuation and stpwords
#NUMBER is a fix to remove number in string format
#NUMBERS =[str(x) for x in (list(range(0,100)))]
PUNCTUATION = [char for char in punctuation if char not in ["/","-", "@","°","§","_"]]
STOPWORDS = stopwords.words("french")+ ['\\n']

# define function to remove punctuation
def remove_punct(text):
    # remove punctuation
    text = "".join([char for char in text if char not in PUNCTUATION])
    return(text)

# define function to remove stopwords
def remove_stops(text_tokenized):
    # remove stopwords
    text_tokenized = [word for word in text_tokenized if word not in STOPWORDS]
    return(text_tokenized)

# define tokenization function
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

def text_tokenized(text):
    words = word_tokenize(text)
    return words

# definition to join text
def join_sentences(text):
    for word in text:
        text_join = " ".join(text)
    return text_join

#------------------------------------------------------------------------------------------

#define text stemmizationfunction
snow = SnowballStemmer('french')

def stemmization(text_tokenized_no_stops):
    stemmed_sentence = []
    # Word Tokenizer
    for word in text_tokenized_no_stops:
        # Apply Stemming
        stemmed_sentence.append(snow.stem(word))
    stemmed_text = " ".join(stemmed_sentence)
    return stemmed_text

#------------------------------------------------------------------------------------------

# define text lementization (whith tokenization and changing tokenization method to tokenize base on space)
import spacy
from spacy.lang.fr.examples import sentences
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        spaces = [True] * len(words)
        # Avoid zero-length tokens
        for i, word in enumerate(words):
            if word == "":
                words[i] = " "
                spaces[i] = False
        # Remove the final trailing space
        if words[-1] == " ":
            words = words[0:-1]
            spaces = spaces[0:-1]
        else:
           spaces[-1] = False

        return Doc(self.vocab, words=words, spaces=spaces)
    
#text lemmatization
nlp = spacy.load('fr_core_news_md')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
#nlp = spacy.blank("fr")

def lemmatized(text):
    lemmatized_sentence = []
    doc = nlp(text)
    for token in doc:
        # Apply Stemming
        lemmatized_sentence.append(token.lemma_)
    return lemmatized_sentence

#------------------------------------------------------------------------------------------

def text_stemmed(text):
    # text preprocessing stemmed
    text_lower = str.lower(text)
    text_cleaned = remove_punct(text_lower)
    tokenized = text_tokenized(text_cleaned)
    tokenized_no_stop = remove_stops(tokenized)
    text_final_stemmed = stemmization(tokenized_no_stop)
    return (text_final_stemmed)

def text_lematization(text):
    # text preprocessing lemmatized
    text_lower = str.lower(text)
    text_cleaned = remove_punct(text_lower)
    text_lemmatized = lemmatized(text_cleaned)
    text_tokenized_no_stops = remove_stops(text_lemmatized)
    text_final_lemmatized = join_sentences(text_tokenized_no_stops)
    return (text_final_lemmatized)

def text_cleaned(text):
    # text preprocessing stemmed
    text_lower = str.lower(text)
    text_cleaned = remove_punct(text_lower)
    tokenized = text_tokenized(text_cleaned)
    tokenized_no_stop = remove_stops(tokenized)
    text_cleaned = join_sentences(tokenized_no_stop)

    return (text_cleaned)

#------------------------------------------------------------------------------------------

# text sementic comparison
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-mpnet-base-v2')

def text_vectorization(text):
    vector = model.encoder(text)
    return vector
    


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/juliendesmedt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/juliendesmedt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/juliendesmedt/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/juliendesmedt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


update

In [140]:
# (no ref numbers) new article (5) 200-2021-013474  :  200-2021-011085 
# no updates '200-2021-015135' : 200-2021-013469
# no updates '200-2020-009334': '200-2019-010016',
# '200-2020-000391': '200-2019-010017'
# no changes '200-2021-013463': '200-2020-000391'
child_objectID =  '200-2021-013474'
parent_objectID = '200-2021-011085'

from algoliasearch.search_client import SearchClient

#search user
client = SearchClient.create("QVBA9ZZPRA", "520ea8dd3ca37da55f2c5d86729b23a8")

# index is the database
index = client.init_index("KPMG_index")


def get_content(objectID):
    return index.search('', {'filters': f'objectID:{objectID}'})['hits'][0]['content']

def text_comparison(child_objectID, parent_objectID):

    child = get_content(child_objectID)
    parent = get_content(parent_objectID)

    # key_list_child = child.keys()
    # key_list_parent = parent.keys()

    score_dict = {}
    final_score_dict = {}
    child_art = 0
    for pdf_2art in child:
        article_pdf2 = text_lematization(pdf_2art)
        each_score = {}
        parent_art = 0
        for pdf_1art in parent:
            article_pdf1 = text_lematization(pdf_1art)
            first = model.encode(article_pdf2) 
            second = model.encode(article_pdf1)
            cos_sim = util.pytorch_cos_sim(first, second)
            each_score[parent_art] = cos_sim
            parent_art += 1
            #print(f'{pdf_2art} score against {pdf_1art}',cos_sim)
        score_dict[child_art] = each_score
        corresponding= {}
        art_number = max(score_dict[child_art], key=score_dict[child_art].get)
        max_score = max(score_dict[child_art].values())
        corresponding[art_number] = max_score
        final_score_dict[child_art] = corresponding

        if list(max_score)[0] < 0.85:
            corresponding['article_status'] = 'New'
        elif 0.85 <= list(max_score)[0] < 0.999:
            corresponding['article_status'] = 'Update'
        else: 
            corresponding['article_status'] = 'NoUpdate'
            #final_score_dict.pop(key)

        final_score_dict[child_art] = corresponding
        child_art += 1
    
    Child_content = {}
    df = pd.DataFrame()
    for key, value in final_score_dict.items():
        Child_content['article'] = key
        Child_content['content_child'] = child[key]
        Child_content['content_parent'] = parent[list(value.keys())[0]]
        Child_content['score'] = list(value.values())[0]
        Child_content['status'] = list(value.values())[1]
        df = df.append(Child_content, ignore_index=True)
    return final_score_dict, df

    

In [141]:
#Lementization
final_score_dict , df= text_comparison(child_objectID, parent_objectID)
final_score_dict

  df = df.append(Child_content, ignore_index=True)
  df = df.append(Child_content, ignore_index=True)
  df = df.append(Child_content, ignore_index=True)
  df = df.append(Child_content, ignore_index=True)
  df = df.append(Child_content, ignore_index=True)


{0: {0: tensor([[0.8998]]), 'article_status': 'Update'},
 1: {1: tensor([[0.9429]]), 'article_status': 'Update'},
 2: {2: tensor([[0.8500]]), 'article_status': 'Update'},
 3: {3: tensor([[0.9178]]), 'article_status': 'Update'},
 4: {3: tensor([[0.6613]]), 'article_status': 'New'}}

In [142]:
df
    

Unnamed: 0,article,content_child,content_parent,score,status
0,0,ART.1er\nLa présente convention collective de ...,Article 1er. La présente convention collective...,[[tensor(0.8998)]],Update
1,1,"ART.2\nEn application de l'article 2, § 3 de l...","Art. 2. En application de l'article 2, § 3 de ...",[[tensor(0.9429)]],Update
2,2,"ART.3\n§ 1er.\nEn application de l'article 4, ...","Art. 3. En application de l'article 4, § 4 de ...",[[tensor(0.8500)]],Update
3,3,ART.4\nTout ce qui n'est pas explicitement pré...,Art. 4. Tout ce qui n'est pas explicitement pr...,[[tensor(0.9178)]],Update
4,4,ART.5\nLa présente convention collective de tr...,Art. 4. Tout ce qui n'est pas explicitement pr...,[[tensor(0.6613)]],New


#GPT

In [22]:
import openai
#Koumeyl
key_openIA = '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'



In [154]:
def article_summary(text):
    #summary : 

    chatGPT_api_key = key_openIA
    # Set the API key
    openai.api_key = chatGPT_api_key

    file_txt = str(text)
    txt_string = file_txt.lower()
    txt_string = txt_string[:10000]
    prompt = f"Résume ce document: {txt_string}"
    response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=prompt,
    max_tokens=590,
    n=1,
    stop=None,
    temperature=0.5)
    summary = response["choices"][0]["text"]
    return summary

def article_comparison(child,parent):

    # Define the prompt
    prompt = f"Résume en quelques mots les différences entre {child} et {parent}"

    # Query the API
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=1024,
        n=1,
        stop=None,
        temperature=0.5
    )

    # Print the response
    diffs = response["choices"][0]["text"]
    return diffs
    

In [164]:
from numpy import NaN

df.loc[df['status'].isin(['New','Update']),'summary'] = article_summary(df['content_child'])
df.loc[df['status'].isin(['Update']),'comparison'] = article_comparison(df['content_child'], df['content_parent'])


In [158]:
article_comparison(df['content_child'][0],df['content_parent'][0])

'Différences: Les différences entre les chapitres I et II sont que le chapitre I définit le cadre général de la convention collective de travail et le chapitre II définit les conditions de crédit-temps.'

"\nArt. 5 diffère par la date d'entrée en vigueur et la date de fin des effets."