In [None]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import pathlib
from pathlib import Path
import json
import os
import tiktoken
import summary_scripts as ss

In [None]:
import openai
from openai import OpenAI

In [None]:
# Identifier of the chat-completion model.
# NOTE(review): none of the functions visible in this notebook read this
# constant; they hard-code the same string as their own default argument.
# Confirm whether it is still needed or should be passed in explicitly.
COMPLETIONS_MODEL = "gpt-3.5-turbo-1106"

In [None]:
### Input functions. 

### read in es.txt file
def read_lines(file, encoding=None):
    """Read a text file and return its non-empty lines, stripped.

    Lines containing the page marker ``'---  page '`` are dropped, as are
    lines that are empty after stripping surrounding whitespace.

    file: path of the text file to read
    encoding: text encoding passed to ``open``. The default ``None`` keeps the
        previous behaviour (platform default encoding); pass ``"utf-8"`` to
        read UTF-8 files reliably on every platform.
    return: list of stripped lines, in file order
    """
    with open(file, encoding=encoding) as f:
        # Iterate the file lazily instead of materialising readlines(), and
        # strip each line only once.
        stripped = (raw.strip() for raw in f if '---  page ' not in raw)
        return [line for line in stripped if line]

def read_txt(file, encoding=None):
    """Read a text file and return its content as one space-joined string.

    Lines containing the page marker ``'---  page '`` and lines that are empty
    after stripping are skipped; the remaining stripped lines are joined with
    a single space.

    file: path of the text file to read
    encoding: text encoding passed to ``open``. The default ``None`` keeps the
        previous behaviour (platform default encoding); pass ``"utf-8"`` to
        read UTF-8 files reliably on every platform.
    return: the joined text ('' when no line survives the filter)
    """
    with open(file, encoding=encoding) as f:
        # Lazy iteration over the file; each line is stripped exactly once.
        kept = (raw.strip() for raw in f if '---  page ' not in raw)
        return ' '.join(piece for piece in kept if piece)

### input path to the file
def read_html(file):
    """Return the text content of an HTML file with the markup removed.

    file: path of the HTML file; it is read as UTF-8
    return: the string produced by BeautifulSoup's get_text() using the
        'html.parser' parser
    """
    # NOTE(review): BeautifulSoup is not imported in the cells visible here;
    # it must already be in the kernel namespace (presumably via
    # `from bs4 import BeautifulSoup`) or this raises NameError - confirm.
    with open(file, 'r', encoding="utf-8") as handle:
        markup = handle.read()
    return BeautifulSoup(markup, 'html.parser').get_text()

In [None]:
def partition_document(txt, CHUNK_SIZE = 5000):
    '''
    Simple segmentation function: split a long document into chunks of
    similar size, cutting only at sentence boundaries.

    The number of segments is the token count (from ss.count_tokens) divided
    by CHUNK_SIZE, rounded, and never less than one. Each cut is placed at the
    first '. ' found at or after the target character offset; the period stays
    with the left segment and the following space is dropped.

    txt: long document
    CHUNK_SIZE: size of segments, in tokens
    return: list of text segments. An empty document gives []. A document
        shorter than a chunk gives [txt]. Fewer segments than planned are
        returned when no further '. ' boundary exists.
    '''
    if not txt:
        # Nothing to split; also avoids a division by zero further down.
        print(0)
        return []

    token_count = ss.count_tokens(txt)

    # Rounding can give 0 for documents shorter than half a chunk, which used
    # to raise ZeroDivisionError; such a document is simply one segment.
    num_segments = max(1, int(np.round(token_count / CHUNK_SIZE)))
    print(num_segments)

    split_length = int(np.round(len(txt) / num_segments))

    segments = []
    head = 0
    for _ in range(num_segments - 1):
        # Look for the next sentence end at or after the target offset.
        boundary = txt.find('. ', head + split_length)
        if boundary == -1:
            # No sentence boundary left: the remainder becomes the last
            # segment (the character-by-character scan used to loop forever).
            break
        tail = boundary + 1              # keep the period in this segment
        segments.append(txt[head:tail])
        head = tail + 1                  # skip the space after the period

    remainder = txt[head:]
    # Skip an empty trailing piece when the last cut fell at the very end.
    if remainder or not segments:
        segments.append(remainder)
    return segments

In [None]:
def get_completion_long(mssges, modelname = "gpt-3.5-turbo-1106", temp = 1.0,max_tokens = 1000):
    '''
    Thin wrapper around a single chat-completion request.

    mssges: list of message dicts, already in the shape expected by the
        chat-completions endpoint
    modelname: model identifier sent with the request
    temp: sampling temperature sent with the request
    max_tokens: upper bound on tokens generated for the completion

    Prints the prompt and completion token counts reported in the response's
    usage field and returns the completion object unchanged.
    '''
    # NOTE(review): `client` is a global that is not created in the cells
    # visible here; presumably an OpenAI() instance - confirm.
    request = dict(
        model=modelname,
        messages=mssges,
        max_tokens=max_tokens,
        temperature=temp,
    )
    response = client.chat.completions.create(**request)

    # report token consumption for this request
    usage = response.usage
    print("Tokens in prompt: ", usage.prompt_tokens)
    print("Tokens in completion: ", usage.completion_tokens)

    return response

In [None]:
def write_summary_long(segments,savedir,country,save_result = True,token_limit = 14000, model_name = "gpt-3.5-turbo-1106", t = 1.0):
    '''
    Summarizes a sequence of segments from a large document progressively.

    The first segment is summarized on its own. Every later summary is
    produced from the previous summary plus the next segment, so the last
    item of the returned list is the final and complete version.

    segments: [text1,text2,...], a list of consecutive segments from a long document
    savedir: directory where to save the summaries and completion objects
        (created if missing when save_result is True)
    country: country for which to summarize (not used by this function)
    save_result: when True, write every step to savedir:
        completion_seg<idx>.json for each step, summary_t<t>_seg<idx>.txt for
        intermediate steps and summary_t<t>.txt for the final step
    token_limit: maximum number of tokens for the text (not used in this case)
    model_name: name of the model to use; forwarded to get_completion_long
    t: sampling temperature; forwarded to get_completion_long and used in the
        saved file names

    Returns a list of summaries (empty when segments is empty).
    '''
    n_ch = len(segments)
    summary_chain = [] # the list of all segment summaries

    if save_result and n_ch > 0:
        os.makedirs(savedir, exist_ok=True)

    previous_summary = None
    for idx, segment in enumerate(segments):
        if idx == 0:
            ### prepare message for the first segment summary
            # The task list and the document go in a "user" message, matching
            # the chain step below (it used to be sent as a second "system"
            # message).
            msgs = [{"role" : "system", "content": role_system_bullets},
                    {"role" : "user", "content": prpt_bullets_ada + segment}]
        else:
            ### progressive summary revision
            # summary_(i+1) is based on segment_(i+1),summary_(i)
            print("--- Preparing summary indexed " + str(idx))
            content_chain = old_summ_chain + previous_summary + prpt_bullets_ada_chain + segment + prpt_chain
            msgs = [{"role" : "system", "content": role_system_bullets_chain},
                    {"role" : "user", "content": content_chain}]

        # Forward the requested model and temperature so that the request
        # matches the "t" value recorded in the saved file names.
        completion = get_completion_long(msgs, modelname = model_name, temp = t)
        previous_summary = completion.choices[0].message.content
        print(previous_summary)
        summary_chain.append(previous_summary)

        if save_result:
            # Every step is saved, including the first one, so a
            # single-segment document still produces its final summary file.
            _save_summary_step(savedir, idx, idx == n_ch - 1, t, previous_summary, completion)

    print("Total number of summaries written: " + str(len(summary_chain)))
    return summary_chain


def _save_summary_step(savedir, idx, is_final, t, summary_text, completion):
    '''Write one step's summary text and its completion object (as JSON) into savedir.'''
    if is_final:
        summary_name = "summary_t" + str(t) + ".txt"
    else:
        summary_name = "summary_t" + str(t) + "_seg" + str(idx) + ".txt"
    savepath_summary = os.path.join(savedir, summary_name)
    savepath_completion = os.path.join(savedir, "completion_seg"+ str(idx)+ ".json")

    completion_json = json.dumps(ss.convert_chat_completion_to_dict(completion))

    # Explicit UTF-8: summaries may contain characters that the platform
    # default encoding cannot represent.
    with open(savepath_completion, "w", encoding="utf-8") as f:
        f.write(completion_json)

    with open(savepath_summary, "w", encoding="utf-8") as f:
        f.write(summary_text)

In [None]:
##### Prompt engineering
# The prompt fragments below are plain strings that write_summary_long
# concatenates with document text, so the newlines at the start and end of
# each literal are part of the prompt that is sent.

##### normal size document prompt
# System message of the request that summarizes the first segment.
# NOTE(review): the text contains the typo "whos" (presumably "whose"); it is
# left untouched here because the string is sent to the model verbatim.
role_system_bullets = """
You are a multi-lingual international lawyer whos legal specialty is data protection and processing.
Additionally, you know and work with the General Data Protection Regulations (GDPR) of European Union.
You will produce a summary of the document from the data protection authority by answering questions. 
The summary will be in English only.
As a general rule, be clear and precise about your writing. If the answer is unclear, explain how so."""
# Task list for the first segment; write_summary_long appends the segment text
# directly after the trailing "Here is the document:" line.
# The literal must open with exactly three quotes: a fourth quote becomes a
# stray '"' character at the very start of the prompt.
prpt_bullets_ada = """
Please finish the tasks one by one.
1. In one or two sentences, identify the type of the document.
2. Identify the complainant and defendant mentioned in the document, if any. Briefly introduce their roles and functions, if provided by the document. Determine the type of complainant and the type of defendant as individual, or company, or public institution, or public authority. Describe the relationship between these two parties if possible. 
3. Please elaborate the subject matter of the dispute between these two parties, and then explain the data involved in this dispute.
4. What kind of data processing was involved (including but not limited to collection, storage, transfer, publication, dissimation, etc.) and for what purpose (including but not limited to marketing, journalistic purposes, market research, etc.)?
5. What articles of GDPR, if any, does the complainant claim that the defendant potentially violates?
6. Does the legal authority dismiss the submitted complaint or case? 
7. What is the outcome of the dispute? From the final resolution in the document, describe all the enforcements by the authority in detail (including but not limited to: fine, warning, investgation and other penalties or obligations)
8. What facts and evidence are verified or proved by the authority?
9. In case the complaint is approved or the authority releases the enforcement, identify the GDPR articles violated by the defendant, if any. Your answer should only consider the decision made by the data protection authority, together with the verified evidence or facts in the previous steps.
10. Is the document a final decision, or a preliminary decision (e.g. an order to stop data processing until a final decision is made), or an intermediate procedural step (e.g. a request for additional information), or related to a previous procedure, or something else?

Here is the document:

"""
##### progressive summary chain
# Fragments used by write_summary_long for every segment after the first.
# The user message is assembled in this order:
#   old_summ_chain + <previous summary> + prpt_bullets_ada_chain + <segment> + prpt_chain

# System message for the chain requests.
# NOTE(review): contains the typo "whos" (presumably "whose"); left untouched
# because the string is sent to the model verbatim.
role_system_bullets_chain = """You are a multi-lingual international lawyer whos legal specialty is data protection and processing.
Additionally, you know and work with the General Data Protection Regulations (GDPR) of European Union.

"""
# Introduction placed before the previous summary.
# NOTE(review): contains the typo "ealier" (presumably "earlier").
old_summ_chain = """
You will be given three passages. 
First, you have a temporary list of responses in bullet points regarding the ealier parts of a legal decision document:

"""
# Task list placed between the previous summary and the new segment; its last
# line introduces the segment text that is appended right after it.
# NOTE(review): item 7 is worded differently from item 7 of prpt_bullets_ada,
# and the typos "dissimation" / "investgation" appear in both lists - confirm
# whether the two lists are meant to stay in sync.
prpt_bullets_ada_chain = """
Second, you have a list of tasks regarding the document as your guidance:
1. In one or two sentences, identify the type of the document.
2. Identify the complainant and defendant mentioned in the document, if any. Briefly introduce their roles and functions, if provided by the document. Determine the type of complainant and the type of defendant as individual, or company, or public institution, or public authority. Describe the relationship between these two parties if possible. 
3. Please elaborate the subject matter of the dispute between these two parties, and then explain the data involved in this dispute.
4. What kind of data processing was involved (including but not limited to collection, storage, transfer, publication, dissimation, etc.) and for what purpose (including but not limited to marketing, journalistic purposes, market research, etc.)?
5. What articles of GDPR, if any, does the complainant claim that the defendant potentially violates?
6. Does the legal authority dismiss the submitted complaint or case? 
7. What is the outcome of the dispute? Describe all the legal enforcements by the authority in detail, including fine, warning, investgation and other penalties or obligations, etc..
8. What facts and evidence are verified or proved by the authority?
9. In case the complaint is approved or the authority releases the enforcement, identify the GDPR articles violated by the defendant, if any. Your answer should only consider the decision made by the data protection authority, together with the verified evidence or facts in the previous steps.
10. Is the document a final decision, or a preliminary decision (e.g. an order to stop data processing until a final decision is made), or an intermediate procedural step (e.g. a request for additional information), or related to a previous procedure, or something else?
Lastly, here is the latest segment of the same document from which the temporary summary is produced:

"""
# Closing instruction appended after the segment text: asks the model to
# revise the previous responses and output the whole updated list.
prpt_chain = """
Your job is to review the temporary list of responses and use the list of task prompts and the latest segment of the document to add more definite answers. You will make revisions or additions to the old responses if those answers are uncertain, not mentioned, or unclear, etc... Please output the updated whole list of responses , not only the parts you make revisions.
"""