In [None]:
!pip install openai
!pip install tiktoken
!pip install fuzzywuzzy


import pickle
import os
import openai
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
import numpy as np

# Mount Google Drive at /content/drive; the data paths used in the following
# cells all live under /content/drive/MyDrive/.
drive.mount('/content/drive')

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m71.7/73.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8
Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Mou

In [None]:
## Get the data



## FULL DATA

# FINAL COMBINED DATA
# Locations of the pickled `X` and `y` objects on the mounted drive.
# Adjust both paths if the files are stored somewhere else.
X_path = "/content/drive/MyDrive/thesis_files/X_SBERT_FULLDATA_Final.pkl"
y_path = "/content/drive/MyDrive/thesis_files/y_SBERT_FULLDATA_Final.pkl"

def split_data(X, y, train_size, val_size, test_size, random_state):
    """Split ``X`` and ``y`` into train, validation and test subsets.

    The split happens in two passes of ``train_test_split``: the test share is
    carved off first, then the remainder is divided into train and validation.

    Args:
        X: Samples, in any container ``train_test_split`` accepts.
        y: Targets aligned with ``X``.
        train_size: Fraction of the full data used for training.
        val_size: Fraction of the full data used for validation.
        test_size: Fraction of the full data used for testing.
        random_state: Seed forwarded to both splits.

    Returns:
        The tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        AssertionError: If the three fractions do not sum to 1.
    """
    # Compare with a tolerance: exact float equality rejects valid inputs such
    # as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
    total = train_size + val_size + test_size
    assert abs(total - 1.0) < 1e-9, "train_size, val_size, and test_size must sum up to 1"

    # First pass: separate the test set from everything else.
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # The remainder only holds the train + validation share, so the train
    # fraction has to be re-expressed relative to that remainder.
    train_size_adjusted = train_size / (train_size + val_size)

    # Second pass: split the remainder into train and validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=1 - train_size_adjusted, random_state=random_state
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# Deserialize the `X` object from its pickle file.
# NOTE: pickle.load can execute arbitrary code while loading; only use it on
# files you created yourself or otherwise trust.
with open(X_path, 'rb') as pkl_file:
    X = pickle.load(pkl_file)

# Deserialize the matching `y` object.
with open(y_path, 'rb') as pkl_file:
    y = pickle.load(pkl_file)


# Proportions of the data assigned to train / validation / test.
train_size, val_size, test_size = 0.8, 0.1, 0.1

(X_train, X_val, X_test,
 y_train, y_val, y_test) = split_data(
    X,
    y,
    train_size=train_size,
    val_size=val_size,
    test_size=test_size,
    random_state=42,
)


In [None]:
import json

# Directory on the mounted drive that holds the JSON files
path = "/content/drive/MyDrive/thesis_files/"

# all_documents.json -> all_documents_final
with open(os.path.join(path, 'all_documents.json')) as f:
    all_documents_final = json.load(f)

# highlighted_documents.json -> highlighted_documents_final
with open(os.path.join(path, 'highlighted_documents.json')) as f:
    highlighted_documents_final = json.load(f)

# gold_highlighted_documents.json -> gold_highlighted_documents_final
with open(os.path.join(path, 'gold_highlighted_documents.json')) as f:
    gold_highlighted_documents_final = json.load(f)



In [None]:
### Setting up OpenAI API creds ###

# Credentials must not be written into the notebook. The key is read from the
# OPENAI_API_KEY environment variable; if it is missing, it is requested
# interactively without echoing it into the cell output.
from getpass import getpass

openai_api_key = os.environ.get('OPENAI_API_KEY')
if not openai_api_key:
    openai_api_key = getpass("OpenAI API key: ")
    os.environ['OPENAI_API_KEY'] = openai_api_key

# An empty string is as useless as a missing key, so test truthiness.
assert openai_api_key, "OpenAI API key not found."
openai.api_key = openai_api_key

# The organization is optional: use OPENAI_ORGANIZATION when it is set and
# otherwise keep whatever value the library already holds.
openai.organization = os.environ.get('OPENAI_ORGANIZATION', openai.organization)

In [None]:



# Prepare the DataFrame to collect results
# NOTE(review): no cell shown in this notebook reads this frame afterwards -
# get_GPT_highlights() builds its own `result_df` - so this looks like a leftover.
result_df = pd.DataFrame(columns=['Document', 'Highlighted_Sentences', 'Truncated'])

import openai
import pandas as pd
import tiktoken

# Instruction text; analyze_document() places it after the document text.
instruction_prompt = "You are a model tasked to label sentences as highlights or normal text. Go through the full document above and return only the exact sentences that you think a human reader would highlight. Do not change sentences or hallucinate and only return full sentences."
# Tokenizer for the chat model, used to measure the instruction's token count.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
instruct_count = len(encoding.encode(instruction_prompt))
print(instruct_count)

# Tokens left out of 4096 after the instruction alone; analyze_document()
# additionally subtracts the completion budget and a safety margin.
remaining_token_count = 4096 - instruct_count
print(remaining_token_count)

def analyze_document(document, instruction, instruct_count, out_count,
                     model="gpt-3.5-turbo", context_window=4096):
    """Ask the chat model which sentences of ``document`` it would highlight.

    NOTE: a later cell of this notebook defines ``analyze_document`` again; on
    a top-to-bottom run that later definition is the one in effect.

    Args:
        document: List of sentence strings; entries equal to ``0`` are treated
            as padding and dropped.
        instruction: Instruction text placed after the document text.
        instruct_count: Number of tokens ``instruction`` occupies.
        out_count: Tokens reserved for the completion (sent as ``max_tokens``).
        model: Chat model used for both tokenization and the request.
        context_window: Total number of tokens the model accepts.

    Returns:
        ``(highlighted_sentences, truncated_info)``. ``highlighted_sentences``
        holds the non-blank lines of the model reply. ``truncated_info`` is
        ``False`` when the whole document fit; otherwise it is the index of the
        last sentence that was kept (``-1`` if none could be kept).
    """
    # Token budget for the document text. The extra 10 tokens are a safety
    # margin: the chat message format adds a handful of framing tokens on top
    # of the raw text.
    docs_token_count = context_window - (instruct_count + out_count + 10)

    encoding = tiktoken.encoding_for_model(model)

    # Remove padding sentences and join into a single string
    prompt_sentences = [sentence for sentence in document if sentence != 0]
    prompt = " ".join(prompt_sentences)

    truncated = False
    token_count = len(encoding.encode(prompt))

    # Drop sentences from the end until the text fits. The emptiness check
    # guarantees termination: with a non-positive budget the loop would
    # otherwise keep slicing an already empty list forever.
    while token_count > docs_token_count and prompt_sentences:
        truncated = True
        prompt_sentences = prompt_sentences[:-1]
        prompt = " ".join(prompt_sentences)
        token_count = len(encoding.encode(prompt))

    # Index of the last retained sentence, or False when nothing was cut
    truncated_info = len(prompt_sentences) - 1 if truncated else False

    full_prompt = f"{prompt}\n{instruction}"

    # Create the chat completion using the prompt and instruction
    chat_completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": full_prompt}],
        max_tokens=out_count
    )

    # One candidate sentence per reply line; blank separator lines are not
    # sentences and are skipped.
    reply = chat_completion['choices'][0]['message']['content']
    highlighted_sentences = [line for line in reply.split('\n') if line.strip()]

    return highlighted_sentences, truncated_info








48
4048


In [None]:
import re

def extract_text_within_quotes(sentences):
    """Clean model reply lines: strip bullet dashes and unwrap quoted text.

    NOTE: a later cell of this notebook defines this function again; this
    version behaves the same way as that later definition.

    Args:
        sentences: Iterable of strings, one per reply line.

    Returns:
        A list with one entry per input line. If the line contains a
        double-quoted span (on a single line), the text of the first such span
        is returned; otherwise the line itself, stripped of surrounding
        whitespace and leading dashes.
    """
    cleaned_sentences = []
    for sentence in sentences:
        # Removing leading dash and spaces
        sentence = sentence.strip().lstrip('-').strip()

        # Two quote characters do not guarantee a match ('.' does not cross a
        # newline), so the match has to be checked before it is used.
        match = re.search(r'"(.*?)"', sentence)
        cleaned_sentences.append(match.group(1) if match else sentence)
    return cleaned_sentences




In [None]:
### Ready Code ###


from fuzzywuzzy import process
import re
import openai
import pandas as pd
import tiktoken

# Instruction text sent with every document; analyze_document() places it
# after the document text.
instruction_prompt = "You are a model tasked to label sentences as highlights or normal text. Go through the full document above and return only the exact sentences that you think a human reader would highlight. Do not change sentences or hallucinate and only return full sentences."


def analyze_document(document, instruction, instruct_count, out_count,
                     model="gpt-3.5-turbo", context_window=4096):
    """Ask the chat model which sentences of ``document`` it would highlight.

    Args:
        document: List of sentence strings; entries equal to ``0`` are treated
            as padding and dropped.
        instruction: Instruction text placed after the document text.
        instruct_count: Number of tokens ``instruction`` occupies.
        out_count: Tokens reserved for the completion (sent as ``max_tokens``).
        model: Chat model used for both tokenization and the request.
        context_window: Total number of tokens the model accepts.

    Returns:
        ``(highlighted_sentences, truncated_info)``. ``highlighted_sentences``
        holds the non-blank lines of the model reply. ``truncated_info`` is
        ``False`` when the whole document fit; otherwise it is the index of the
        last sentence that was kept (``-1`` if none could be kept).
    """
    # Token budget for the document text. The extra 10 tokens are a safety
    # margin: the chat message format adds a handful of framing tokens on top
    # of the raw text.
    docs_token_count = context_window - (instruct_count + out_count + 10)

    encoding = tiktoken.encoding_for_model(model)

    # Remove padding sentences and join into a single string
    prompt_sentences = [sentence for sentence in document if sentence != 0]
    prompt = " ".join(prompt_sentences)

    truncated = False
    token_count = len(encoding.encode(prompt))

    # Drop sentences from the end until the text fits. The emptiness check
    # guarantees termination: with a non-positive budget the loop would
    # otherwise keep slicing an already empty list forever.
    while token_count > docs_token_count and prompt_sentences:
        truncated = True
        prompt_sentences = prompt_sentences[:-1]
        prompt = " ".join(prompt_sentences)
        token_count = len(encoding.encode(prompt))

    # Index of the last retained sentence, or False when nothing was cut
    truncated_info = len(prompt_sentences) - 1 if truncated else False

    full_prompt = f"{prompt}\n{instruction}"

    # Create the chat completion using the prompt and instruction
    chat_completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": full_prompt}],
        max_tokens=out_count
    )

    # One candidate sentence per reply line; blank separator lines are not
    # sentences and are skipped.
    reply = chat_completion['choices'][0]['message']['content']
    highlighted_sentences = [line for line in reply.split('\n') if line.strip()]

    return highlighted_sentences, truncated_info


def extract_text_within_quotes(sentences):
    """Clean model reply lines: strip bullet dashes and unwrap quoted text.

    Each line is stripped of surrounding whitespace and leading dashes. When
    the line holds at least two double quotes and a quoted span can be matched,
    the text of the first span is returned; otherwise the stripped line is.

    Args:
        sentences: Iterable of strings, one per reply line.

    Returns:
        List of cleaned strings, one per input line, in the same order.
    """
    quoted_span = re.compile(r'"(.*?)"')

    def _clean(raw_line):
        text = raw_line.strip().lstrip('-').strip()
        # Only look for a quoted span when there are enough quote characters.
        found = quoted_span.search(text) if text.count('"') >= 2 else None
        return found.group(1) if found else text

    return [_clean(raw_line) for raw_line in sentences]




def verify_cleaned_sentences(cleaned_sentences, source_document, threshold=80):
    """Map model-returned sentences back onto sentences of the source document.

    Every sentence is first looked up verbatim in ``source_document``. If that
    fails, the closest source sentence found by fuzzy matching is accepted when
    its score reaches ``threshold``. Sentences that cannot be matched are
    reported on stdout and left out of the result.

    Args:
        cleaned_sentences: Sentences extracted from the model reply.
        source_document: List of sentences of the original document; non-string
            entries (padding) are ignored for fuzzy matching.
        threshold: Minimum fuzzy score for a match to be accepted.

    Returns:
        List of sentences taken from ``source_document`` (exact hits keep the
        model's identical text; fuzzy hits use the source wording).
    """
    # Only real text can be a fuzzy-match candidate; padding entries are not.
    candidates = [sentence for sentence in source_document if isinstance(sentence, str)]
    matched_sentences = []

    for cleaned_sentence in cleaned_sentences:
        # Blank lines from the reply carry nothing that could be verified.
        if not cleaned_sentence or not cleaned_sentence.strip():
            continue

        # Try a perfect match
        if cleaned_sentence in source_document:
            matched_sentences.append(cleaned_sentence)
            continue

        # Try a fuzzy match. With no candidates there is no (match, score)
        # pair to unpack, so the lookup is skipped and its result is checked.
        result = process.extractOne(cleaned_sentence, candidates) if candidates else None
        if result is not None and result[1] >= threshold:
            print(f"Match found via fuzzy search: '{cleaned_sentence}'")
            matched_sentences.append(result[0])
            continue

        # Print error if no match found
        print(f"Error: The cleaned sentence could not be found in the source document: '{cleaned_sentence}'")

    return matched_sentences

# Iterate over the dataset X

def get_GPT_highlights(docs_data, instruction_prompt, output_count,
                       output_path='GPT_highlighted_documents_test.json'):
    """Collect the model's highlights for every document and save them as JSON.

    For each document the model is queried, the reply lines are cleaned, and
    the cleaned sentences are verified against the document.

    Args:
        docs_data: Iterable of documents, each a list of sentences.
        instruction_prompt: Instruction text sent with every document.
        output_count: Tokens reserved for each completion.
        output_path: File the resulting frame is written to (JSON).

    Returns:
        A DataFrame with one row per document and the columns ``Document_ID``
        (position in ``docs_data``), ``GPT_Highlighted_Sentences``,
        ``Truncated`` and ``Full_Document``.
    """
    columns = ['Document_ID', 'GPT_Highlighted_Sentences', 'Truncated', 'Full_Document']

    # The instruction is identical for every document, so count it only once.
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    instruct_count = len(encoding.encode(instruction_prompt))

    # Gather plain records and build the frame once at the end; enlarging a
    # DataFrame row by row copies it on every iteration.
    records = []
    for i, document in enumerate(docs_data):
        highlighted_sentences, truncated_info = analyze_document(
            document,
            instruction=instruction_prompt,
            instruct_count=instruct_count,
            out_count=output_count,
        )
        cleaned_sentences = extract_text_within_quotes(highlighted_sentences)
        matched_sentences = verify_cleaned_sentences(cleaned_sentences, document)

        records.append({
            'Document_ID': i,
            'GPT_Highlighted_Sentences': matched_sentences,
            'Truncated': truncated_info,
            'Full_Document': document,
        })

    result_df = pd.DataFrame(records, columns=columns)

    # Save the DataFrame to a JSON file
    result_df.to_json(output_path)

    return result_df


def check_sentences_in_document(df):
    """Report, row by row, whether each highlighted sentence is in its document.

    For every row the function prints the document id, whether all entries of
    ``GPT_Highlighted_Sentences`` are contained in ``Full_Document``, and - if
    not - the sentences that are missing. Nothing is returned.

    Args:
        df: DataFrame with the columns ``Document_ID``,
            ``GPT_Highlighted_Sentences`` and ``Full_Document``.
    """
    for _, record in df.iterrows():
        source_sentences = record['Full_Document']
        missing = [
            candidate
            for candidate in record['GPT_Highlighted_Sentences']
            if candidate not in source_sentences
        ]

        print(f"Document ID: {record['Document_ID']}")
        print(f"All sentences found: {not missing}")
        if missing:
            print("Sentences not found:")
            for candidate in missing:
                print(candidate)
        print()  # Blank line for separation









In [None]:
### Run ###


# NOTE(review): remove_first_n_sentences is defined in a later cell of this
# notebook, so on a fresh kernel this cell raises NameError unless that cell
# has been run first.
# Drops the first n (default 5) sentences of every document and removes those
# sentences from both highlight lists.
all_documents_final5, highlighted_documents_final5, gold_highlighted_documents_final5 = remove_first_n_sentences(all_documents_final, highlighted_documents_final, gold_highlighted_documents_final)


In [None]:
# Query the model for every trimmed document, reserving 200 tokens for each
# completion. get_GPT_highlights also writes its result frame to a JSON file.
results_final = get_GPT_highlights(all_documents_final5, instruction_prompt, 200)

In [None]:
# Persist the result frame as JSON (pandas' default orientation).
results_final.to_json('results_final.json')


In [None]:
# Saving results_final
# `results_final` is the DataFrame returned by get_GPT_highlights. json.dump
# cannot serialise a DataFrame (TypeError), and opening the file with 'w'
# beforehand would already have emptied a previously written copy. The
# DataFrame's own JSON writer is used instead.
results_final.to_json('results_final.json')


In [None]:
from google.colab import files
# Send the saved results file from the Colab runtime to the browser as a download.
files.download('results_final.json')


In [None]:
def remove_first_n_sentences(all_documents, highlighted_documents, gold_highlighted_documents, n=5):
    """Cut the first ``n`` sentences from every document and its highlight lists.

    Args:
        all_documents: List of documents, each a list of sentences.
        highlighted_documents: Per document, the list of highlighted sentences.
        gold_highlighted_documents: Per document, the list of gold highlights.
        n: Number of leading sentences to remove from each document.

    Returns:
        ``(documents, highlights, gold_highlights)``: the documents without
        their first ``n`` sentences, and both highlight lists without any
        sentence that equals one of those removed leading sentences.
    """
    def _without_leading(source_doc, selection):
        # Sentences cut from the document must disappear from the selection too.
        leading = source_doc[:n]
        return [item for item in selection if item not in leading]

    def _trim_each(selections):
        # Pair every selection with its own source document.
        return [
            _without_leading(source_doc, selection)
            for source_doc, selection in zip(all_documents, selections)
        ]

    trimmed_documents = [document[n:] for document in all_documents]
    trimmed_highlights = _trim_each(highlighted_documents)
    trimmed_gold = _trim_each(gold_highlighted_documents)

    return trimmed_documents, trimmed_highlights, trimmed_gold


In [None]:
# Same call, with the same arguments, as the earlier "Run" cell; the results
# are stored under *_updated names.
# NOTE(review): confirm whether both sets of variables are still needed.
all_documents_final5_updated, highlighted_documents_final5_updated, gold_highlighted_documents_final5_updated = remove_first_n_sentences(all_documents_final, highlighted_documents_final, gold_highlighted_documents_final)


In [None]:
# Attach both trimmed highlight lists and the trimmed documents as columns.
# NOTE(review): assumes every list has one entry per row of results_final, in
# the same order - confirm.
results_final['Custom_Model_Highlights'] = highlighted_documents_final5_updated
results_final['Gold_Highlights'] = gold_highlighted_documents_final5_updated
results_final['Original_Full_Document'] = all_documents_final5

In [None]:
# NOTE(review): this removes two of the columns added in the previous cell,
# while a later cell renames 'Custom_Model_Highlights' - which no longer exists
# when the cells run top to bottom. Re-running this cell also raises KeyError
# because the columns are already gone. Confirm the intended cell order.
results_final.drop(columns=['Custom_Model_Highlights', 'Gold_Highlights'], inplace=True)


In [None]:
# `results` is not assigned in any cell shown in this notebook; the frame
# produced by get_GPT_highlights is `results_final`.
check_sentences_in_document(results_final)

Document ID: 0
All sentences found: True

Document ID: 1
All sentences found: True



In [None]:
# Rename columns for the exported dataset.
# NOTE(review): rename() skips labels that are not present, so if the drop
# cell above has run, only 'GPT_Highlighted_Sentences' is actually renamed.
results_final.rename(columns={
    'Custom_Model_Highlights': 'Custom Highlights',
    'GPT_Highlighted_Sentences': 'Benchmark Highlights'
}, inplace=True)


In [None]:
# Export the final frame as a JSON array with one object per row.
results_final.to_json('dataset_for_humaneva_final_0308_updated_naming.json', orient='records')