In [13]:
import os
import glob
import nltk
import shutil
import textwrap
from ast import literal_eval
from langchain.llms import OpenAI
from langchain.indexes import GraphIndexCreator
from langchain.document_loaders import TextLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain

import pdf2image
import pytesseract
import pandas as pd
from tqdm.auto import tqdm

# Create the output directory for the knowledge-graph CSVs. os.makedirs is
# used instead of the `!mkdir` shell escape so this is plain Python, works on
# any platform, and does not complain when the directory already exists.
os.makedirs('./KG', exist_ok=True)

# Tokenizer data needed by nltk.sent_tokenize (used in
# split_text_into_sentences).
nltk.download('punkt')

# NOTE: placeholder only -- never commit a real key to version control.
os.environ['OPENAI_API_KEY'] = "sk-" # Your own OpenAI Key 

# Maximum number of characters per chunk produced by split_text.
chunk_size = 16000

def clean_text(text):
    """Drop short lines and figure captions from a block of text.

    A line is kept only when splitting it on single spaces yields more than
    three pieces and it starts with neither "(a)" nor "Figure". The kept
    lines are joined back together with newlines.
    """
    skipped_prefixes = ("(a)", "Figure")
    kept_rows = []
    for line in text.split("\n"):
        # Short lines are treated as section titles / stray fragments.
        if len(line.split(" ")) <= 3:
            continue
        # Sub-figure labels and figure captions.
        if line.startswith(skipped_prefixes):
            continue
        kept_rows.append(line)
    return "\n".join(kept_rows)

def truncate_text(text, max_tokens):
    """Return the first wrapped line of ``text``.

    The text is wrapped with ``textwrap`` using ``max_tokens`` as the line
    width, so despite the parameter name the limit is measured in characters,
    not tokens. An empty string is returned when wrapping produces no lines.
    """
    wrapped_lines = textwrap.wrap(text, width=max_tokens)
    return wrapped_lines[0] if wrapped_lines else ""

def split_text(text, chunk_size):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each piece; must be a positive integer.

    Returns:
        A list of substrings that concatenate back to ``text``. Every piece
        has exactly ``chunk_size`` characters except possibly the last one.
        An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Without this check a
            zero or negative size would never advance past the start of the
            text and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

def split_text_into_sentences(text, num_sentences):
    """Tokenize ``text`` into sentences and group them in order.

    Sentences come from ``nltk.sent_tokenize``. The result is a list of
    lists, each holding up to ``num_sentences`` consecutive sentences; the
    final group may be shorter.
    """
    all_sentences = nltk.sent_tokenize(text)
    groups = []
    for offset in range(0, len(all_sentences), num_sentences):
        groups.append(all_sentences[offset:offset + num_sentences])
    return groups

def flatten_list(nested_list):
    """Recursively flatten nested lists into a single flat list.

    Only ``list`` instances are expanded; any other item (including tuples)
    is copied to the output unchanged, in its original order.
    """
    flat = []
    for element in nested_list:
        if not isinstance(element, list):
            flat.append(element)
            continue
        # Nested list: splice in its flattened contents.
        flat += flatten_list(element)
    return flat

def move_file(source_path, destination_path):
    """Move ``source_path`` into ``destination_path``, creating it if absent.

    The destination is created with ``os.makedirs`` when nothing exists at
    that path yet. Any exception raised by the move itself is caught and
    printed rather than propagated; the function always returns ``None``.
    """
    # The destination folder has to exist before the file can be moved into it.
    destination_missing = not os.path.exists(destination_path)
    if destination_missing:
        os.makedirs(destination_path)

    try:
        shutil.move(source_path, destination_path)
        success_message = f"File moved successfully from '{source_path}' to '{destination_path}'."
        print(success_message)
    except Exception as err:
        print(f"Error: {err}")

# Chat model used to pull bibliographic metadata out of the paper text.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")

# Fields the extraction chain should return; only the title is mandatory.
schema = {
    "properties" : {
        "title" : {"type" : "string"},
        "author" : {"type" : "string"},
    },
    "required" : ["title"]
}

chain = create_extraction_chain(schema, llm)

pdf_path = glob.glob("/Users/kalbefarma/Documents/Python/Others/pdf/*.pdf") # Change to your own PDF directory

# The graph index creator does not depend on the PDF being processed, so it
# is built once instead of once per file.
index_creator = GraphIndexCreator(llm=OpenAI(temperature=0, model="text-davinci-003"))

for pdf in tqdm(pdf_path):
    try:
        # Render every page to an image so it can be OCR'd.
        images = pdf2image.convert_from_path(pdf)

        # NOTE(review): the last page is deliberately skipped here, as in the
        # original code (presumably it holds references) -- confirm. A
        # single-page PDF therefore yields no text at all.
        extracted_text = "".join(
            clean_text(pytesseract.image_to_string(image)) + " "
            for image in images[:-1]
        )

        triples = []
        textb1 = split_text(extracted_text, chunk_size)
        if not textb1:
            raise ValueError("no text could be extracted from the PDF")

        # Title/author extraction: send only the first chunk (the start of
        # the paper) to the model. Passing the whole list of chunks would be
        # rendered as one list repr and can exceed the model's context window.
        # NOTE(review): assumes chain.run returns a list of dicts keyed by the
        # schema properties -- confirm against the installed langchain version.
        metadata = chain.run(textb1[0])[0]

        # Read the values straight from the returned mapping. Round-tripping
        # it through str()/replace()/literal_eval breaks on any value that
        # contains an apostrophe, and a missing optional "author" should not
        # make the whole PDF fail.
        title = metadata.get('title') or ''
        author = metadata.get('author') or ''
        triples.append((title, author, 'Written by'))

        # Build knowledge-graph triples from groups of three sentences.
        splitted_text = split_text_into_sentences(extracted_text, 3)

        for text in splitted_text:
            graph = index_creator.from_text(text)
            triples.append(graph.get_triples())

        # flatten_list expands the per-group lists but leaves each triple
        # tuple intact, giving one row per triple.
        flattenedL = flatten_list(triples)
        kgL = pd.DataFrame(flattenedL).rename(columns={0:'Source', 1:'Target', 2:'Relation'})

        # File name without directory or extension, independent of the
        # platform's path separator.
        fname = os.path.splitext(os.path.basename(pdf))[0]
        kgL.to_csv(f'./KG/{fname}.csv', index=False)
    except Exception as e:
        # Best-effort batch run: report why this PDF failed, set it aside and
        # continue with the next one instead of aborting the whole loop.
        print(f"Failed to process '{pdf}': {e}")
        move_file(pdf, "./Unprocessed")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kalbefarma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/4956 [00:00<?, ?it/s]