In [1]:
# !pip install pdf2image
# !pip install pytesseract
# !apt-get install poppler-utils
# !apt install tesseract-ocr
# !apt install libtesseract-dev
# !pip install openai biopython
# !pip install langchain==0.0.225
# !pip install chromadb

### <b>Parse PDF file</b>

In [2]:
!mkdir ./EX

In [3]:
import glob
import time
import shutil
import textwrap
import pdf2image
import pytesseract

import os
import time
import openai
import nltk
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain, create_extraction_chain_pydantic
from langchain.prompts import ChatPromptTemplate

import re
from Bio import Entrez
from tqdm.auto import tqdm

from ast import literal_eval

# Tokenizer data for NLTK (split_text_into_sentences calls nltk.sent_tokenize).
nltk.download('punkt')

# Only export the key when it has actually been filled in. Assigning the empty
# placeholder unconditionally would overwrite a key that is already present in
# the environment and make ChatOpenAI fail with "Did not find openai_api_key".
OPENAI_API_KEY = "" # Your OPENAI KEY
if OPENAI_API_KEY:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
Entrez.email = "" # Your PubMed Registered Email
Entrez.api_key = "" # Your PubMed API Key in your registered PubMed Account Setting


# NOTE(review): machine-specific absolute path; it has to be edited before the
# notebook can run anywhere else.
pdf_path = glob.glob("/Users/kalbefarma/Documents/Python/Others/pdf/*.pdf") # Change to your own pdf directory

# Length of each text chunk handed to split_text, counted in characters.
chunk_size = 16000

def clean_text(text):
    """Filter out short lines and figure captions from a block of text.

    A line survives only when splitting it on single spaces gives more than
    three pieces and it starts with neither "(a)" nor "Figure". The surviving
    lines are joined back together with newlines.
    """
    kept_lines = []
    for line in text.split("\n"):
        # Lines with three or fewer space-separated pieces are dropped.
        if len(line.split(" ")) <= 3:
            continue
        # Sub-figure labels and figure captions are dropped as well.
        if line.startswith(("(a)", "Figure")):
            continue
        kept_lines.append(line)
    return "\n".join(kept_lines)

# End index of the slice `[:fold]` that the processing loop applies to the
# list of text chunks; with -1 the slice leaves out the last chunk.
fold = -1

def truncate_text(text, max_tokens):
    """Return the first wrapped line of ``text``.

    ``max_tokens`` is used as the wrap width, i.e. a number of characters
    rather than a token count. An empty string is returned when wrapping
    produces no lines (for example for empty or whitespace-only input).
    """
    wrapped_lines = textwrap.wrap(text, width=max_tokens)
    return wrapped_lines[0] if wrapped_lines else ""

# extracted_text = truncate_text(extracted_text, 16000)

def split_text(text, chunk_size):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each piece; must be a positive integer.

    Returns:
        A list of substrings in their original order. The last piece may be
        shorter than ``chunk_size``; an empty ``text`` gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. Such a value would
            never advance past the end of the text, so it is rejected up
            front instead of looping forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]


def extract_gene_name(text):
    """Return the content of the first ``<NAME>...</NAME>`` element in ``text``.

    Args:
        text: The XML payload, either as UTF-8 encoded bytes or as an already
            decoded ``str``.

    Returns:
        The text between the first ``<NAME>`` and ``</NAME>`` tags, or ``None``
        when no such pair is found. The match does not span line breaks.

    Raises:
        UnicodeDecodeError: If ``text`` is bytes that are not valid UTF-8.
    """
    # Bytes are decoded; str input is used as is instead of crashing on .decode.
    if isinstance(text, (bytes, bytearray)):
        text_str = text.decode("utf-8")
    else:
        text_str = text
    # Remove *literal* backslash sequences (a backslash followed by n / t / '),
    # not real newline or tab characters.
    text_str = text_str.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
    match = re.search(r"<NAME>(.*?)</NAME>", text_str)
    return match.group(1) if match else None

def get_geneName(rsid):
    """Fetch the Entrez "snp" record for ``rsid`` and extract a name from it.

    Args:
        rsid: Identifier passed as ``id`` to ``Entrez.efetch`` (the caller
            passes rsIDs such as "rs123").

    Returns:
        Whatever ``extract_gene_name`` finds in the XML response, i.e. the
        content of its first ``<NAME>`` element, or ``None`` if there is none.
    """
    handle = Entrez.efetch(db="snp", id=rsid, retmode='xml')
    try:
        payload = handle.read()
    finally:
        # Release the connection even when reading the response fails.
        handle.close()
    return extract_gene_name(payload)

def split_text_into_sentences(text, num_sentences):
    """Tokenize ``text`` into sentences and batch them.

    Returns a list of lists in which every inner list holds up to
    ``num_sentences`` consecutive sentences; the final one may be shorter.
    """
    all_sentences = nltk.sent_tokenize(text)
    batches = []
    for offset in range(0, len(all_sentences), num_sentences):
        batches.append(all_sentences[offset:offset + num_sentences])
    return batches

def flatten_list(nested_list):
    """Recursively flatten nested lists into a single flat list.

    Only ``list`` instances are expanded; any other element (tuples and
    strings included) is kept as a single item. Order is preserved.
    """
    flat = []
    for element in nested_list:
        flat += flatten_list(element) if isinstance(element, list) else [element]
    return flat


def move_file(source_path, destination_path):
    """Move ``source_path`` into ``destination_path``, creating it if absent.

    Failures of the move itself are not raised; they are reported on stdout
    as "Error: ...". A success message is printed otherwise.
    """
    # The destination folder has to exist before the file can be moved into it.
    destination_missing = not os.path.exists(destination_path)
    if destination_missing:
        os.makedirs(destination_path)

    try:
        shutil.move(source_path, destination_path)
    except Exception as exc:
        print(f"Error: {exc}")
    else:
        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")


# Chat model handed to the extraction chain below. Constructing it fails with
# a ValidationError when no OpenAI API key can be found (see the recorded
# output of this cell).
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")

# Fields requested from the model for every text chunk. All of them are
# declared as strings and only "title" is marked as required. The processing
# loop reads these keys by name, so renaming one here requires updating it too.
schema = {
    "properties" : {
        "title" : {"type" : "string"},
        "author" : {"type" : "string"},
        "publisher" : {"type" : "string"},
        "publication_year" : {"type" : "string"},
        "gene_codes" : {"type" : "string"},
        "population_race" : {"type" : "string"},
        "phenotypes_or_diseases" : {"type" : "string"},
        "sample_size" : {"type" : "string"},
        "SNPs" : {"type" : "string"},
        "Study_Methodology" : {"type" : "string"},
        "Study_Level" : {"type" : "string"},
        "Outcome/Recommendation/Conclusion" : {"type" : "string"}
    },
    "required" : ["title"]
}

# Extraction chain built from the schema and the chat model above; the
# processing loop calls chain.run(...) once per text chunk.
chain = create_extraction_chain(schema, llm)
# PDFs that raised during processing; each of them is also moved to ./Unprocessed.
err_path = []

# Answers the model gives for "no value"; they are normalised to ''.
_EMPTY_MARKERS = ['N/A', 'not mentioned', 'Not mentioned']


def _build_row(concat, i, gene, snp):
    """Build one output record from row ``i`` of the per-PDF frame ``concat``.

    Bibliographic fields come from row 0 (the first chunk); every other field
    comes from row ``i``. ``gene`` and ``snp`` fill the 'Genes' / 'SNPs' cells.
    """
    return {
        'Title' : concat['title'][0],
        'Author' : concat['author'][0],
        'Publisher' : concat['publisher'][0],
        'Publication Year' : concat['publication_year'][0],
        'Genes' : gene,
        'Population' : concat['population_race'][i],
        'Phenotype' : concat['phenotypes_or_diseases'][i].title(),
        'Sample Size' : concat['sample_size'][i],
        'SNPs' : snp,
        'Study Methodology' : concat['Study_Methodology'][i].title(),
        'Study Level' : concat['Study_Level'][i].title(),
        'Outcomes' : concat['Outcome/Recommendation/Conclusion'][i].capitalize()
    }


# Output folder for the CSV files; harmless when it already exists.
os.makedirs('./EX', exist_ok=True)

for pdf in tqdm(pdf_path):

    try:
        images = pdf2image.convert_from_path(pdf)

        # The last page is left out, as before. For a single-page PDF that
        # would leave nothing to read, so the page is kept in that case.
        pages = images[:-1] or images
        extracted_text = ""
        for image in pages:
            extracted_text += clean_text(pytesseract.image_to_string(image)) + " "

        text = extracted_text

        # `[:fold]` trims the trailing chunk(s); fall back to every chunk when
        # the trim would leave none (documents shorter than two chunks).
        all_chunks = split_text(text, chunk_size)
        text_chunk = all_chunks[:fold] or all_chunks

        chunkdf = []
        for chunk in text_chunk:
            extracted = chain.run(chunk)
            if not extracted:
                continue  # nothing was extracted from this chunk
            # Use the extracted record directly. Round-tripping it through
            # str()/literal_eval with a blanket quote replacement fails on any
            # value that contains an apostrophe or a double quote.
            # NOTE(review): assumes chain.run returns a list of dicts (or a
            # single dict) - confirm against the installed langchain version.
            record = extracted[0] if isinstance(extracted, list) else extracted
            chunkdf.append(pd.DataFrame(record, index=[0]).fillna(''))

        if not chunkdf:
            raise ValueError("no fields could be extracted from the document")

        concat = pd.concat(chunkdf, axis=0).reset_index(drop=True).fillna('')

        # Fields the model never returned would otherwise raise KeyError below.
        for col in schema["properties"]:
            if col not in concat.columns:
                concat[col] = ''

        # Bibliographic fields: the first chunk's answer is used for every row.
        concat['title'] = concat['title'][0]
        concat['author'] = concat['author'][0]
        concat['publisher'] = concat['publisher'][0]
        concat['publication_year'] = concat['publication_year'][0]
        # concat = concat.min().to_frame().T
        # Keep only SNP answers that start with an rsID prefix.
        concat['SNPs'] = concat['SNPs'].apply(lambda x: x if isinstance(x, str) and x.startswith('rs') else '')
        for col in list(concat.columns):
            concat[col] = concat[col].apply(lambda x: '' if x in _EMPTY_MARKERS else x)

        L = []
        for i in range(len(concat)):
            snps = concat['SNPs'][i]
            if snps == '':
                # No rsIDs for this chunk: one row per listed gene code.
                for g in concat['gene_codes'][i].split(','):
                    L.append(_build_row(concat, i, g.strip(), snps))
            else:
                # One row per rsID, with the gene looked up through Entrez.
                for s in snps.split(','):
                    s = s.strip()
                    if not s:
                        continue  # stray comma, nothing to look up
                    try:
                        L.append(_build_row(concat, i, get_geneName(s), s))
                    except Exception as e:
                        # A failed lookup only costs this one row.
                        print(e)

        result = pd.DataFrame(L)

        # Portable base name without the extension.
        fname = os.path.splitext(os.path.basename(pdf))[0]
        result.to_csv(f'./EX/{fname}_ER.csv', index=False)

    except Exception as e:
        # Best effort per file: record why it failed, then set the PDF aside.
        print(f"Failed to process '{pdf}': {e}")
        err_path.append(pdf)
        move_file(pdf, "./Unprocessed")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kalbefarma/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


ValidationError: 1 validation error for ChatOpenAI
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)