In [22]:
import pandas as pd
import os
from pdfminer.high_level import extract_text

In [23]:
def extract_text_from_pdf(pdf_path):
    """Return the text that pdfminer's ``extract_text`` produces for ``pdf_path``."""
    return extract_text(pdf_path)

In [24]:
import fitz
def extract_text_from_pdf2(pdf_path):
    """Extract the text of every page of a PDF using PyMuPDF (``fitz``).

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages concatenated in page order, with no separator
        added between pages.

    Raises:
        Whatever ``fitz.open`` / ``page.get_text`` raise for unreadable files.
    """
    doc = fitz.open(pdf_path)
    try:
        # Build the result with a single join instead of repeated `+=`.
        return "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(len(doc))
        )
    finally:
        # Always release the underlying file handle, even if a page fails.
        doc.close()

In [28]:
# System message for the extraction model. It tells the model to act as a
# research-paper reader, lists the 16 fields to extract from a herbal-medicine
# paper (Title ... Reference), gives the fallback wording for missing fields,
# and adds edge-case rules. It is sent as the 'system' role content in the
# chat-completion request made later in this notebook.
system_prompt = """ 
You are an expert in reading and understanding research papers, You are given a research paper on herbal medicine. Extract the following information based on the structure below. If a section is not present, mention "Not mentioned" or "Not applicable."

1. Title: Extract the full title of the research paper.
2. Authors: List all the authors of the paper, separated by commas.
3. Year: Provide the year of publication. If not available, use "Not available."
4. Scientific name of Medicinal plant: Extract all scientific names of the medicinal plants mentioned in the paper. If multiple plants are mentioned, list them all.
5. Medicinal use: Extract any medicinal use i.e the application of herbs, or medical plants, to prevent, diagnose, treat, or manage diseases and health conditions.If not available, use "Not mentioned."
6. Biological activity: Describe the biological activities of the medicinal plants in detail. If the paper does not cover biological activities, mention "Not mentioned."
7. Dose: Extract any information related to dosage, including amounts and units. If the paper does not mention dosages, use "Not mentioned."
8. Phytochemicals: List all phytochemicals present in the medicinal plants,that is mentioned in the paper only. If not available, use "Not mentioned."
9. Plant part used: Describe the part of the plant used for medicinal purposes,that is mentioned in the paper only. If not available, use "Not mentioned."
10. Formulation: Extract details about the formulation of the medicinal plants (e.g., tablets, capsules, extracts),that is mentioned in the paper only. If not available, use "Not mentioned."
11. Botanic description: Detailed physical characteristics of a plant, including its structure, appearance, and growth habits. if available. Otherwise, mention "Not mentioned."
12. Toxicity: Extract any information related to the toxicity of the medicinal plants,that is mentioned in the paper only. If no toxicity information is available, mention "Not mentioned."
13. Adverse reactions: List any reported adverse reactions,that is mentioned in the paper only. If none are mentioned, use "Not mentioned."
14. Health Benefits: Describe any health benefits offered by the medicinal plants, that is mentioned in the paper only. If no health benefits are mentioned, use "Not mentioned."
15. Nutritional benefits: Extract any nutritional benefits provided by the medicinal plants,that is mentioned in the paper only. If not applicable, mention "Not mentioned."
16. Reference: Extract the relevant citation information from a provided research paper, article, or academic text (such as title, author(s), year of publication, journal name, volume, issue, page numbers, and DOI), and format it according to the APA style (7th edition). The output should be a properly formatted APA reference.. If not applicable, mention "Not mentioned."

Edge cases to consider:
- If the paper mentions multiple plants, ensure all relevant information is extracted for each plant.
- If certain sections (e.g., adverse reactions, toxicity) are missing from the paper, ensure the output reflects this absence (e.g., "Not mentioned").
- Ensure correct units for dosages are extracted, including when multiple dosages are provided.
- When extracting formulation, provide details on the preparation method, if available.
- Do not change the title, write as you have extracted it from the paper but change the format to title case.
- On other columns  Biological activity, Dose, Formulation, Botanic description, Toxicity, Adverse reactions, Health Benefits, Nutritional benefits, do not just extract but summarize the information and provide definitions where necessary."
- For all the columns provided , please make sure you are using only information from the extracted paper.
"""

In [34]:
# Extract the text of every PDF in `pdf_folder` into a dict keyed by file name.
pdf_folder = 'dataextraction'
filename_text_dict = {}
for pdf_file in os.listdir(pdf_folder):
    if not pdf_file.endswith('.pdf'):
        continue
    pdf_path = os.path.join(pdf_folder, pdf_file)
    try:
        pdf_text = extract_text_from_pdf2(pdf_path)
    except Exception as exc:
        # Skip files that cannot be read. Falling through here would store the
        # previous file's text under this file's name (or raise NameError if
        # the very first file fails).
        print(f"error with the file {pdf_file}: {exc}")
        continue
    filename_text_dict[pdf_file] = pdf_text

In [35]:
# Show each file name with the first 100 characters of its extracted text,
# then the total number of files that were loaded.
for filename, text in filename_text_dict.items():
    snippet = text[:100]
    print(f"Filename: {filename}\nText Preview: {snippet}...\n")
print(len(filename_text_dict))

Filename: Musa acuminate 2021 Zahra.pdf
Text Preview:  
189 Zahra et al.  
 
Int. J. Biosci. 
2021 
 
 
REVIEW PAPER                                      ...

Filename: Musa acuminate 2023 Gervásio .pdf
Text Preview: Review: Biological, antioxidant and phytochemical activities of Musa spp..
Ciência Rural, v.53, n.12...

Filename: Negalla staiva 2019 Nordin .pdf
Text Preview: RESEARCH ARTICLE
Open Access
Effect of Nigella sativa and its bioactive
compound on type 2 epithelia...

Filename: Nigella sativa 2013 Ahmad .pdf
Text Preview: 337
Asian Pac J Trop Biomed 2013; 3(5): 337-352
Asian Pacific Journal of Tropical Biomedicine
journa...

Filename: Nigella sativa 2017 Eid .pdf
Text Preview: Review Article
A Review on the Cosmeceutical and External
Applications of Nigella sativa
Ahmad M. Ei...

Filename: Nigella sativa 2020  Begum.pdf
Text Preview: Begum et al                                                                                         ...

Filename: Nigella sativa 2020 Ahmad .pd

In [None]:
import os
from openai import OpenAI
import json
import time
def _extract_json_object(raw_output):
    """Parse the outermost ``{...}`` span found in ``raw_output`` as JSON.

    Everything before the first ``{`` and after the last ``}`` is discarded,
    so a reply wrapped in a Markdown code fence still parses.

    Raises:
        ValueError: if no ``{...}`` span exists or the span is not valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    json_start = raw_output.find('{')
    json_end = raw_output.rfind('}') + 1
    if json_start == -1 or json_end <= json_start:
        raise ValueError("no JSON object found in model output")
    return json.loads(raw_output[json_start:json_end])


# The client does not depend on the file being processed, so build it once.
# NOTE(review): QWEN_API_KEY is not defined in this cell; it must already
# exist in the kernel namespace before this cell runs.
client = OpenAI(
    api_key=QWEN_API_KEY,
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)

# One record (dict) per successfully parsed paper.
data = []
for filename, text in filename_text_dict.items():
    user_prompt = """Here is the research paper. Extract the relevant information as per the structure provided, the output should ben in a json format as text. 
    Research paper:"""+ text
    try:
        completion = client.chat.completions.create(
            model="qwen-plus",
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt}
                ]
        )
        raw_output = completion.choices[0].message.content
    except Exception as e:
        # Request-level failure: report it and move on to the next file.
        print(f"Error message: {e}")
        print("For more information, see: https://www.alibabacloud.com/help/en/model-studio/developer-reference/error-code")
    else:
        print(raw_output)
        try:
            json_output = _extract_json_object(raw_output or "")
        except ValueError as e:
            # The request succeeded but the reply was not usable JSON; this is
            # not an API error, so do not point at the API error-code page.
            print(f"Could not parse JSON output for {filename}: {e}")
        else:
            # Append to the data list with filename as a key
            data.append({"filename": filename, **json_output})
            print(len(data))
    time.sleep(2)  # Pause 2 seconds between requests to avoid rate limiting
# Convert to DataFrame
df = pd.DataFrame(data)
# Export DataFrame to an Excel file
df.to_excel("herbal_medicine_output.xlsx", index=False)


```json
{
  "Title": "Health Benefits of Banana (Musa) - A Review Study",
  "Authors": "Fatima Zahra, Sidra Khalid, Maria Aslam, Zainab Sharmeen",
  "Year": "2021",
  "Scientific name of Medicinal plant": "Musa spp., Musa paradisiaca, Musa acuminata",
  "Medicinal use": "Banana is used to treat and prevent diseases such as cancer, ulcer, Alzheimer’s disease, infection, diarrhea, hemorrhoids, diabetes, and hypertension due to its antifungal, antibiotic, antimicrobial, antidiabetic, antioxidant, and anti-inflammatory properties.",
  "Biological activity": "The biological activities include antimicrobial, antifungal, antidiabetic, antioxidant, anti-inflammatory, and immunomodulatory properties. It also has the ability to reduce oxidative stress, maintain electrolyte balance, and act as a prebiotic.",
  "Dose": "Not mentioned",
  "Phytochemicals": "Phenols, flavonoids, triterpenes, oxalic acid, starch, tannin, glycosides, sulphate, dopamine, norepinephrine, serotonin, campesterol, stigmast