In [1]:
import re
import pandas as pd
# Load the excel file
def load_excel(file_path):
    return pd.read_excel(file_path)

# Normalize the plant names to handle variations like punctuation, capitalization, and parentheses
def normalize_name(name):
    # Remove special characters like punctuation and parentheses, convert to lowercase
    return re.sub(r'[.,;()\\[\\]]', '', name.strip().lower())

# Function to find rows with similar scientific names or partial matches
def find_similar_rows(df, plant_name_column, target_name):
    normalized_target = normalize_name(target_name)
    
    def match_plant_name(cell):
        # Split plant names by commas, 'and', or other delimiters to handle multiple names in a single cell
        names_in_cell = re.split(r',| and ', str(cell))
        # Check if the target name (or a variation) appears in the cell
        # print(names_in_cell)
        return any(normalized_target in normalize_name(name) for name in names_in_cell)
    
    # Apply the match_plant_name function to each row in the specified column
    similar_rows = df[df[plant_name_column].apply(match_plant_name)]
    
    return similar_rows

In [2]:
# Load the file
df = load_excel('Extraction data new species.xlsx')
df.columns  

Index(['filename', 'Title', 'Authors', 'Year',
       'Scientific name of Medicinal plant', 'Medicinal use',
       'Biological activity', 'Dose', 'Phytochemicals', 'Plant part used',
       'Formulation', 'Botanic description', 'Toxicity', 'Adverse reactions',
       'Health Benefits', 'Nutritional benefits', 'Reference'],
      dtype='object')

In [3]:
system_prompt = """
You are an expert in scientific research, specifically in recognizing and extracting unique scientific names from research papers or datasets. You are given a document containing information about medicinal plants. Your task is to extract all unique scientific names of the plants mentioned.

Follow these guidelines carefully:
Recognize scientific names: Identify any text that corresponds to a scientific name of a plant. Scientific names are typically in Latin and often follow a binomial nomenclature format (e.g., Genus species). Variations with synonyms, subspecies, or other descriptors in parentheses should also be captured.
Ignore case and punctuation: Consider names to be the same if they only differ in capitalization, punctuation, or parentheses. For example:
Moringa oleifera is equivalent to Moringa oleifera Lam.
Zingiber officinale Roscoe is equivalent to Zingiber officinale.
Handle synonyms: If a name includes a synonym in parentheses (e.g., Aloe vera (synonym: Aloe barbadensis Miller)), treat the first part (Aloe vera) as the main name. Ensure that the core scientific name is extracted regardless of additional synonyms or subspecies mentioned.
Multiple names: If multiple scientific names are mentioned within the same text (separated by commas, "and", or "or"), extract each unique name.
No duplicates: Ensure that names are extracted only once. If the same scientific name appears in different variations, it should only be listed once in the final result.
Output format: The final output should be a list of unique scientific names, formatted as plain text or an array, with one name per entry.

Example input:

Moringa oleifera Lam
Moringa oleifera
Zingiber officinale Roscoe
Zingiber officinale Roscoe (Ginger)
Aloe vera (synonym: Aloe barbadensis Miller)
Expected output:

Moringa oleifera
Zingiber officinale
Aloe vera
Edge cases:

When synonyms or subspecies are provided (e.g., Aloe vera (synonym: Aloe barbadensis Miller)), only extract the main name (Aloe vera).
When there are small variations like missing author names (e.g., Zingiber officinale Roscoe vs. Zingiber officinale), consider them the same name.
"""

In [4]:
unique_text = """
Scientific name of Medicinal plant
Mentha x piperita L.
Athrixia phylicoides
Athrixia phylicoides
Athrixia phylicoides DC.
Athrixia phylicoides
Athrixia phylicoides
Camellia sinensis L., Melaleuca alternifolia
['Camellia sinensis', 'Camellia oleifera', 'Camellia japonica', 'Camellia taliensis', 'Camellia tenuiflora', 'Camellia vietnamensis', 'Camellia polyodontia', 'Camellia octopetala', 'Camellia meiocarpa', 'Camellia semiserrata', 'Camellia chekiangoleosa', 'Camellia fangchengensis', 'Camellia crassicolumna', 'Camellia osmantha', 'Camellia brevistyla']
Camellia sinensis (L.)
Camellia sinensis
Camellia sinensis
Camellia sinensis
Camellia sinensis
Camellia sinensis
Capsicum annuum L.
Capsicum annuum L.
Capsicum annuum
Capsicum annuum
Capsicum annuum, Capsicum frutescens, Capsicum chinense, Capsicum pubescens, Capsicum baccatum
Capsicum annuum L.
Capsicum annuum
Capsicum annuum L.
Capsicum annuum L., Capsicum chinense Jacq, Capsicum frutescens L., Capsicum baccatum L., Capsicum pubescens (Ruiz and Pav)
Centella asiatica (L.) Urban
Centella asiatica
Centella asiatica
Centella asiatica
Centella asiatica (L.) Urb.
Centella asiatica (also known as Centella asiatica (L.) Urb. or Gotu kola)
Centella asiatica (L.)
Centella asiatica
Centella asiatica
['Cinnamomum verum', 'Cinnamomum cassia', 'Cinnamomum burmanni', 'Cinnamomum tamala', 'Cinnamomum osmophloeum', 'Cinnamomum altissimum', 'Cinnamomum iners', 'Cinnamomum impressicostatum', 'Cinnamomum chemungianum', 'Cinnamomum micranthum', 'Cinnamomum porrectum', 'Cinnamomum subavenium', 'Cinnamomum tenuifolium', 'Cinnamomum kotoense']
['Cinnamomum aureofulvum Gamble', 'Cinnamomum bejolghota (Buch.-Ham.) Sweet', 'Cinnamomum burmannii (Nees & T. Nees) Blume', 'Cinnamomum cambodianum Lecomte', 'Cinnamomum caryophyllus (Lour.) S. Moore', 'Cinnamomum culilaban (L.) J. Presl', 'Cinnamomum filipedicellatum Kosterm.', 'Cinnamomum glanduliferum (Wall.) Meisn.', 'Cinnamomum glaucescens (Nees) Hand.-Mazz.', 'Cinnamomum iners Reinw. ex Blume', 'Cinnamomum insularimontanum Hayata', 'Cinnamomum javanicum Blume', 'Cinnamomum kotoense Kaneh. & Sasaki', 'Cinnamomum laubatii F.Muell', 'Cinnamomum loureiroi Nees', 'Cinnamomum macrocarpum Hook.f.', 'Cinnamomum mercadoi S.Vidal', 'Cinnamomum micranthum f. kanehirae (Hayata) S.S.Ying', 'Cinnamomum mollissimum Hook.f.', 'Cinnamomum oliveri F.M.Bailey', 'Cinnamomum osmophloeum Kaneh.', 'Cinnamomum parthenoxylon (Jack) Meisn.', 'Cinnamomum rhynchophyllum Miq.', 'Cinnamomum scortechinii Gamble', 'Cinnamomum sintoc Blume', 'Cinnamomum subavenium Miq.', 'Cinnamomum sulphuratum Nees', 'Cinnamomum tenuifolium (Makino) Sugim.', 'Cinnamomum travancoricum Gamble', 'Cinnamomum tazia (Buch.-Ham.) Kosterm. ex M.Gangop.', 'Cinnamomum walaiwarense Kosterm.', 'Cinnamomum wightii Meisn.', 'Cinnamomum wilsonii Gamble']
Cinnamomum zeylanicum, Cinnamomum loureirii, Cinnamomum burmanni, Cinnamomum aromaticum, Cinnamomum cassia
Cinnamomum zeylanicum, Cinnamomum cassia, Cinnamomum burmannii, Cinnamomum camphora, Cinnamomum osmophloeum, Cinnamomum verum
['Cinnamomum verum', 'Cinnamomum zeylanicum', 'Cinnamomum cassia', 'Cinnamomum aromaticum', 'Cinnamomum burmannii', 'Cinnamomum loureiroi']
Cinnamomum verum
Cinnamomum zeylanicum
Cinnamomum zeylanicum
Cinnamomum zeylanicum
Cinnamomum verum J.S. Presl
Cinnamomum zeylanicum
Cinnamomum burmannii, Cinnamomum cassia, Cinnamomum verum, Cinnamomum zeylanicum
['Cinnamomum zeylanicum', 'Cinnamomum cassia', 'Cinnamomum burmannii', 'Cinnamomum loureiroi', 'Cinnamomum osmophloeum', 'Cinnamomum camphora', 'Cinnamomum tamala', 'Cinnamomum altissimum']
Cinnamomum verum
Curcuma longa
Curcuma longa
Curcuma longa Linn
Curcuma longa
Curcuma longa
Curcuma longa
Curcuma longa Linn.
Curcuma longa L.
Curcuma longa
Curcuma longa
Curcuma longa Linn.
Curcuma longa
Curcuma longa L.
Curcuma longa Linn.
Curcuma longa
Curcuma longa
Cymbopogon citratus, Cymbopogon flexuosus
Cymbopogon citratus
Cymbopogon citratus
Cymbopogon citratus (DC.) Staph
Cymbopogon citratus (DC.) Stapf
Cymbopogon citratus
Cymbopogon citratus
Cymbopogon citratus (DC.) Stapf
Echinacea purpurea (L.) Moench
Echinacea purpurea, Echinacea angustifolia
Echinacea purpurea
Echinacea purpurea L.
Echinacea purpurea (L.) Moench
Echinacea purpurea (L.) Moench
Echinacea purpurea
['Echinacea purpurea (L.) Moench', 'Echinacea angustifolia DC.', 'Echinacea pallida (Nutt.) Nutt.']
['Echinacea angustifolia (DC.) Hell.', 'Echinacea pallida (Nutt.) Nutt.', 'Echinacea purpurea (L.) Moench']
Eucalyptus globulus
Eucalyptus globulus Labill.

Eucalyptus globulus
Eucalyptus globulus Labill.
['Eucalyptus globulus Labill.', 'Eucalyptus polybractea R.T. Baker', 'Eucalyptus smithii R.T. Baker']
Eucalyptus globulus
Eucalyptus globulus
Eucalyptus globulus
Eucalyptus globulus
Eucalyptus globulus Labill.
Jateorhiza palmata
Jateorhiza palmata (Lam.) Miers
Jateorhiza palmata
Jatropha gossypiifolia L.
Jatropha curcas, Jatropha gossypiifolia
Jatropha gossypifolia
Jatropha gossypifolia
Jatropha gossypiifolia L.
Jatropha gossypifolia L.
Jatropha gossypiifolia L.
Jatropha curcas L., Jatropha gossypifolia L., Jatropha multifidia L.
Jatropha gossypiifolia
Jatropha gossypifolia
Jatropha gossypifolia
Jatropha gossypiifolia L.
Kigelia africana (Lam.) Benth.
Kigelia africana (Lam.) Benth.
Kigelia africana (Lam.) Benth.
Kigelia africana (Lam.) Benth.
Kigelia africana
Kigelia africana (Lam.) Benth
Kigelia africana subsp. africana
Linum usitatissimum
Linum usitatissimum
Linum usitatissimum L.
Linum usitatissimum
Linum usitatissimum L.
Linum usitatissimum
Linum Usitatissimum
Linum usitatissimum
Linum usitatissimum L.
Linum usitatissimum L.
Mentha piperita
Mentha piperita L.
Mentha piperita L.
Mentha piperita
Mentha x piperita
Mentha Piperita Linn
Mentha piperita
Mentha piperita L., Mentha arvensis var. piperascens
Musa acuminata
Musa acuminata
Musa acuminata, Musa paradisiaca, Musa sapientum
Musa acuminata
Musa acuminata
Musa acuminata
Musa spp., Musa accuminata Colla, Musa balbisiana Colla
Musa spp., Musa paradisiaca, Musa acuminata
Musa acuminata, Musa balbisiana, Musa paradisiaca
Nigella sativa
Nigella sativa
Nigella sativa
Nigella sativa
Nigella sativa
Nigella sativa
Nigella sativa L.
Nigella sativa L.
Nigella sativa
Nigella sativa
Nigella sativa
Ocimum basilicum
Ocimum basilicum L.
Ocimum basilicum
Ocimum basilicum
Ocimum basilicum L.
Ocimum basilicum L.
['Ocimum americanum', 'Ocimum basilicum', 'Ocimum gratissimum', 'Ocimum campechianum', 'Ocimum sanctum']
Ocimum basilicum
Ocimum basilicum
Ocimum basilicum L.
Ocimum basilicum L.
Persea americana Mill
Persea americana
Persea americana
Persea americana Mill.
Persea americana
Persea americana
Persea americana Mill.
Persea americana var. drymifolia
['Piper nigrum L.', 'Piper aduncum L.', 'Piper betle L.', 'Piper auritum Kunth', 'Piper cernuum Vell.', 'Piper dilatatum Rich.', 'Piper gaudichaudianum Kunth', 'Piper guineense Schumach. & Thonn.', 'Piper marginatum Jacq.', 'Piper umbellatum L.', 'Piper tuberculatum Jacq.']
Persea americana
Piper nigrum L.
Piper nigrum
Piper nigrum
['Piper nigrum', 'Piper longum', 'Piper chaba', 'Piper guineense', 'Piper sarmentosum']
Salvia hispanica L.
Salvia hispanica L.
Salvia hispanica L.
Salvia hispanica L.
Salvia hispanica
Salvia hispanica L.
Taraxacum officinale
Taraxacum officinale
Taraxacum officinale
Taraxacum Officinale
Taraxacum officinale
Taraxacum officinale, Taraxacum japonicum, Taraxacum coreanum, Taraxacum platycarpum, Taraxacum formosanum, Taraxacum bicorne, Taraxacum hondoense, Taraxacum obovatum, Taraxacum bessarabicum, Taraxacum mongolicum
Taraxacum officinale Weber
['Taraxacum mongolicum Hand.-Mazz', 'Taraxacum officinale (L.) Weber ex F. H. Wigg', 'Taraxacum coreanum Nakai', 'Taraxacum borealisinense', 'Taraxacum platycarpum Dahlst.']
Taraxacum genus (Dandelion), including species like Taraxacum officinale, Taraxacum mongolicum, Taraxacum coreanum, Taraxacum formosanum, Taraxacum platycarpum
Taraxacum officinale F.H.Wigg.
Taraxacum officinale
Trigonella foenum-graecum L.
Trigonella foenum graecum
Trigonella foenum-graecum L.
Trigonella foenum-graecum
Trigonella foenum-graecum L.
Trigonella foenum-graecum
Trigonella foenum-graecum
Trigonella foenum-graecum
Trigonella foenum-graecum
Trigonella foenum-graecum
Trigonella foenum-graecum Linn.
"""

In [None]:
import os
from openai import OpenAI
import json
import time
plant_list = []
user_prompt = """you are an expert in extracting unique scientific names from a text, can you please extract the unique names from the text below, it should be in an rray, i just want the array result, make sure its unique and should be ordered by letters: """+ unique_text
try:
    client = OpenAI(
        # If the environment variable is not configured, replace the following line with: api_key="sk-xxx",
        api_key=QWEN_API_KEY,
        base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
    )

    completion = client.chat.completions.create(
        model="qwen-plus",
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': user_prompt}
            ]
    )
    print(completion.choices[0].message.content)
    plant_list = eval(completion.choices[0].message.content)
except Exception as e:
    print(f"Error message: {e}")
    print("For more information, see: https://www.alibabacloud.com/help/en/model-studio/developer-reference/error-code")


[
"Athrixia phylicoides",
"Capsicum annuum",
"Capsicum baccatum",
"Capsicum chinense",
"Capsicum frutescens",
"Capsicum pubescens",
"Camellia chekiangoleosa",
"Camellia crassicolumna",
"Camellia fangchengensis",
"Camellia japonica",
"Camellia oleifera",
"Camellia osmantha",
"Camellia polyodontia",
"Camellia sinensis",
"Camellia taliensis",
"Camellia tenuiflora",
"Centella asiatica",
"Cinnamomum altissimum",
"Cinnamomum aromaticum",
"Cinnamomum aureofulvum",
"Cinnamomum bejolghota",
"Cinnamomum burmannii",
"Cinnamomum cambodianum",
"Cinnamomum camphora",
"Cinnamomum caryophyllus",
"Cinnamomum chekiangoleosa",
"Cinnamomum culilaban",
"Cinnamomum filipedicellatum",
"Cinnamomum glanduliferum",
"Cinnamomum glaucescens",
"Cinnamomum iners",
"Cinnamomum insularimontanum",
"Cinnamomum javanicum",
"Cinnamomum kotoense",
"Cinnamomum laubatii",
"Cinnamomum loureiroi",
"Cinnamomum loureirii",
"Cinnamomum macrocarpum",
"Cinnamomum mercadoi",
"Cinnamomum micranthum",
"Cinnamomum mollissimum",
"Cinna

In [6]:
def generate_context_data(data):
    context = ""
    
    for record in data:
        plant_info = ""
        plant_info += f"Medicinal Uses: {record.get('Medicinal use', 'Not mentioned')}\n"
        plant_info += f"Biological Activity: {record.get('Biological activity', 'Not mentioned')}\n"
        plant_info += f"Dose: {record.get('Dose', 'Not mentioned')}\n"
        plant_info += f"Phytochemicals: {record.get('Phytochemicals', 'Not mentioned')}\n"
        plant_info += f"Plant Part Used: {record.get('Plant part used', 'Not mentioned')}\n"
        plant_info += f"Formulation: {record.get('Formulation', 'Not mentioned')}\n"
        plant_info += f"Botanic Description: {record.get('Botanic description', 'Not mentioned')}\n"
        plant_info += f"Toxicity: {record.get('Toxicity', 'Not mentioned')}\n"
        plant_info += f"Adverse Reactions: {record.get('Adverse reactions', 'Not mentioned')}\n"
        plant_info += f"Health Benefits: {record.get('Health Benefits', 'Not mentioned')}\n"
        plant_info += f"Nutritional Benefits: {record.get('Nutritional benefits', 'Not mentioned')}\n"
        plant_info += f"Reference: {record.get('Reference', 'Not mentioned')}\n"
        plant_info += "--------------------------------------\n"
        context += plant_info
    
    return context

In [7]:
len(plant_list)

109

In [8]:
plant_list = list(set(plant_list))

In [9]:
## summarizing code
summary_prompt = """
You are an expert in summarizing herbal medicine results from different sources or research papers, separated by "----", grouped together. Your task is to consolidate information across all sources into a structured summary for each plant. Ensure that the summarized output is provided in a well-structured JSON format based on the specified columns. For any missing information, mention "Not mentioned."

Only include the following fields in the JSON output:

1. Medicinal use(s): Provide a summary of the medicinal uses of the plant, considering all sources. Include a brief description of its applications in preventing, diagnosing, treating, or managing diseases.
2. Biological activity: Provide a concise summary of the biological activities of the plant, combining information across all sources.
3. Dose: Summarize dosage information (if mentioned), including typical amounts and units. Present multiple dosages as an array.
4. Phytochemicals: Consolidate the phytochemicals mentioned into an array.
5. Plant part used: Summarize the commonly used parts of the plant (e.g., root, leaves, bark). Present them as an array if multiple parts are mentioned.
6. Formulation: Provide a summary of the forms in which the plant is prepared or formulated (e.g., tablets, capsules, extracts). Use an array if multiple forms are mentioned.
7. Botanic description: Provide a summarized botanical description, consolidating the physical characteristics and growth habits.
8. Toxicity: Summarize any toxicity information. If mentioned across different sources, provide a general summary.
9. Adverse reactions: Summarize the adverse reactions mentioned across the sources.
10. Health benefits: Provide a summary of the key health benefits based on the papers.
11. Nutritional benefits: Provide a summary of the nutritional benefits, if mentioned.
12. References: Provide a consolidated array of references in APA style (7th edition). Each reference should include title, author(s), year of publication, journal name, volume, issue, page numbers, and DOI if available.

Ensure that the summary is concise and structured according to the specified fields. The output must be in valid JSON format. If certain sections are missing for a specific plant, reflect this in the output with "Not mentioned."

Edge cases to consider:
- Ensure all information from different sources (separated by "----") is grouped and consolidated correctly into the JSON format.
- Summarize descriptions (e.g., biological activity) concisely.
- Ensure the output contains only the information from the provided context.
"""


In [10]:
print(df)

                                              filename  \
0    assessment-report-mentha-x-piperita-l-folium-a...   
1                   Athrixia phylicoides lehlo2013.pdf   
2               Athrixia phylicoides Lerotholi2016.pdf   
3                  Athrixia phylicoides Mohale2020.pdf   
4                 Athrixia phylicoides Mokwena2021.pdf   
..                                                 ...   
198          Trigonella foenum-graecum Pushpa 2022.pdf   
199          Trigonella foenum-graecum Savita 2008.pdf   
200      Trigonella foenum-graecum Srinivasan 2006.pdf   
201      Trigonellafoenum-graecum L Ahmadiani 1999.pdf   
202         Trigonellafoenum-graecum L Kalyan 2016.pdf   

                                                 Title  \
0    Assessment Report on Mentha x piperita L., Fol...   
1    Bush Tea (Athrixia Phylicoides Dc.) Success St...   
2    Bush Tea (Athrixia phylicoides): A review of t...   
3    Untargeted Profiling Of Field Cultivated Bush ...   
4    Athrixia

In [None]:
data = []
#plant_list = ["Moringa oleifera"]
for target_name in plant_list:
    try:
        matching_rows = find_similar_rows(df, 'Scientific name of Medicinal plant', target_name)
    except Exception as e:
        # print(e)
        continue
    # Convert the dataframe to a list of dictionaries (each record is a plant)
    data_records = matching_rows.to_dict(orient='records')
    context_text = generate_context_data(data_records)
    # print(target_name)
    print(context_text)
    user_prompt = """Here is the information for {target_name} with each record seperated by ---. Summarize the information as per the structure provided, the output should be in a json format as text. 
    All information on the plant:
    """+ context_text
    try:
        client = OpenAI(
            # If the environment variable is not configured, replace the following line with: api_key="sk-xxx",
            api_key=QWEN_API_KEY,
            base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
        )

        completion = client.chat.completions.create(
            model="qwen-plus",
            messages=[
                {'role': 'system', 'content': summary_prompt},
                {'role': 'user', 'content': user_prompt}
                ]
        )
        print(completion.choices[0].message.content)
        raw_output = completion.choices[0].message.content
        json_start = raw_output.find('{')
        json_end = raw_output.rfind('}') + 1
        json_str = raw_output[json_start:json_end]
        json_output = json.loads(json_str)
        # Append to the data list with filename as a key
        data.append({"Scientific name of Medicinal plant": target_name, **json_output})
        print(len(data))
    except Exception as e:
        print(f"Error message: {e}")
        print("For more information, see: https://www.alibabacloud.com/help/en/model-studio/developer-reference/error-code")
    time.sleep(2)  # Sleep for 1 second to avoid rate limiting

# Convert to DataFrame
data_df = pd.DataFrame(data)
# Export DataFrame to an Excel file
data_df.to_excel("summary_output.xlsx", index=False)
#print(sum)

Medicinal Uses: The Cinnamomum plants have been traditionally used to treat respiratory viruses, urinary tract infections, relieve abdominal discomfort, improve digestion, act as antidiabetic, analgesic, and neuroprotective agents.
Biological Activity: Cinnamomum species exhibit antibacterial, antidiabetic, antioxidant, anti-inflammatory, anticancer, and neuroprotective activities. These effects are mediated through various mechanisms such as inhibiting bacterial growth, stimulating insulin secretion, scavenging free radicals, reducing inflammation by inhibiting NF-κB, inducing apoptosis in cancer cells, and protecting neurons from oxidative stress.
Dose: Doses applied in randomized clinical trials vary between 1 g/day to 6 g/day. For cinnamon oil, the dose varies between 50 mg/day to 200 mg/day. Specific doses mentioned include 1.5 g/day for polycystic ovary syndrome (PCOS) patients and 3 g/day for type 2 diabetic patients.
Phytochemicals: ['cinnamaldehyde', 'eugenol', 'trans-cinnamyl

In [62]:
print(0)

0
