In [None]:
import re
import os
import os.path
import json
import pickle
import numpy as np
import glob
from tqdm import tqdm
import random
import time
from dotenv import load_dotenv
load_dotenv()
import json
from json.decoder import JSONDecodeError


In [None]:
# Set up your LLM here; make sure to have the API key in a .env file
# (it is loaded into the environment by the load_dotenv() call in the imports cell).
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# Raises KeyError if MISTRAL_API_KEY is not present in the environment.
api_key = os.environ["MISTRAL_API_KEY"]
# Name of the chat model used for every query issued by this notebook.
model = "mistral-medium-latest"

# Shared client object; the per-species query cell calls client.chat(...) on it.
client = MistralClient(api_key=api_key)


In [None]:
def combine_words_with_capital(string):
    """Collapse a (possibly slash-separated) phrase into one CamelCase token.

    Every character that is not a word character, whitespace or '/' is
    dropped first. The remainder is cut on '/', each piece is cut on
    whitespace, and every resulting word is passed through
    ``str.capitalize`` (first letter upper-cased, the rest lower-cased)
    before everything is glued together without separators.
    """
    stripped = re.sub(r'[^\w\s/]', '', string)

    pieces = []
    for segment in stripped.split('/'):
        for token in segment.split():
            pieces.append(token.capitalize())

    return ''.join(pieces)

def create_clean_paragraphs(input_dict):
    """Merge each key's sentences into one de-duplicated paragraph.

    Parameters
    ----------
    input_dict : dict
        Maps a key to an iterable of sentence strings.

    Returns
    -------
    dict
        Maps the same keys to a single string made of the distinct
        sentences joined by single spaces, in order of first appearance.
    """
    output_dict = {}

    for key, value in input_dict.items():
        # dict.fromkeys removes duplicates while keeping first-occurrence
        # order. A plain set() would shuffle the sentences, and because
        # string hashing is randomised per process the paragraph would
        # differ from one kernel run to the next.
        unique_values = dict.fromkeys(value)
        output_dict[key] = ' '.join(unique_values)

    return output_dict


# this is new 
def clean_sentence(sentence):
    """Apply a fixed chain of character substitutions to one sentence.

    The replacements target stray characters such as 'Â', 'â', 'Ã' and the
    non-breaking space; these look like mis-decoded text (mojibake), but
    that is an assumption -- TODO confirm against the raw data.

    Returns the substituted string with leading/trailing whitespace stripped.
    """
    # First pass: drop 'Â', turn 'â' into '-', '·' into '.', 'Ã' into 'x',
    # and the non-breaking space (\xa0) into a regular space.
    sent = sentence.replace('Â','').replace('â', '-').replace('·','.').replace('Ã','x').replace(u'\xa0', u' ')
    # Second pass: 'x©' -> 'e' undoes the 'Ã' -> 'x' step above for the pair
    # 'Ã©' (so 'Ã©' ends up as 'e'); any genuine 'x©' in the text is rewritten too.
    # NOTE(review): as the literals read here, the two 'â' replacements on this
    # line can never match, because the first pass already replaced every 'â'.
    # They may originally have been longer multi-character sequences whose
    # invisible characters were lost -- confirm against the original notebook.
    sent = sent.replace('â', '').replace('â', '-').replace('x©', 'e').strip()
    
    return sent

def create_clean_and_unique_sentences(input_dict):
    """Return, per key, the cleaned and de-duplicated list of sentences.

    Parameters
    ----------
    input_dict : dict
        Maps a key to an iterable of raw sentence strings.

    Returns
    -------
    dict
        Maps the same keys to a list of cleaned sentences (see
        ``clean_sentence``), in order of first appearance. Raw sentences
        that contain fewer than two space-separated tokens are dropped.
    """
    output_dict = {}

    for key, value in input_dict.items():
        # dict.fromkeys de-duplicates while preserving input order; set()
        # yields an arbitrary order that changes between kernel runs, which
        # made the text built from these sentences non-reproducible.
        cleaned = (
            clean_sentence(sent)
            for sent in dict.fromkeys(value)
            if len(sent.split(' ')) >= 2
        )
        # De-duplicate a second time: two raw sentences that differ only in
        # characters removed by clean_sentence are identical after cleaning.
        output_dict[key] = list(dict.fromkeys(cleaned))

    return output_dict
        
    
def create_even_cleaner_paragraphs(input_dict):
    """Merge each key's sentences into one de-duplicated, cleaned paragraph.

    Parameters
    ----------
    input_dict : dict
        Maps a key to an iterable of sentence strings.

    Returns
    -------
    dict
        Maps the same keys to a single string: the distinct sentences,
        in order of first appearance, joined by single spaces and then
        passed through ``clean_sentence``.
    """
    output_dict = {}

    for key, value in input_dict.items():
        # Order-preserving de-duplication. set() would shuffle the
        # sentences differently on every kernel run (string hash
        # randomisation), making the paragraph non-reproducible.
        unique_values = dict.fromkeys(value)
        combined_string = ' '.join(unique_values)

        # Clean the paragraph as a whole (these are new).
        output_dict[key] = clean_sentence(combined_string)

    return output_dict
    

In [None]:
# Input locations: every sub-folder lives directly under the data folder.
data_folder = 'Data/'
dataframes_folder, traits_folder, sentences_folder = (
    data_folder + sub_folder
    for sub_folder in ('DataFrames/', 'Traits/', 'Sentences/')
)

# Dataset to process: Caribbean, Palm or West.
dataset = 'West'

# Output location for this dataset; created up front so later cells can write into it.
results_folder = 'Results/Mistral/{}'.format(dataset)
os.makedirs(results_folder, exist_ok=True)


In [None]:
# 1. Read the traits. We are going to use this to query the LLM.
# The encoding is given explicitly: JSON is UTF-8, while a bare open() uses the
# platform's locale encoding and can mis-decode non-ASCII trait text.
with open(F"{traits_folder}{dataset}.json", 'r', encoding='utf-8') as f:
    traits_dict = json.load(f)

print(traits_dict.keys())
print(len(traits_dict))

if dataset == 'Palm':
    # The 'Measurement' trait is excluded for the Palm dataset. pop() with a
    # default does not raise KeyError when the key is absent.
    traits_dict.pop('Measurement', None)
    print(traits_dict.keys())
    print(len(traits_dict))

# Also save the traits in a more user friendly form, i.e., text:
# one "<trait name>: <list of values>" line per trait.
with open('{}/traits.txt'.format(results_folder), 'w', encoding='utf-8') as f:
    for tname in traits_dict:
        f.write('{}: {}\n'.format(tname, traits_dict[tname]))

In [None]:
# 2. Load the pickled sentences/paragraphs: the relevant texts from which we
# want to extract traits.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
sentences_file = os.path.join(sentences_folder, 'Sents_{}.pkl'.format(dataset))
with open(sentences_file, 'rb') as pickle_handle:
    sentences = pickle.load(pickle_handle, encoding='utf8')

# Clean and de-duplicate the sentence list stored under each key.
sentences_cleaned = create_clean_and_unique_sentences(sentences)


In [None]:
# Query-size settings, recorded next to the results for later reference.
num_sentences_per_query = 4
num_traits_per_query = 4

settings_path = '{}/settings.txt'.format(results_folder)
settings_text = 'Num sentences per query: {}\nNum Traits per Query: {}'.format(
    num_sentences_per_query, num_traits_per_query
)
with open(settings_path, 'w') as settings_file:
    settings_file.write(settings_text)

In [None]:
# Query the LLM once per species: all cleaned sentences of the species and the
# complete trait dictionary are sent in a single prompt, and the raw response is
# stored under <results_folder>/<species>/.

def _build_trait_prompt(species, paragraph, traits):
    """Assemble the prompt asking the LLM to flag evidence for every trait value.

    Parameters
    ----------
    species : str
        Species name inserted in the first sentence of the prompt.
    paragraph : str
        Input text (the species' sentences, newline-separated).
    traits : dict
        Maps trait name -> list of possible values. Rendered as
        '{"Name": [values], ...}' with capitalised names, in dict order.

    Returns
    -------
    str
        The complete prompt text.
    """
    # join() instead of trimming a trailing ', ' so that an empty trait
    # dictionary still renders as a well-formed '{}'.
    pos_traits = '{' + ', '.join(
        '\"{}\": {}'.format(name.capitalize(), values)
        for name, values in traits.items()
    ) + '}'

    text = 'We are interested in obtaining botanical trait information about the species {}.\n\n'.format(species)
    text += 'We will provide an input text with botanical descriptions,'\
            'followed by a dictionary where each key \'name\' represents a trait name, '\
            'referring to specific organ or other element of the plant, and is associated to a list '\
            'with all possible trait values for that trait, [\'value_1\', \'value_2\', ..., \'value_n\'].\n\n'

    text += 'Input text:\n'
    text += paragraph +'\n\n'

    text += 'Initial dictionary of traits with all possible values:\n'
    text += pos_traits +'\n\n'

    text += 'Turn each string s in the list of values in the dictionary into a sublist (s,b), where b is a binary number,'\
             'either 0 or 1, indicating whether there is strong evidence for value s in the input text. '
    text+= 'Double check that \'value_i\' is reported referring to trait \'name\' in the text, '\
            'and not to a different trait. Always set \'b\' to \'0\' if you are not 100% sure about '\
            'the association. Do not add new trait values and do not modify the initial ones.Return the dictionary of traits and sublists of (value, evidence) containing ALL POSSIBLE NAMES AND (values, evidence) tuples.\n\n'
    text += 'Output only a dictionary in JSON format, no other text at all.\n\n'
    return text


def _query_llm(messages, max_attempts=5, wait_seconds=5):
    """Send `messages` to the chat model, retrying until the reply parses as JSON.

    Returns
    -------
    tuple
        (chat_response, content) of the last complete reply received. The
        content is valid JSON only if an attempt succeeded; when every
        attempt failed it is the last raw reply, or (None, None) if no
        reply was received at all.
    """
    chat_response, content = None, None
    for attempt in range(1, max_attempts + 1):
        try:
            response = client.chat(
                model=model,
                #response_format={"type": "json_object"},
                messages=messages,
            )
            reply = response.choices[0].message.content
            # Remember the latest complete reply before validating it, so the
            # raw text can still be saved if it turns out not to be JSON.
            chat_response, content = response, reply
            json.loads(reply)  # validation only; the parsed value is not used
            return chat_response, content
        except Exception as e:  # JSONDecodeError is a subclass of Exception
            print('Some Kind of Error, {}'.format(e))
            # No point in waiting after the final attempt.
            if attempt < max_attempts:
                time.sleep(wait_seconds)
    return chat_response, content


# traits are in traits
traits_names = list(traits_dict.keys())
traits_names_cap = [tr.capitalize() for tr in traits_names]

# `allowed_species` is an optional collection of species names (blanks replaced
# by underscores) restricting which species are queried. It is not defined in
# any cell shown in this notebook, so fall back to "no filter" instead of
# failing with a NameError on a fresh kernel.
species_filter = globals().get('allowed_species')
if species_filter is None:
    print('allowed_species is not defined: querying all {} species'.format(len(sentences_cleaned)))

# parse all the species
for idx, species in enumerate(sentences_cleaned):

    # replace blanks with underscores to avoid potential issues in folder names
    species_key = species.replace(' ', '_')
    if species_filter is not None and species_key not in species_filter:
        continue

    print('Cur Species Num: {}/{} Name: {}'.format(idx, len(sentences_cleaned), species))

    # create the folder for the species
    species_folder = results_folder + '/{}'.format(species_key)
    os.makedirs(species_folder, exist_ok = True)

    # this is the list with all the sentences of the species
    sentences_ = sentences_cleaned[species]

    # save cleaned sentences in a txt format (explicit UTF-8: the text may be non-ASCII)
    with open('{}/sentences_cleaned.txt'.format(species_folder), 'w', encoding='utf-8') as f:
        for sent in sentences_:
            f.write(sent+'\n')

    # all the responses and contents only for the species
    responses_full = []
    contents = []
    gpt_dict_traits = {}

    cur_paragraph = '\n'.join(sentences_)
    text = _build_trait_prompt(species, cur_paragraph, traits_dict)

    cur_path = '{}/results/'.format(species_folder)
    os.makedirs(cur_path, exist_ok = True)

    messages = [ChatMessage(role="user", content = text)]
    chat_response, content = _query_llm(messages)

    if content is None:
        # Every attempt failed before a reply was received. Skip the species
        # rather than crash or write another species' response into this folder.
        print('No response obtained for {}, skipping'.format(species))
        continue

    with open('{}/mistral_response_full.txt'.format(cur_path), 'w', encoding='utf-8') as f:
        f.write(str(chat_response))
    with open('{}/mistral_response_content_only.txt'.format(cur_path), 'w', encoding='utf-8') as f:
        f.write(content)

    with open('{}/mistral_sent_info_and_content.txt'.format(cur_path), 'w', encoding='utf-8') as f:
        f.write('{}\n\n{}'.format(text, content))

    responses_full.append(str(chat_response))
    contents.append(content)

    with open('{}/responses_all.txt'.format(species_folder), 'w', encoding='utf-8') as f:
        for resp in responses_full:
            f.write(resp + '\n\n')
    with open('{}/contents_all.txt'.format(species_folder), 'w', encoding='utf-8') as f:
        for cont in contents:
            f.write(cont + '\n\n')
    
   
    