In [None]:
import torch
import re
import os
import os.path
import json
import pickle
import openai
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
import random
import time
from dotenv import load_dotenv

# this is new
from unidecode import unidecode
from unicodedata import normalize
import tiktoken
import json
from json.decoder import JSONDecodeError



In [None]:
# Load variables from a local .env file into the process environment so the
# API key lookup below can find MISTRAL_API_KEY.
load_dotenv()

# create your LLM client here
# NOTE(review): these import paths look like the legacy (0.x) `mistralai`
# package layout; newer releases reorganised the client API — confirm the
# pinned package version before re-running.
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# Raises KeyError when MISTRAL_API_KEY is not set in the environment.
api_key = os.environ["MISTRAL_API_KEY"]
# Model identifier passed to every client.chat(...) call in this notebook.
model = "mistral-medium-latest"

# Shared client instance used by the querying cell further down.
client = MistralClient(api_key=api_key)

In [None]:
def combine_words_with_capital(string):
    """Collapse a phrase into a single CamelCase-style token.

    Every character that is not a word character, whitespace or a slash is
    dropped. The remainder is split on slashes and then on whitespace; each
    resulting word is capitalized (first letter upper-case, rest lower-case)
    and all words are concatenated without any separator.

    Args:
        string: The phrase to convert.

    Returns:
        The concatenated, capitalized words as one string.
    """
    stripped = re.sub(r'[^\w\s/]', '', string)

    camel_chunks = []
    for segment in stripped.split('/'):
        # Whitespace-split so that padding around the slash is ignored.
        for token in segment.split():
            camel_chunks.append(token.capitalize())

    return ''.join(camel_chunks)

def create_clean_paragraphs(input_dict):
    """Join the unique sentences of every key into one paragraph string.

    Duplicates are removed while keeping the first-occurrence order of the
    sentences. (Deduplicating through a ``set`` would make the order of the
    joined sentences vary between interpreter runs, so the resulting
    paragraphs would not be reproducible.)

    Args:
        input_dict: Mapping from a key to a list of sentence strings.

    Returns:
        A new dict mapping each key to its unique sentences joined by a
        single space. An empty list yields an empty string.
    """
    return {
        key: ' '.join(dict.fromkeys(values))  # order-preserving de-duplication
        for key, values in input_dict.items()
    }


# this is new 
def clean_sentence(sentence: str) -> str:
    """Replace a fixed list of mis-encoded character sequences in a sentence.

    The replacements run strictly left to right, and later ones depend on
    earlier ones (e.g. 'Ã' becomes 'x' first, and only then is 'x©' turned
    into 'e'). Leading and trailing whitespace is stripped at the end.

    Args:
        sentence: The raw sentence text.

    Returns:
        The sentence after all replacements and stripping.
    """
    # First pass: drop 'Â', turn 'â' into '-', the middle dot into '.',
    # 'Ã' into 'x', and the non-breaking space (\xa0) into a plain space.
    sent = sentence.replace('Â','').replace('â', '-').replace('·','.').replace('Ã','x').replace(u'\xa0', u' ')
    # NOTE(review): as written, the two 'â' replacements below cannot match
    # anything because the first pass already replaced every 'â'. Possibly the
    # literals originally carried extra (now invisible) mojibake bytes — confirm
    # against the original notebook file.
    # 'x©' presumably targets what is left of a mis-decoded 'é' after the
    # 'Ã' -> 'x' step above.
    sent = sent.replace('â', '').replace('â', '-').replace('x©', 'e').strip()
    
    return sent

def create_clean_and_unique_sentences(input_dict):
    """Return, per key, the cleaned unique sentences that have at least two words.

    Duplicates are removed before cleaning, keeping first-occurrence order.
    (Deduplicating through a ``set`` would make the order of the returned
    sentences vary between interpreter runs.) A sentence is kept only when
    splitting it on single spaces yields two or more pieces; kept sentences
    are passed through ``clean_sentence``.

    Args:
        input_dict: Mapping from a key to a list of sentence strings.

    Returns:
        A new dict mapping each key to the list of cleaned sentences.
    """
    output_dict = {}

    for key, value in input_dict.items():
        output_dict[key] = [
            clean_sentence(sent)
            for sent in dict.fromkeys(value)  # order-preserving de-duplication
            if len(sent.split(' ')) >= 2
        ]

    return output_dict
        
    
def create_even_cleaner_paragraphs(input_dict):
    """Join each key's unique sentences into one paragraph and clean it.

    Duplicates are removed while keeping first-occurrence order. (Deduplicating
    through a ``set`` would make the sentence order inside the paragraph vary
    between interpreter runs.) The joined paragraph is then passed through
    ``clean_sentence``.

    Args:
        input_dict: Mapping from a key to a list of sentence strings.

    Returns:
        A new dict mapping each key to its cleaned paragraph string.
    """
    output_dict = {}

    for key, value in input_dict.items():
        # Order-preserving de-duplication, then one space-separated paragraph.
        combined_string = ' '.join(dict.fromkeys(value))
        output_dict[key] = clean_sentence(combined_string)

    return output_dict
    

In [None]:
# Input locations: everything is read from below 'Data/'.
data_folder = 'Data/'
survey_folder = 'Data/'
dataframes_folder = data_folder + 'DataFrames/'
traits_folder = data_folder + 'Traits/'

# Output locations: aggregate files go to results_folder, one
# prompt/response file per sentence goes to sentence_fol.
results_folder = 'SurveyResults/Mistral/'
sentence_fol = results_folder + '/per_sentence'

# Create both output directories up front; existing directories are left alone.
for output_dir in (results_folder, sentence_fol):
    os.makedirs(output_dir, exist_ok=True)


Read: 1) The traits for the given name. 2) The sentences/paragraphs.

In [None]:
# 1. Load the trait dictionaries that are later used to build the LLM queries.
#    Each JSON file becomes one dictionary.
with open(traits_folder + 'Caribbean.json', 'r') as caribbean_file:
    traits_dict_caribbean = json.load(caribbean_file)

# Note: the West.json content is stored under the name traits_dict_pnet.
with open(traits_folder + 'West.json', 'r') as west_file:
    traits_dict_pnet = json.load(west_file)

with open(traits_folder + 'Palm.json', 'r') as palm_file:
    traits_dict_palm = json.load(palm_file)


In [None]:
# Show every trait loaded from West.json together with its stored value,
# one entry per trait followed by an empty line.
for key, trait_values in traits_dict_pnet.items():
    print('{}:{}\n'.format(key, trait_values))

In [None]:
# 2. Load the survey answers into a DataFrame.
survey_csv_path = survey_folder + 'answers_surveys.csv'
surveys = pd.read_csv(survey_csv_path)

In [None]:
# Pull the survey columns out as arrays so that they can be indexed by row
# position. Note that `results` is rebound to a fresh list by the querying
# cell further down.
species, main_traits, sentences, dataset, results, subtraits = (
    surveys[column].values
    for column in (
        'Species',
        'Main Trait',
        'Sentence',
        'Dataset',
        'Result',
        'GT Sub Traits',
    )
)

### Check that all traits in the file are found in the GT

In [None]:
# List the trait names available in the Caribbean and West dictionaries.
for trait_dictionary in (traits_dict_caribbean, traits_dict_pnet):
    print(trait_dictionary.keys())

In [None]:
# Report every survey trait that is missing from all three trait dictionaries.
known_traits = set(traits_dict_caribbean) | set(traits_dict_pnet) | set(traits_dict_palm)

for idx, key in enumerate(main_traits):
    # Normalise the survey spelling: drop ' /' and surrounding whitespace.
    key = key.replace(' /','').strip()
    if key not in known_traits:
        # The placeholders must be filled through .format(); passing the values
        # as extra print() arguments would print the literal '{}' braces.
        print('Error trait: {} dataset: {}'.format(key, dataset[idx]))

## Combine sentences and traits.
Iterate over the given configurations and query the LLM (Mistral).

In [None]:
# Query the LLM once per selected survey sentence, save each prompt/response
# pair to its own file, and finally write the collected raw responses to the
# aggregate files.

# NOTE(review): only the sentence indices listed here are sent to the model;
# all other sentences are skipped. Because `results` is rebuilt from scratch,
# the aggregate files written after the loop contain the responses for these
# indices only. Adjust the list to process other sentences.
indices_to_query = [188]

# Number of attempts per sentence and pause (seconds) between failed attempts.
MAX_RETRIES = 5
RETRY_WAIT_SECONDS = 5


def _query_llm_with_retries(messages, max_retries=MAX_RETRIES, wait_seconds=RETRY_WAIT_SECONDS):
    """Send `messages` to the model until the reply parses as JSON.

    Uses the notebook-level `client` and `model`.

    Args:
        messages: List of chat messages forming the request.
        max_retries: Maximum number of attempts.
        wait_seconds: Pause between two attempts after a failure.

    Returns:
        A tuple ``(raw_content, parsed)``. ``raw_content`` is the text of the
        last reply received for this request ('' if no attempt produced a
        reply). ``parsed`` is the JSON-decoded reply, or None when no reply
        could be decoded within `max_retries` attempts.
    """
    # Start from an empty reply so that a complete failure can never leave the
    # caller with a stale response belonging to a previous sentence.
    raw_content = ''
    for attempt in range(1, max_retries + 1):
        try:
            chat_response = client.chat(
                model=model,
                messages=messages,
            )
            raw_content = chat_response.choices[0].message.content
            return raw_content, json.loads(raw_content)
        except Exception as e:
            # Deliberately broad: API errors and invalid JSON (JSONDecodeError
            # is a subclass of Exception) are both retried.
            print('Some Kind of Error, {}'.format(e))
            if attempt < max_retries:
                time.sleep(wait_seconds)
    return raw_content, None


# Raw model responses, in processing order. This rebinds the `results` name
# that earlier held the survey 'Result' column.
results = []
for idx, sentence in enumerate(sentences):
    print('Cur Sentence Num: {}/{}'.format(idx, len(sentences)))

    if idx not in indices_to_query:
        continue

    # get the current info for this question
    main_trait = main_traits[idx]
    # NOTE(review): eval() executes whatever expression is stored in the CSV
    # cell. If the column only holds list literals, ast.literal_eval would be
    # the safe equivalent — confirm the column contents before switching.
    subtrait = eval(subtraits[idx])
    spec = species[idx]

    trait_list = [main_trait]
    # Dictionary shown to the model: the capitalized trait name mapped to the
    # Python repr of its possible values.
    pos_traits = "{" + '\"{}\": {} '.format(main_trait.capitalize(), subtrait) + '}'

    # NOTE(review): a few adjacent string pieces below join without a space
    # ("descriptions,followed", "number,either", "ones.Return"). The prompt is
    # kept byte-identical so new responses stay comparable with earlier runs.
    text = 'We are interested in obtaining botanical trait information about the species {}.\n\n'.format(spec)
    text += 'We will provide an input text with botanical descriptions,'\
            'followed by a dictionary where each key \'name\' represents a trait name, '\
            'referring to specific organ or other element of the plant, and is associated to a list '\
            'with all possible trait values for that trait, [\'value_1\', \'value_2\', ..., \'value_n\'].\n\n'

    text += 'Input text:\n'
    text += sentence +'\n\n'

    text += 'Initial dictionary of traits with all possible values:\n'
    text += pos_traits +'\n\n'

    text += 'Turn each string s in the list of values in the dictionary into a sublist (s,b), where b is a binary number,'\
             'either 0 or 1, indicating whether there is strong evidence for value s in the input text. '
    text+= 'Double check that \'value_i\' is reported referring to trait \'name\' in the text, '\
            'and not to a different trait. Always set \'b\' to \'0\' if you are not 100% sure about '\
            'the association. Do not add new trait values and do not modify the initial ones.Return the dictionary of traits and sublists of (value, evidence) containing ALL POSSIBLE NAMES AND (values, evidence) tuples.\n\n'
    text += 'Output only a dictionary in JSON format, no other text at all.\n\n'

    messages = [ChatMessage(role="user", content = text)]

    # `content` is the raw reply ('' when every attempt failed before a reply
    # arrived); `content_as_json` is None when no reply could be parsed.
    content, content_as_json = _query_llm_with_retries(messages)

    results.append(content)

    # Keep the exact prompt and the raw response together for later inspection
    # and for the extraction cell that re-reads these files.
    with open('{}/sentence_{}_prompt_and_response.txt'.format(sentence_fol, idx), 'w') as f:
        f.write('{}'.format(text))
        f.write('\n\n{}'.format(content))

# Aggregate files: raw responses as received, and a variant with each response
# flattened to a single line.
with open(results_folder + '/responses_mistral_surveys.txt', 'w') as f:
    for res in results:
        f.write(res + '\n')
with open(results_folder + '/responses_mistral_surveys_cleaned.txt', 'w') as f:
    for res in results:
        f.write(res.replace('\n','') + '\n')


In [None]:
# Rewrite the cleaned response file: keep only the text that precedes the
# first 'Note' and the first 'Explanation', flattened to one line per response.
with open(results_folder + '/responses_mistral_surveys_cleaned.txt', 'w') as f:
    for res in results:
        res = res.partition('Note')[0].partition('Explanation')[0]
        single_line = res.replace('\n','')
        f.write(single_line + '\n')

In [None]:
# Preview the first collected response with all line breaks removed.
''.join(results[0].split('\n'))

Debugging: inspect the most recent response by hand if the previous cell fails.

In [None]:
# Scratch cell for inspecting the most recent model response by hand.
# It relies on names left behind by earlier cells (`content`, `key`,
# `trait_list`), so it only works after those cells have run in this kernel.
# Raises JSONDecodeError when `content` is not valid JSON.
content_as_json = json.loads(content)
print(content_as_json)
# NOTE(review): `key` is whatever the last loop that used this name left
# behind — confirm it matches a key of the response before trusting the output.
print(key)
# NOTE(review): eval() on model output executes arbitrary expressions. This
# presumably expects each list entry to be a string such as "('value', 1)";
# ast.literal_eval would be the safe way to parse that — confirm the format.
print(eval(content_as_json[key][0]))
print(eval(content_as_json[key][0])[0])
print(type(eval(content_as_json[key][0])[1]))

# A bare expression that is not the last statement of a cell is not displayed
# by default; only the final lookup below is shown.
trait_list
# Hard-coded trait name; raises KeyError if the response has no 'Bark' entry.
content_as_json['Bark']

In [None]:
# Extract the model's dictionary from every saved prompt/response file and
# write one flattened dictionary per line to the cleaned response file.
import regex

# Matches a balanced {...} group, including nested braces, through the
# recursive (?R) construct of the `regex` package.
pattern = regex.compile(r'\{(?:[^{}]|(?R))*\}')

# Final instruction of the prompt written by the querying cell. Everything
# after its last occurrence in a saved file is the model's response.
prompt_end_marker = 'Output only a dictionary in JSON format, no other text at all.'

# NOTE(review): hard-coded number of per-sentence files to read; presumably
# equal to the number of survey sentences — confirm.
num_sentence_files = 1216

response_final = []
missing_json = []  # indices whose response contains no {...} group
for i in range(num_sentence_files):
    with open(results_folder + '/per_sentence/sentence_{}_prompt_and_response.txt'.format(i), 'r') as f:
        resp = f.read()

    # Each saved file starts with the prompt, which itself contains a {...}
    # dictionary. Searching the whole file would silently return that prompt
    # dictionary whenever the response holds no JSON, so only the part after
    # the prompt is searched. If the marker is absent, rpartition returns the
    # whole text as the last element and the full file is searched.
    response_part = resp.rpartition(prompt_end_marker)[2]
    matches = pattern.findall(response_part)
    if matches:
        patt = matches[-1]
    else:
        # Keep one line per sentence so that line numbers stay aligned.
        patt = ''
        missing_json.append(i)
    response_final.append(patt)

if missing_json:
    print('No JSON dictionary found in the response for sentences: {}'.format(missing_json))

with open(results_folder + '/responses_mistral_surveys_cleaned.txt', 'w') as f:
    for res in response_final:
        f.write(res.replace('\n','') + '\n')



In [None]:
# Print the complete list of extracted dictionaries (one entry per saved
# sentence file); the output can be very long.
print(response_final)