# In [None]:
# Import necessary libraries
import pandas as pd
import requests
import os
import json
import time
import logging

# Route progress and error messages to a log file.
logging.basicConfig(
    filename='smiles_retrieval.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s',
)

# Read the ingredient table that will be annotated with SMILES strings.
iig_data = pd.read_csv("IIR_OCOMM_missingremoved.csv")

# JSON file that persists the ingredient -> SMILES mapping between runs.
cache_file = "smiles_cache.json"

# Start from an empty mapping and replace it with the cached one when the
# cache file exists and parses; an unparsable cache is logged and ignored.
ingredient_to_smiles = {}
if os.path.exists(cache_file):
    try:
        with open(cache_file, 'r') as cache_handle:
            ingredient_to_smiles = json.load(cache_handle)
    except json.JSONDecodeError:
        logging.error("Failed to load cache file due to JSONDecodeError.")
    
# Function to get canonical SMILES from the PubChem PUG REST API
def get_canonical_smiles(ingredient_name, retries=3):
    """Look up the canonical SMILES for an ingredient name on PubChem.

    Args:
        ingredient_name: Compound name to search for. It is converted to
            text and percent-encoded before being placed in the URL path.
        retries: Maximum number of request attempts.

    Returns:
        The stripped response body when PubChem answers with HTTP 200, or
        None when it answers 404 or when every attempt fails.
    """
    from urllib.parse import quote

    # Encode every reserved character (including "/", "?", "#" and "%") so a
    # name can never alter the path or truncate the URL.
    encoded_name = quote(str(ingredient_name), safe="")
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/property/CanonicalSMILES/TXT"

    for attempt in range(retries):
        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(url, timeout=30)
        except requests.RequestException as e:
            logging.error(f"Request error for ingredient: {ingredient_name}, attempt {attempt + 1}/{retries}: {e}")
        else:
            if response.status_code == 200:
                # NOTE(review): the whole body is returned; if PubChem lists
                # several matches it may span multiple lines - confirm
                # whether only the first should be kept.
                return response.text.strip()
            if response.status_code == 404:
                logging.warning(f"SMILES not found for ingredient: {ingredient_name}")
                return None
            logging.error(f"Error {response.status_code} for ingredient: {ingredient_name}")

        # Back off before the next attempt, whether the failure was an
        # exception or an unexpected HTTP status; no wait after the last one.
        if attempt + 1 < retries:
            time.sleep(1)
    return None

# Fill in the ingredient -> canonical SMILES mapping from PubChem for every
# ingredient name that is not already cached.
# PubChem asks clients to stay at or below 5 requests per second, so a short
# pause follows each lookup.
pubchem_request_interval = 0.2  # seconds
try:
    for ingredient in iig_data['INGREDIENT_NAME']:
        if ingredient in ingredient_to_smiles:
            continue
        smiles = get_canonical_smiles(ingredient)
        if smiles is not None:
            logging.info(f"Retrieved SMILES for ingredient: {ingredient}")
        # None is stored as well, so a name without a result is not looked
        # up again on later runs.
        ingredient_to_smiles[ingredient] = smiles
        time.sleep(pubchem_request_interval)
finally:
    # Save the cache even when the loop is interrupted or raises, so the
    # lookups completed so far are not lost.
    with open(cache_file, 'w') as file:
        json.dump(ingredient_to_smiles, file)

# Adds the canonical SMILES to the iig DataFrame
iig_data['CanonicalSMILES'] = iig_data['INGREDIENT_NAME'].map(ingredient_to_smiles)

# Saves the updated DataFrame to a new CSV file
iig_data.to_csv("~/BIO730/IIR_OCOMM_with_SMILES.csv", index=False)