<a href="https://colab.research.google.com/github/hsandaver/essays/blob/main/DyeClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install RDKit (use in Colab)
!pip install rdkit pandas

# Import necessary libraries
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from google.colab import files

# Allow user to upload file in Colab.
# `uploaded` is used below as a mapping whose keys are the uploaded file names.
uploaded = files.upload()

# Fail early with a clear message: without this check an empty/cancelled
# upload leaves `filename` unbound and the read below dies with a NameError.
if not uploaded:
    raise ValueError("No file was uploaded; upload the dye CSV file and re-run this cell.")

# Assuming the uploaded file is a CSV
for filename in uploaded:
    print(f"User uploaded file '{filename}'")

# The loop leaves `filename` bound to the last uploaded name; say so explicitly
# when more than one file was provided so the choice is not silent.
if len(uploaded) > 1:
    print(f"Multiple files uploaded; loading '{filename}'")

# Load the uploaded file into a pandas DataFrame
dye_data = pd.read_csv(filename)

# Define SMARTS patterns for chromophore detection.
# Each name below is a module-level query molecule (or None when the SMARTS
# string fails to compile) that identify_chromophores() looks up by name.
# NOTE(review): in SMARTS an uppercase 'C'/'N' denotes an aliphatic atom, and
# RDKit perceives aromaticity when parsing SMILES, so the Kekulé-style ring
# patterns below may not match aromatic dye cores — verify against known dyes.
azo_pattern = Chem.MolFromSmarts('N=N')  # Azo group
anthraquinone_pattern = Chem.MolFromSmarts('C1=CC=C2C(=O)C=CC2=O')  # Anthraquinone
# Nitro group: accept both the pentavalent drawing N(=O)=O and the
# charge-separated form [N+](=O)[O-]; the latter is how nitro groups are
# normally written in SMILES, and the original '[NX3](=O)=O' missed it.
nitro_pattern = Chem.MolFromSmarts('[$([NX3](=O)=O),$([NX3+](=O)[O-])]')
quinone_pattern = Chem.MolFromSmarts('O=C1C=CC=CC1=O')  # Quinone
indigo_pattern = Chem.MolFromSmarts('C1C=CC(=O)NC1=O')  # Indigoid pattern
cyanine_pattern = Chem.MolFromSmarts('C=N-C=N')  # Cyanine (simplified)
xanthene_pattern = Chem.MolFromSmarts('O1C=CC2=C1C=CC=C2')  # Xanthene (simplified)
thiazine_pattern = Chem.MolFromSmarts('N1C=NC=S1')  # Thiazine (sulfur-based)
coumarin_pattern = Chem.MolFromSmarts('O=C1OC=CC2=CC=CC=C12')  # Coumarin
porphyrin_pattern = Chem.MolFromSmarts('N1C=CC=N1')  # Porphyrin ring
phthalocyanine_pattern = Chem.MolFromSmarts('C1=C(C2=NC=C(C)N2)C3=CC=CC=C13')  # Phthalocyanine-like
carotenoid_pattern = Chem.MolFromSmarts('C=C(C)C=CC=C')  # Carotenoid (simplified)
# NOTE(review): '[M]' is a placeholder and is probably not valid SMARTS; if it
# fails to compile this is None and the warning loop below reports it. An
# explicit element list (e.g. '[Fe,Co,Ni,Cu,Zn,Cr]') would be needed to
# actually detect metal centres — confirm the intended metals.
metal_complex_pattern = Chem.MolFromSmarts('[M]')  # Placeholder for metal coordination complexes
squaraine_pattern = Chem.MolFromSmarts('C=CC=C')  # Squaraines (simplified conjugated)

# New SMARTS patterns for heavy atoms and unusual ligands
bromine_pattern = Chem.MolFromSmarts('Br')  # Bromine
selenium_pattern = Chem.MolFromSmarts('Se')  # Selenium

# Refined SMARTS patterns for metal complexes
pyridine_pattern = Chem.MolFromSmarts('C1=CC=NC=C1')  # Pyridine ligand
phosphine_pattern = Chem.MolFromSmarts('P(C)(C)C')  # Phosphine ligand
# NOTE(review): '[C]' matches any aliphatic carbon, not specifically a carbene.
carbene_pattern = Chem.MolFromSmarts('[C]')  # Carbene ligand

# Registry of every compiled pattern, keyed by a human-readable name.
# The original code called patterns.update(...) without ever creating the
# dictionary, which raised NameError; build it in full here instead.
patterns = {
    'Azo': azo_pattern,
    'Anthraquinone': anthraquinone_pattern,
    'Nitro': nitro_pattern,
    'Quinone': quinone_pattern,
    'Indigoid': indigo_pattern,
    'Cyanine': cyanine_pattern,
    'Xanthene': xanthene_pattern,
    'Thiazine': thiazine_pattern,
    'Coumarin': coumarin_pattern,
    'Porphyrin': porphyrin_pattern,
    'Phthalocyanine': phthalocyanine_pattern,
    'Carotenoid': carotenoid_pattern,
    'Metal Complex': metal_complex_pattern,
    'Squaraine': squaraine_pattern,
    'Bromine': bromine_pattern,
    'Selenium': selenium_pattern,
    'Pyridine': pyridine_pattern,
    'Phosphine': phosphine_pattern,
    'Carbene': carbene_pattern,
}

# Report any SMARTS string that could not be compiled (its entry is None).
for name, pattern in patterns.items():
    if pattern is None:
        print(f"Warning: Failed to compile SMARTS pattern for {name}")

def identify_chromophores(smiles):
    """Return the label of the first chromophore pattern found in a SMILES string.

    The module-level SMARTS patterns are tried in a fixed priority order and
    the label of the first one that matches is returned, so a molecule that
    contains several motifs is reported under the highest-priority one only.

    Args:
        smiles: SMILES string of the molecule. Non-string values (for example
            the NaN pandas produces for an empty CSV cell) are accepted and
            classified as 'Unknown' instead of raising inside RDKit.

    Returns:
        The chromophore label, or 'Unknown' when the input is not a string,
        cannot be parsed, or matches none of the patterns.
    """
    # Guard before touching RDKit: MolFromSmiles rejects non-string arguments.
    if not isinstance(smiles, str):
        return 'Unknown'

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 'Unknown'

    # (pattern, label) pairs in priority order; built per call so the
    # module-level pattern names are resolved at call time.
    # Note: the pyridine/phosphine/carbene labels are returned on a ligand
    # match alone; no metal atom is checked for here.
    ordered_checks = (
        (azo_pattern, 'Azo'),
        (anthraquinone_pattern, 'Anthraquinone'),
        (nitro_pattern, 'Nitro'),
        (quinone_pattern, 'Quinone'),
        (indigo_pattern, 'Indigoid'),
        (cyanine_pattern, 'Cyanine'),
        (xanthene_pattern, 'Xanthene'),
        (thiazine_pattern, 'Thiazine'),
        (coumarin_pattern, 'Coumarin'),
        (porphyrin_pattern, 'Porphyrin'),
        (phthalocyanine_pattern, 'Phthalocyanine'),
        (carotenoid_pattern, 'Carotenoid'),
        (squaraine_pattern, 'Squaraine'),
        (metal_complex_pattern, 'Metal Complex'),
        (bromine_pattern, 'Bromine-based dye'),
        (selenium_pattern, 'Selenium-based dye'),
        (pyridine_pattern, 'Metal Complex with Pyridine Ligand'),
        (phosphine_pattern, 'Metal Complex with Phosphine Ligand'),
        (carbene_pattern, 'Metal Complex with Carbene Ligand'),
    )

    for pattern, label in ordered_checks:
        # A pattern that failed to compile is None and is simply skipped.
        if pattern and mol.HasSubstructMatch(pattern):
            return label

    return 'Unknown'

# Define a function to detect auxochromes
# (label, SMARTS) pairs, in the order the labels are reported.
_AUXOCHROME_SMARTS = (
    ('Hydroxyl (-OH)', '[OX2H]'),
    ('Amine (-NH2)', '[NX3;H2]'),
    ('Methoxy (-OCH3)', '[OX2]([#6])[CH3]'),
    ('Thiol (-SH)', '[SX2H]'),
    ('Carboxyl (-COOH)', '[CX3](=O)[OX2H1]'),
)

# Compiled (label, query molecule) pairs; filled on first use so that defining
# this block does not require RDKit to be importable yet.
_auxochrome_queries = []


def identify_auxochromes(smiles):
    """List the auxochrome groups present in a molecule.

    Groups are detected by substructure matching on the parsed molecule.
    The original implementation searched the raw SMILES text for fragments
    such as 'OH' or 'COOH'; SMILES does not spell groups that way (a hydroxyl
    is normally just 'O', a methoxy 'OC'), so those checks almost never fired.

    Args:
        smiles: SMILES string of the molecule. Non-string values (e.g. NaN
            from an empty CSV cell) are treated as unparseable.

    Returns:
        A comma-separated string of the matched group labels, 'None' when no
        group matches, or 'Unknown' when the input is not a parseable SMILES
        string.
    """
    if not isinstance(smiles, str):
        return 'Unknown'

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 'Unknown'

    if not _auxochrome_queries:
        _auxochrome_queries.extend(
            (label, Chem.MolFromSmarts(smarts)) for label, smarts in _AUXOCHROME_SMARTS
        )

    # A carboxylic acid also contains an O-H, so it is reported as both
    # hydroxyl and carboxyl — the same pairing the old 'OH'/'COOH' text
    # checks would have produced.
    auxochromes = [
        label
        for label, query in _auxochrome_queries
        if query is not None and mol.HasSubstructMatch(query)
    ]

    return ', '.join(auxochromes) if auxochromes else 'None'

# Define a function to calculate descriptors
def calculate_descriptors(smiles):
    """Compute a small set of RDKit descriptors for a molecule.

    Args:
        smiles: SMILES string of the molecule. Non-string values (e.g. NaN
            from an empty CSV cell) are treated as unparseable.

    Returns:
        A dict with the keys 'MolWeight', 'LogP', 'TPSA', 'NumRings' and
        'NumDoubleBonds', or None when the input is not a parseable SMILES
        string.
    """
    # Guard before touching RDKit: MolFromSmiles rejects non-string arguments.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    mol_weight = Descriptors.MolWt(mol)  # Molecular Weight
    logP = Descriptors.MolLogP(mol)      # Hydrophobicity (LogP)
    tpsa = Descriptors.TPSA(mol)         # Topological Polar Surface Area (TPSA)
    num_rings = Descriptors.RingCount(mol)  # Number of rings
    # NOTE(review): despite the key name, this value is the number of aromatic
    # rings, not a double-bond count. It is kept as-is because the thresholds
    # in estimate_color() were written against it — confirm the intent before
    # changing either side.
    num_double_bonds = Descriptors.NumAromaticRings(mol)

    return {
        'MolWeight': mol_weight,
        'LogP': logP,
        'TPSA': tpsa,
        'NumRings': num_rings,
        'NumDoubleBonds': num_double_bonds,
    }

# Translate a chromophore label into the color family it is associated with
def chromophore_to_color(chromophore):
    """Look up the color family string for a chromophore label.

    Args:
        chromophore: Label as produced by the chromophore classifier.

    Returns:
        The color family string for that label, or 'Unknown' when the label
        has no entry in the table.
    """
    color_families = dict((
        ('Azo', 'Red/Orange/Yellow'),
        ('Anthraquinone', 'Red/Blue/Violet'),
        ('Nitro', 'Yellow/Orange'),
        ('Quinone', 'Yellow/Orange/Brown'),
        ('Indigoid', 'Blue/Purple'),
        ('Cyanine', 'Green/Blue'),
        ('Xanthene', 'Yellow/Orange'),
        ('Thiazine', 'Blue/Green'),
        ('Coumarin', 'Blue/Green'),
        ('Porphyrin', 'Red/Purple'),
        ('Phthalocyanine', 'Green/Blue'),
        ('Carotenoid', 'Yellow/Orange'),
        ('Squaraine', 'Red/Purple'),
        ('Bromine-based dye', 'Dark Green/Purple'),
        ('Selenium-based dye', 'Deep Blue/Purple'),
        ('Metal Complex', 'Varies (often Green/Blue)'),
        ('Metal Complex with Pyridine Ligand', 'Varies (Green/Blue/Yellow)'),
        ('Metal Complex with Phosphine Ligand', 'Varies (Yellow/Green)'),
        ('Metal Complex with Carbene Ligand', 'Varies (Red/Purple)'),
    ))

    try:
        return color_families[chromophore]
    except KeyError:
        # Label without a table entry.
        return 'Unknown'

# Combine chromophore, auxochrome and descriptor information into a color guess
def estimate_color(chromophore, auxochromes, descriptors):
    """Estimate a color description for one dye.

    Args:
        chromophore: Chromophore label, translated via chromophore_to_color().
        auxochromes: Text searched for the substrings 'Hydroxyl', 'Amine'
            and 'Methoxy'.
        descriptors: Mapping with 'NumDoubleBonds', 'NumRings' and
            'MolWeight' entries, or a falsy value when none are available.

    Returns:
        The color family for the chromophore (or a descriptor-based fallback
        when that is 'Unknown' and descriptors are available), followed by
        one shift note for each recognised auxochrome.
    """
    color = chromophore_to_color(chromophore)

    # Descriptor heuristics apply only when the chromophore gave no answer.
    if color == 'Unknown' and descriptors:
        # First rule that holds wins; each descriptor is read only when its
        # rule is reached.
        fallback_rules = (
            ('NumDoubleBonds', lambda value: value > 5, 'Red/Orange'),
            ('NumDoubleBonds', lambda value: value > 3, 'Yellow/Orange'),
            ('NumRings', lambda value: value >= 4, 'Blue/Violet'),
            ('NumRings', lambda value: value == 3, 'Green/Blue'),
            ('MolWeight', lambda value: value > 600, 'Deep Color (likely Red or Purple)'),
            ('MolWeight', lambda value: value > 400, 'Moderate Color (likely Blue or Green)'),
        )
        for key, holds, label in fallback_rules:
            if holds(descriptors[key]):
                color = label
                break
        else:
            color = 'Lighter Color (likely Yellow)'

    # Append one note per auxochrome found, in this fixed order.
    shift_notes = (
        ('Hydroxyl', ' (Shifted towards Red)'),
        ('Amine', ' (Shifted towards Blue/Violet)'),
        ('Methoxy', ' (Shifted towards Yellow)'),
    )
    for marker, note in shift_notes:
        if marker in auxochromes:
            color += note

    return color

# Annotate every row from its SMILES string: one new column per analysis step.
smiles_column = dye_data['SMILES']
for column_name, annotate in (
    ('Chromophore', identify_chromophores),
    ('Auxochrome', identify_auxochromes),
    ('Descriptors', calculate_descriptors),
):
    dye_data[column_name] = smiles_column.apply(annotate)


def _row_color(row):
    """Estimate the color for one row from its three annotation columns."""
    return estimate_color(row['Chromophore'], row['Auxochrome'], row['Descriptors'])


# Row-wise pass combining the three annotations into the final estimate.
dye_data['Estimated Color'] = dye_data.apply(_row_color, axis=1)

# Write the annotated table to CSV without the index column.
output_file = 'output_dye_colors_nuanced_v9.csv'
dye_data.to_csv(output_file, index=False)

# Hand the result file to the Colab download helper.
files.download(output_file)