<a href="https://colab.research.google.com/github/hsandaver/essays/blob/main/DyeClassifierv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install RDKit and pandas (use rdkit-pypi for pip installation)
!pip install rdkit-pypi pandas

# Import necessary libraries
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from google.colab import files

# Function to install RDKit dependencies if necessary
def install_rdkit():
    """Install RDKit with pip when it cannot be found in this environment.

    Availability is probed with ``importlib.util.find_spec`` so that nothing
    is imported as a side effect. The install is launched through
    ``sys.executable -m pip`` so the package is installed for the interpreter
    that is running this code, and so the function is plain Python rather
    than an IPython-only ``!`` shell escape.

    The installation is best-effort: a failing pip run is reported with a
    printed warning instead of an exception.

    Returns:
        None
    """
    import importlib.util
    import subprocess
    import sys

    if importlib.util.find_spec('rdkit') is not None:
        return

    # "rdkit" is the current PyPI distribution name ("rdkit-pypi" is legacy).
    result = subprocess.run([sys.executable, '-m', 'pip', 'install', 'rdkit'])
    if result.returncode != 0:
        print("Warning: 'pip install rdkit' failed; RDKit is still unavailable.")
        return

    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()


# Initialize RDKit
# NOTE(review): the module-level ``from rdkit import ...`` statements earlier
# in this file execute before this call, so this fallback can only help if it
# is moved ahead of them - confirm the intended ordering.
install_rdkit()

# Allow user to upload file in Colab
print("Please upload a CSV file containing a 'SMILES' column.")
# presumably a mapping keyed by uploaded file name - it is only tested for
# emptiness and iterated for its first key below
uploaded = files.upload()

# Check if any file was uploaded
if not uploaded:
    raise ValueError("No file uploaded. Please upload a CSV file with a 'SMILES' column.")

# Load the uploaded file into a pandas DataFrame.
# Only the first uploaded entry is used; any additional uploads are ignored.
filename = next(iter(uploaded))
print(f"User uploaded file '{filename}'")
try:
    # assumes the upload is also available on disk under this name - TODO confirm
    dye_data = pd.read_csv(filename)
except Exception as e:
    # Chain explicitly so the parser's own error is reported as the direct
    # cause of the user-facing ValueError.
    raise ValueError(f"Error reading the CSV file: {e}") from e

# Check if 'SMILES' column exists
if 'SMILES' not in dye_data.columns:
    raise ValueError("The uploaded CSV does not contain a 'SMILES' column.")

# Define SMARTS patterns for chromophore detection.
#
# Notes on how the patterns are written:
# * Input molecules are parsed with Chem.MolFromSmiles, which perceives
#   aromaticity. In SMARTS an upper-case "C"/"N" only matches an ALIPHATIC
#   atom, so Kekule-style ring patterns never match aromatic rings. Ring
#   systems are therefore written with atomic numbers ("[#6]") and the
#   any-bond symbol ("~"), or with lower-case aromatic atoms.
# * Dictionary order matters to callers that take the first reported
#   chromophore as the primary one, so the original key order is kept.
# * All patterns are deliberately simplified structural heuristics.

# Atomic numbers treated as "metal centres": Mg, Al, the 3d/4d/5d transition
# metals and the lanthanides.
_METAL_ATOMIC_NUMBERS = (
    12, 13,            # Mg, Al
    *range(21, 31),    # Sc-Zn
    *range(39, 49),    # Y-Cd
    *range(57, 81),    # La-Lu, Hf-Hg
)
_METAL_SMARTS = '[' + ','.join(f'#{z}' for z in _METAL_ATOMIC_NUMBERS) + ']'

chromophore_patterns = {
    # Azo group
    'Azo': Chem.MolFromSmarts('N=N'),
    # 9,10-Anthraquinone core (two aromatic rings fused to a diketone ring)
    'Anthraquinone': Chem.MolFromSmarts('O=C1c2ccccc2C(=O)c2ccccc12'),
    # Nitro group in either the pentavalent or the charge-separated form
    'Nitro': Chem.MolFromSmarts('[$([NX3](=O)=O),$([NX3+](=O)[O-])]'),
    # para- or ortho-quinone ring; the ring C=C bonds may be aromatic (fused)
    'Quinone': Chem.MolFromSmarts(
        '[$([#6]1(=O)[#6]=,:[#6][#6](=O)[#6]=,:[#6]1),'
        '$([#6]1(=O)[#6](=O)[#6]=,:[#6][#6]=,:[#6]1)]'
    ),
    # Indigo / thioindigo cross-conjugated core: O=C-C(X)=C(X)-C=O, X = N or S
    'Indigoid': Chem.MolFromSmarts('O=[#6][#6]([#7,#16])=[#6]([#7,#16])[#6]=O'),
    # Cyanine (simplified): cationic N conjugated into a methine chain
    'Cyanine': Chem.MolFromSmarts('[#7;+]=,:[#6]-[#6]=[#6]'),
    # Xanthene skeleton (dibenzopyran), any bond orders
    'Xanthene': Chem.MolFromSmarts(
        '[#6]1~[#6]~[#6]~[#6]2~[#6](~[#6]~1)~[#6]~[#6]1~[#6]~[#6]~[#6]~[#6]~[#6]~1~[#8]~2'
    ),
    # Phenothiazine skeleton (N and S bridging two benzo rings), any bond orders
    'Thiazine': Chem.MolFromSmarts(
        '[#6]1~[#6]~[#6]~[#6]2~[#6](~[#6]~1)~[#7]~[#6]1~[#6]~[#6]~[#6]~[#6]~[#6]~1~[#16]~2'
    ),
    # Coumarin (2H-chromen-2-one) skeleton, any bond orders in the rings
    'Coumarin': Chem.MolFromSmarts(
        '[#8]=[#6]1~[#6]~[#6]~[#6]2~[#6]~[#6]~[#6]~[#6]~[#6]~2~[#8]~1'
    ),
    # Porphyrin macrocycle: four pyrrole-type rings joined by carbon bridges
    'Porphyrin': Chem.MolFromSmarts(
        '[#6]1~[#6]~[#6]2~[#6]~[#6]3~[#6]~[#6]~[#6](~[#7]~3)'
        '~[#6]~[#6]4~[#6]~[#6]~[#6](~[#7]~4)'
        '~[#6]~[#6]5~[#6]~[#6]~[#6](~[#7]~5)'
        '~[#6]~[#6]~1~[#7]~2'
    ),
    # Phthalocyanine-like: isoindole unit carrying an exocyclic N on C1 and C3
    'Phthalocyanine': Chem.MolFromSmarts(
        '[#7]~[#6]1~[#7]~[#6](~[#7])~[#6]2~[#6]~[#6]~[#6]~[#6]~[#6]~2~1'
    ),
    # Carotenoid (simplified): methyl-branched conjugated triene
    'Carotenoid': Chem.MolFromSmarts('C=C(C)C=CC=C'),
    # Squaraine: four-membered carbon ring with oxygens on opposite carbons
    'Squaraine': Chem.MolFromSmarts('[#8]~[#6]1~[#6]~[#6](~[#8])~[#6]~1'),
    # Metal complex: the molecule contains at least one metal atom
    'Metal Complex': Chem.MolFromSmarts(_METAL_SMARTS),
}

# New SMARTS patterns for heavy atoms and unusual ligands
additional_patterns = {
    'Bromine': Chem.MolFromSmarts('Br'),  # Bromine
    # Selenium is outside the SMARTS organic subset and must be bracketed
    'Selenium': Chem.MolFromSmarts('[#34]'),
    # Pyridine ligand (aromatic six-membered ring with one nitrogen)
    'Pyridine': Chem.MolFromSmarts('n1ccccc1'),
    # Phosphine ligand: P with three carbon substituents (alkyl or aryl)
    'Phosphine': Chem.MolFromSmarts('[#15]([#6])([#6])[#6]'),
    # Carbene: two-connected carbon with a total valence of two.
    # NOTE(review): carbenes drawn with an explicit bond to a metal have three
    # connections and are not matched - confirm the desired encoding.
    'Carbene': Chem.MolFromSmarts('[#6;X2;v2]'),
}

# Combine all patterns into a single dictionary
chromophore_patterns.update(additional_patterns)

# Define SMARTS patterns for auxochromes
auxochrome_patterns = {
    'Hydroxyl': Chem.MolFromSmarts('[OX2H]'),  # -OH
    # Neutral sp3-type amine nitrogen; excludes amides/sulfonamides, and the
    # X3/+0 constraints exclude azo and nitro nitrogens
    'Amine': Chem.MolFromSmarts('[NX3;+0;!$(N[C,S]=O)]'),
    # -OCH3 attached to any carbon (aromatic or aliphatic)
    'Methoxy': Chem.MolFromSmarts('[CH3][OX2][#6]'),
    'Thiol': Chem.MolFromSmarts('[SX2H]'),  # -SH
    'Carboxyl': Chem.MolFromSmarts('C(=O)[OX2H1]'),  # -COOH
}

# Check that every SMARTS pattern (chromophores and auxochromes) compiled;
# Chem.MolFromSmarts returns None for a pattern it cannot parse.
for _table in (chromophore_patterns, auxochrome_patterns):
    for name, pattern in _table.items():
        if pattern is None:
            print(f"Warning: Failed to compile SMARTS pattern for {name}")

# Function to identify chromophores
def identify_chromophores(smiles):
    """Return the names of all chromophore patterns found in a molecule.

    Args:
        smiles: SMILES string of the molecule. Anything that is not a
            non-blank string (for example the NaN pandas uses for an empty
            CSV cell) is treated as invalid instead of being passed to RDKit.

    Returns:
        A comma-separated string of matching keys from
        ``chromophore_patterns`` in dictionary order, ``'Unknown'`` when no
        pattern matches, or ``'Invalid SMILES'`` when the input cannot be
        parsed.
    """
    # Guard first: RDKit rejects non-string arguments with an exception, and
    # an empty string parses to an empty molecule rather than to None.
    if not isinstance(smiles, str) or not smiles.strip():
        return 'Invalid SMILES'

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 'Invalid SMILES'

    # Patterns that failed to compile are stored as None and skipped.
    matched_chromophores = [
        name
        for name, pattern in chromophore_patterns.items()
        if pattern is not None and mol.HasSubstructMatch(pattern)
    ]

    return ', '.join(matched_chromophores) if matched_chromophores else 'Unknown'

# Function to identify auxochromes
def identify_auxochromes(smiles):
    """Return the names of all auxochrome patterns found in a molecule.

    Args:
        smiles: SMILES string of the molecule. Anything that is not a
            non-blank string (for example the NaN pandas uses for an empty
            CSV cell) is treated as invalid instead of being passed to RDKit.

    Returns:
        A comma-separated string of matching keys from
        ``auxochrome_patterns`` in dictionary order, ``'None'`` when no
        pattern matches, or ``'Invalid SMILES'`` when the input cannot be
        parsed.
    """
    # Guard first: RDKit rejects non-string arguments with an exception, and
    # an empty string parses to an empty molecule rather than to None.
    if not isinstance(smiles, str) or not smiles.strip():
        return 'Invalid SMILES'

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 'Invalid SMILES'

    # Patterns that failed to compile are stored as None and skipped.
    matched_auxochromes = [
        name
        for name, pattern in auxochrome_patterns.items()
        if pattern is not None and mol.HasSubstructMatch(pattern)
    ]

    return ', '.join(matched_auxochromes) if matched_auxochromes else 'None'

# Function to calculate descriptors
def calculate_descriptors(smiles):
    """Compute a small set of molecular descriptors for a SMILES string.

    Args:
        smiles: SMILES string of the molecule. Anything that is not a
            non-blank string (for example the NaN pandas uses for an empty
            CSV cell) yields ``None`` instead of being passed to RDKit.

    Returns:
        A dict with the keys ``'MolWeight'``, ``'LogP'``, ``'TPSA'``,
        ``'NumRings'`` and ``'NumDoubleBonds'``, or ``None`` when the input
        cannot be parsed or a descriptor calculation fails (the failure is
        printed).
    """
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    try:
        mol_weight = Descriptors.MolWt(mol)  # Molecular Weight
        logP = Descriptors.MolLogP(mol)      # Hydrophobicity (LogP)
        tpsa = rdMolDescriptors.CalcTPSA(mol)         # Topological Polar Surface Area (TPSA)
        num_rings = rdMolDescriptors.CalcNumRings(mol)  # Number of rings
        # rdMolDescriptors has no double-bond counter, so count the bonds
        # directly. Bonds flagged as aromatic are NOT included in this count.
        num_double_bonds = sum(
            1
            for bond in mol.GetBonds()
            if bond.GetBondType() == Chem.BondType.DOUBLE
        )
    except Exception as e:
        # Best-effort: report the molecule and keep processing the others.
        print(f"Error calculating descriptors for SMILES '{smiles}': {e}")
        return None

    return {
        'MolWeight': mol_weight,
        'LogP': logP,
        'TPSA': tpsa,
        'NumRings': num_rings,
        'NumDoubleBonds': num_double_bonds
    }

# Function to map chromophores to color families
def chromophore_to_color(chromophore):
    """Look up the color family associated with a chromophore name.

    Args:
        chromophore: Name of a single chromophore (for example ``'Azo'``).

    Returns:
        The color-family description for that chromophore, or ``'Unknown'``
        when the name is not in the table.
    """
    palette = dict((
        ('Azo', 'Red/Orange/Yellow'),
        ('Anthraquinone', 'Red/Blue/Violet'),
        ('Nitro', 'Yellow/Orange'),
        ('Quinone', 'Yellow/Orange/Brown'),
        ('Indigoid', 'Blue/Purple'),
        ('Cyanine', 'Green/Blue'),
        ('Xanthene', 'Yellow/Orange'),
        ('Thiazine', 'Blue/Green'),
        ('Coumarin', 'Blue/Green'),
        ('Porphyrin', 'Red/Purple'),
        ('Phthalocyanine', 'Green/Blue'),
        ('Carotenoid', 'Yellow/Orange'),
        ('Squaraine', 'Red/Purple'),
        ('Bromine', 'Dark Green/Purple'),
        ('Selenium', 'Deep Blue/Purple'),
        ('Pyridine', 'Varies (often Green/Blue/Yellow)'),
        ('Phosphine', 'Varies (Yellow/Green)'),
        ('Carbene', 'Varies (Red/Purple)'),
        ('Metal Complex', 'Varies (often Green/Blue)'),
    ))

    try:
        return palette[chromophore]
    except KeyError:
        # Names outside the table have no associated color family.
        return 'Unknown'

# Helper: read one descriptor value, treating absent and NaN entries alike
def _descriptor_value(descriptors, key):
    """Return ``descriptors[key]`` or ``None`` if it is missing, None or NaN."""
    value = descriptors.get(key)
    # NaN is the only value that compares unequal to itself.
    if value is None or value != value:
        return None
    return value


# Helper: descriptor-based fallback used when no chromophore was recognised
def _infer_base_color(descriptors):
    """Infer a base color from descriptors, or ``'Unknown'`` if data is missing.

    The rules are evaluated in priority order (double bonds, then rings, then
    molecular weight). A rule whose descriptor is unavailable cannot be
    decided, so the result is ``'Unknown'`` rather than a guess.
    """
    double_bonds = _descriptor_value(descriptors, 'NumDoubleBonds')
    if double_bonds is None:
        return 'Unknown'
    if double_bonds > 5:
        return 'Red/Orange'
    if double_bonds > 3:
        return 'Yellow/Orange'

    rings = _descriptor_value(descriptors, 'NumRings')
    if rings is None:
        return 'Unknown'
    if rings >= 4:
        return 'Blue/Violet'
    if rings == 3:
        return 'Green/Blue'

    weight = _descriptor_value(descriptors, 'MolWeight')
    if weight is None:
        return 'Unknown'
    if weight > 600:
        return 'Deep Color (likely Red or Purple)'
    if weight > 400:
        return 'Moderate Color (likely Blue or Green)'
    return 'Lighter Color (likely Yellow)'


# Function to estimate color based on chromophores, auxochromes, and descriptors
def estimate_color(chromophores, auxochromes, descriptors):
    """Estimate a color description for one molecule.

    Args:
        chromophores: Comma-separated chromophore names, ``'Unknown'`` or
            ``'Invalid SMILES'``.
        auxochromes: Comma-separated auxochrome names, ``'None'`` or
            ``'Invalid SMILES'``. Non-string values are ignored.
        descriptors: Mapping that may contain ``'NumDoubleBonds'``,
            ``'NumRings'`` and ``'MolWeight'``; extra keys are ignored.
            Missing or NaN values are treated as unavailable.

    Returns:
        ``'Invalid SMILES'`` for invalid molecules; otherwise a base color
        followed by one parenthesised note per recognised auxochrome.
    """
    if chromophores == 'Invalid SMILES':
        return 'Invalid SMILES'

    if chromophores == 'Unknown' and descriptors:
        # More nuanced inference rules
        base_color = _infer_base_color(descriptors)
    else:
        # If multiple chromophores, take the first for base color
        first_chromophore = chromophores.split(', ')[0]
        base_color = chromophore_to_color(first_chromophore)

    # Modify the color based on auxochromes
    if isinstance(auxochromes, str) and auxochromes != 'Invalid SMILES':
        auxo_list = [auxo.strip() for auxo in auxochromes.split(',')]
        # Notes are appended in this fixed order, independent of input order.
        auxochrome_notes = (
            ('Hydroxyl', ' (Shifted towards Red)'),
            ('Amine', ' (Shifted towards Blue/Violet)'),
            ('Methoxy', ' (Shifted towards Yellow)'),
            ('Thiol', ' (Potential for Increased Color Intensity)'),
            ('Carboxyl', ' (Potential for Increased Solubility)'),
        )
        for auxochrome, note in auxochrome_notes:
            if auxochrome in auxo_list:
                base_color += note

    return base_color

# Apply functions to the dataset
print("Processing data...")

# Identify chromophores
dye_data['Chromophore'] = dye_data['SMILES'].apply(identify_chromophores)

# Identify auxochromes
dye_data['Auxochrome'] = dye_data['SMILES'].apply(identify_auxochromes)

# Calculate descriptors (one dict per row, or None when calculation failed)
descriptor_records = dye_data['SMILES'].apply(calculate_descriptors)

# Expand descriptors into separate columns.
# The column list is fixed so every descriptor column exists even when the
# calculation failed for every row; failed rows are filled with NaN. Values
# are stored as floats, matching the per-row Series expansion used before.
descriptor_columns = ['MolWeight', 'LogP', 'TPSA', 'NumRings', 'NumDoubleBonds']
descriptor_df = pd.DataFrame(
    [record if isinstance(record, dict) else {} for record in descriptor_records],
    index=dye_data.index,
    columns=descriptor_columns,
).astype(float)
dye_data = pd.concat([dye_data, descriptor_df], axis=1)

# Estimate color based on chromophores, auxochromes, and descriptors.
# The whole row is passed as the descriptor mapping; estimate_color only
# reads the descriptor keys it needs.
dye_data['Estimated Color'] = dye_data.apply(
    lambda row: estimate_color(row['Chromophore'], row['Auxochrome'], row.to_dict()), axis=1
)

# Save the dataset to a new CSV file
output_file = 'output_dye_colors_enhanced.csv'
dye_data.to_csv(output_file, index=False)
print(f"Processing complete. Output saved to '{output_file}'.")

# Optionally download the result file
files.download(output_file)