In [1]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
rdkit is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

from sklearn.ensemble import HistGradientBoostingRegressor,ExtraTreesRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.metrics import mean_absolute_error


import networkx as nx
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

import xgboost as xgb
import torch
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

In [3]:
class CFG:
    TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
    SEED = 42
    FOLDS = 5

    # Optimization settings
    N_TRIALS = 100  # Optuna trials per target
    EARLY_STOPPING = 100
    MAX_ITERATIONS = 5000
    
    # Model settings
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
useless_cols = [   
    
    'MaxPartialCharge', 
    # Nan data
    'BCUT2D_MWHI',
    'BCUT2D_MWLOW',
    'BCUT2D_CHGHI',
    'BCUT2D_CHGLO',
    'BCUT2D_LOGPHI',
    'BCUT2D_LOGPLOW',
    'BCUT2D_MRHI',
    'BCUT2D_MRLOW',

    # Constant data
    'NumRadicalElectrons',
    'SMR_VSA8',
    'SlogP_VSA9',
    'fr_barbitur',
    'fr_benzodiazepine',
    'fr_dihydropyridine',
    'fr_epoxide',
    'fr_isothiocyan',
    'fr_lactam',
    'fr_nitroso',
    'fr_prisulfonamd',
    'fr_thiocyan',

    # High correlated data >0.95
    'MaxEStateIndex',
    'HeavyAtomMolWt',
    'ExactMolWt',
    'NumValenceElectrons',
    'Chi0',
    'Chi0n',
    'Chi0v',
    'Chi1',
    'Chi1n',
    'Chi1v',
    'Chi2n',
    'Kappa1',
    'LabuteASA',
    'HeavyAtomCount',
    'MolMR',
    'Chi3n',
    'BertzCT',
    'Chi2v',
    'Chi4n',
    'HallKierAlpha',
    'Chi3v',
    'Chi4v',
    'MinAbsPartialCharge',
    'MinPartialCharge',
    'MaxAbsPartialCharge',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'Phi',
    'Kappa3',
    'fr_nitrile',
    'SlogP_VSA6',
    'NumAromaticCarbocycles',
    'NumAromaticRings',
    'fr_benzene',
    'VSA_EState6',
    'NOCount',
    'fr_C_O',
    'fr_C_O_noCOO',
    'NumHDonors',
    'fr_amide',
    'fr_Nhpyrrole',
    'fr_phenol',
    'fr_phenol_noOrthoHbond',
    'fr_COO2',
    'fr_halogen',
    'fr_diazo',
    'fr_nitro_arom',
    'fr_phos_ester'
]

In [5]:
MINMAX_DICT = {
    'Tg': [-148.0297376, 472.25],
    'FFV': [0.2269924, 0.77709707], 
    'Tc': [0.0465, 0.524],
    'Density': [0.748691234, 1.840998909],
    'Rg': [9.7283551, 34.672905605],
}

### Read Main Files

In [6]:
train=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
ss=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv')
ID=test['id'].copy()

### Read Extra Files

In [7]:
tc_smiles = pd.read_csv('/kaggle/input/tc-smiles/Tc_SMILES.csv')
tgss_smiles = pd.read_csv('/kaggle/input/tg-smiles-pid-polymer-class/TgSS_enriched_cleaned.csv')
tg_smiles =pd.read_csv('/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv')
ktg_smiles =pd.read_excel('/kaggle/input/smiles-extra-data/data_tg3.xlsx')
de_smiles =pd.read_excel('/kaggle/input/smiles-extra-data/data_dnst1.xlsx')

### Preprocessing

In [8]:
def clean_and_validate_smiles(smiles):
    if not isinstance(smiles, str) or len(smiles) == 0:
        return None

    bad_patterns = [
        '[R]', '[R1]', '[R2]', '[R3]', '[R4]', '[R5]', 
        "[R']", '[R"]', 'R1', 'R2', 'R3', 'R4', 'R5',
        # Additional patterns that cause issues
        '([R])', '([R1])', '([R2])', 
    ]

    for pattern in bad_patterns:
        if pattern in smiles:
            return None

    if '][' in smiles and any(x in smiles for x in ['[R', 'R]']):
        return None

    mol = Chem.MolFromSmiles(smiles)
    
    if mol is not None:
        return Chem.MolToSmiles(mol, canonical=True)
    else:
        return None

In [9]:
print(f"Train samples before cleaning {len(train['SMILES'].notnull())}")
print(f"Test samples before cleaning {len(test['SMILES'].notnull())}")
train['SMILES'] = train['SMILES'].progress_apply(lambda s: clean_and_validate_smiles(s))
test['SMILES'] = test['SMILES'].progress_apply(lambda s: clean_and_validate_smiles(s))
print(f"Train samples after cleaning {len(train['SMILES'].notnull())}")
print(f"Test samples after cleaning {len(test['SMILES'].notnull())}")

Train samples before cleaning 7973
Test samples before cleaning 3


100%|██████████| 7973/7973 [00:03<00:00, 2292.02it/s]
100%|██████████| 3/3 [00:00<00:00, 1362.67it/s]

Train samples after cleaning 7973
Test samples after cleaning 3





In [10]:
# we don't need to make changes to the tgss df

ktg_smiles.rename(columns={'Tg [K]': 'Tg'}, inplace=True)
tg_smiles.rename(columns={'Tg (C)': 'Tg'}, inplace=True)
tc_smiles.rename(columns={'TC_mean': 'Tc'}, inplace=True)
de_smiles.rename(columns={'density(g/cm3)': 'Density'}, inplace=True)

In [11]:
print(f"KTG samples before cleaning {len(ktg_smiles['SMILES'].notnull())}")
ktg_smiles['SMILES'] = ktg_smiles['SMILES'].progress_apply(lambda s: clean_and_validate_smiles(s))
print(f"KTG samples after cleaning {len(ktg_smiles['SMILES'].notnull())}")
ktg_smiles['Tg'] = ktg_smiles['Tg'] - 273.15

KTG samples before cleaning 501


100%|██████████| 501/501 [00:00<00:00, 4457.82it/s]

KTG samples after cleaning 501





In [12]:
print(f"TG samples before cleaning {len(tg_smiles['SMILES'].notnull())}")
tg_smiles['SMILES'] = tg_smiles['SMILES'].progress_apply(lambda s: clean_and_validate_smiles(s))
print(f"TG samples after cleaning {len(tg_smiles['SMILES'].notnull())}")

TG samples before cleaning 662


100%|██████████| 662/662 [00:00<00:00, 2595.90it/s]

TG samples after cleaning 662





In [13]:
print(f"TC samples before cleaning {len(tc_smiles['SMILES'].notnull())}")
tc_smiles['SMILES'] = tc_smiles['SMILES'].progress_apply(lambda s: clean_and_validate_smiles(s))
print(f"TC samples after cleaning {len(tc_smiles['SMILES'].notnull())}")

TC samples before cleaning 874


100%|██████████| 874/874 [00:00<00:00, 4157.21it/s]

TC samples after cleaning 874





In [14]:
print(f"DE samples before cleaning {len(de_smiles['SMILES'].notnull())}")
de_smiles['SMILES'] = de_smiles['SMILES'].progress_apply(lambda s: clean_and_validate_smiles(s))
print(f"DE samples after cleaning {len(de_smiles['SMILES'].notnull())}")

de_smiles = de_smiles[(de_smiles['SMILES'].notnull())&(de_smiles['Density'].notnull())&(de_smiles['Density'] != 'nylon')]
de_smiles['Density'] = de_smiles['Density'].astype('float64')
de_smiles['Density'] -= 0.118

DE samples before cleaning 787


100%|██████████| 787/787 [00:00<00:00, 7381.40it/s]

DE samples after cleaning 787





In [15]:
print(f"TGSS samples before cleaning {len(tgss_smiles['SMILES'].notnull())}")
tgss_smiles['SMILES'] = tgss_smiles['SMILES'].progress_apply(lambda s: clean_and_validate_smiles(s))
print(f"TGSS samples after cleaning {len(tgss_smiles['SMILES'].notnull())}")

TGSS samples before cleaning 7284


100%|██████████| 7284/7284 [00:03<00:00, 2179.37it/s]

TGSS samples after cleaning 7284





In [16]:
def preprocessing(df):
    desc_names = [desc[0] for desc in Descriptors.descList if desc[0] not in useless_cols]
    descriptors = [compute_all_descriptors(smi) for smi in df['SMILES'].to_list()]

    graph_feats = {'graph_diameter': [], 'avg_shortest_path': [], 'num_cycles': []}
    morgan_feats = {f"morgan_{i}" : [] for i in range(1024)}
    
    for smile in df['SMILES']:
        compute_graph_features(smile, graph_feats)
        fp_bits = compute_morgan_fingerprint(smile)
        for i, bit in enumerate(fp_bits):
            morgan_feats[f"morgan_{i}"].append(int(bit))

    result = pd.concat(
        [
            pd.DataFrame(descriptors, columns=desc_names),
            pd.DataFrame(graph_feats),
            pd.DataFrame(morgan_feats)
        ],
        axis=1
    )

    result = result.replace([-np.inf, np.inf], np.nan)
    return result

In [17]:
def add_extra_data(df_train, df_extra, target):
    n_samples_before = len(df_train[df_train[target].notnull()])
    
    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()
    cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])
    unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])

    # Make priority target value from competition's df
    for smile in df_train[df_train[target].notnull()]['SMILES'].tolist():
        if smile in cross_smiles:
            cross_smiles.remove(smile)

    # Imput missing values for competition's SMILES
    for smile in cross_smiles:
        df_train.loc[df_train['SMILES']==smile, target] = df_extra[df_extra['SMILES']==smile][target].values[0]
    
    df_train = pd.concat([df_train, df_extra[df_extra['SMILES'].isin(unique_smiles_extra)]], axis=0).reset_index(drop=True)

    n_samples_after = len(df_train[df_train[target].notnull()])
    print(f'\nFor target "{target}" added {n_samples_after-n_samples_before} new samples!')
    print(f'New unique SMILES: {len(unique_smiles_extra)}')
    return df_train

train = add_extra_data(train, tc_smiles, 'Tc')
train = add_extra_data(train, tg_smiles, 'Tg')
train = add_extra_data(train, ktg_smiles, 'Tg')
train = add_extra_data(train, tgss_smiles, 'Tg')
train = add_extra_data(train, de_smiles, 'Density')


For target "Tc" added 129 new samples!
New unique SMILES: 129

For target "Tg" added 151 new samples!
New unique SMILES: 136

For target "Tg" added 499 new samples!
New unique SMILES: 499

For target "Tg" added 7083 new samples!
New unique SMILES: 1845

For target "Density" added 634 new samples!
New unique SMILES: 473


In [18]:
def compute_all_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(desc_names)
    return [desc[1](mol) for desc in Descriptors.descList if desc[0] not in useless_cols]

def compute_graph_features(smiles, graph_feats):
    mol = Chem.MolFromSmiles(smiles)
    adj = rdmolops.GetAdjacencyMatrix(mol)
    G = nx.from_numpy_array(adj)

    graph_feats['graph_diameter'].append(nx.diameter(G) if nx.is_connected(G) else 0)
    graph_feats['avg_shortest_path'].append(nx.average_shortest_path_length(G) if nx.is_connected(G) else 0)
    graph_feats['num_cycles'].append(len(list(nx.cycle_basis(G))))

def compute_morgan_fingerprint(smiles, radius=2, n_bits=1024):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {f'morgan_{i}': 0 for i in range(n_bits)}

    generator = GetMorganGenerator(radius=radius, fpSize=n_bits)
    fp = generator.GetFingerprint(mol)
    # Convert to bit string
    fp_bits = fp.ToBitString()
    
    return fp_bits

def element_features(df):
    basic_features = []
    for smile in df['SMILES']:
        smiles_str = str(smile)
        basic_features.append({
                'smiles_length': len(smiles_str),
                'carbon_count': smiles_str.count('C'),
                'nitrogen_count': smiles_str.count('N'),
                'oxygen_count': smiles_str.count('O'),
                'sulfur_count': smiles_str.count('S'),
                'phosphorus_count': smiles_str.count('P'),
                'fluorine_count': smiles_str.count('F'),
                'chlorine_count': smiles_str.count('Cl'),
                'bromine_count': smiles_str.count('Br'),
                'iodine_count': smiles_str.count('I'),
                'double_bonds': smiles_str.count('='),
                'triple_bonds': smiles_str.count('#'),
                'rings': smiles_str.count('('),
                'aromatic_c': smiles_str.count('c'),
                'aromatic_n': smiles_str.count('n'),
                'aromatic_o': smiles_str.count('o'),
                'branches': smiles_str.count('['),
                'polymer_stars': smiles_str.count('*')
            })
        return pd.DataFrame(basic_features)

In [19]:
train = pd.concat([train, preprocessing(train), element_features(train)], axis=1)
test = pd.concat([test, preprocessing(test), element_features(train)], axis=1)

print(train.shape)
train['Ipc']=np.log10(train['Ipc'])  
for n in train.columns[7:]:
    train[n]=train[n].replace(-np.inf,np.nan)
    train[n]=train[n].replace(np.inf,np.nan)    
    train[n].fillna(train[n].mean())
  
test['Ipc']=np.log10(test['Ipc'])
for n in test.columns[7:]:
    test[n]=test[n].replace(-np.inf,np.nan)
    test[n]=test[n].replace(np.inf,np.nan)      
    test[n].fillna(train[n].mean())

(11055, 1200)


In [20]:
all_features = train.columns.tolist()[7:]
features_by_target = {}

for target in CFG.TARGETS:
    target_data = train[train[target].notnull()]
    if len(target_data) == 0:
        print(f"No data for {target}, skipping")
        features_by_target[target] = []
        continue

    good_features = []
    for col in all_features:
        if col in target_data.columns:
            values = target_data[col]

            if values.nunique() <= 1: continue
            if (values == 0).mean() > 0.98: continue
            if values.isnull().mean() > 0.5: continue

            good_features.append(col)
    features_by_target[target] = good_features

In [21]:
#len(features_by_target['Tg'])

In [22]:
def objective(trial, X, y, groups, feature_names):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'seed': CFG.SEED,
        'verbosity': 0,
        'tree_method': 'hist',
        'missing': 0.0,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),  # Reduced max depth
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
    }

    if torch.cuda.is_available():
        params['tree_method'] = 'gpu_hist'
        params['gpu_id'] = 0

    group_kfold = GroupKFold(n_splits=CFG.FOLDS)
    cv_scores = []

    for train_idx, valid_idx in group_kfold.split(X, y, groups=groups):
        X_train, X_valid = X[train_idx], X[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        # Additional cleaning for each fold
        X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
        X_valid = np.nan_to_num(X_valid, nan=0.0, posinf=0.0, neginf=0.0)
        
        dtrain = xgb.DMatrix(X_train, label=y_train, missing=0.0)
        dvalid = xgb.DMatrix(X_valid, label=y_valid, missing=0.0)
        
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=CFG.MAX_ITERATIONS,
            evals=[(dtrain, 'train'), (dvalid, 'valid')],
            early_stopping_rounds=CFG.EARLY_STOPPING,
            verbose_eval=False
        )

        cv_scores.append(model.best_score)
        
    return np.mean(cv_scores)

In [25]:
def train_optimized_model(target):
    print(f"training model for {target}")
    target_data = train[train[target].notnull()].reset_index(drop=True)
    if len(target_data) < 50:
        print(f"Not enough data for {target}")
        return None, None

    target_features = features_by_target[target]
    if len(target_features) == 0:
        print(f"No features available for {target}")
        return None, None

    X = target_data[target_features].values
    y = target_data[target].values
    groups = target_data['SMILES'].factorize()[0]

    print(f"Initial data: {len(target_data)} samples, {X.shape[1]} features")

    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=CFG.SEED),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5)  # More aggressive pruning
    )

    study.optimize(
            lambda trial: objective(trial, X, y, groups, target_features),
            n_trials=CFG.N_TRIALS,
            show_progress_bar=True,
            timeout=1800  # 30 minute timeout per target
        )

    best_params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'mae', 
            'seed': CFG.SEED,
            'verbosity': 0,
            'tree_method': 'gpu_hist' if torch.cuda.is_available() else 'hist',
            'missing': 0.0
        }
    best_params.update(study.best_params)
    
    print(f"Best CV MAE: {study.best_value:.5f}")

    X_final = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    dtrain = xgb.DMatrix(X_final, label=y, missing=0.0)
        
    final_model = xgb.train(
        best_params,
        dtrain,
        num_boost_round=CFG.MAX_ITERATIONS,
        verbose_eval=False
    )

    train_pred = final_model.predict(dtrain)
    train_mae = mean_absolute_error(y, train_pred)
    
    print(f"{target} complete - CV: {study.best_value:.5f}, Train: {train_mae:.5f}")
    
    return final_model, {
        'cv_mae': study.best_value,
        'train_mae': train_mae,
        'best_params': best_params,
        'n_samples': len(target_data),
        'features': target_features  # Store the cleaned feature names
        }

In [26]:
print("Training optimized models")
models = {}
results = {}

for target in CFG.TARGETS:
    model, result = train_optimized_model(target)
    if model is not None:
        models[target] = model
        results[target] = result

print(f"\nTraining complete!")
print(f"Successfully trained models: {list(models.keys())}")

print(f"\nResults Summary:")
for target, result in results.items():
    print(f"{target}: CV={result['cv_mae']:.5f}, Train={result['train_mae']:.5f}, Samples={result['n_samples']:,}")

Training optimized models
training model for Tg
Initial data: 8244 samples, 461 features


  0%|          | 0/100 [00:00<?, ?it/s]

Best CV MAE: 24.51912
Tg complete - CV: 24.51912, Train: 1.60556
training model for FFV
Initial data: 7030 samples, 464 features


  0%|          | 0/100 [00:00<?, ?it/s]

Best CV MAE: 0.00573
FFV complete - CV: 0.00573, Train: 0.00143
training model for Tc
Initial data: 866 samples, 323 features


  0%|          | 0/100 [00:00<?, ?it/s]

Best CV MAE: 0.03090
Tc complete - CV: 0.03090, Train: 0.00221
training model for Density
Initial data: 1247 samples, 326 features


  0%|          | 0/100 [00:00<?, ?it/s]

Best CV MAE: 0.03597
Density complete - CV: 0.03597, Train: 0.00258
training model for Rg
Initial data: 614 samples, 320 features


  0%|          | 0/100 [00:00<?, ?it/s]

Best CV MAE: 1.60860
Rg complete - CV: 1.60860, Train: 0.00097

Training complete!
Successfully trained models: ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

Results Summary:
Tg: CV=24.51912, Train=1.60556, Samples=8,244
FFV: CV=0.00573, Train=0.00143, Samples=7,030
Tc: CV=0.03090, Train=0.00221, Samples=866
Density: CV=0.03597, Train=0.00258, Samples=1,247
Rg: CV=1.60860, Train=0.00097, Samples=614


In [33]:
def get_predictions():

    test_predictions = pd.DataFrame({'id': test['id']})
    for target in CFG.TARGETS:
        test_predictions[target] = 0.0

    for target in CFG.TARGETS:
        if target in models and target in results:
            model_features = results[target]['features']
            X_test = test[model_features].values
            dtest = xgb.DMatrix(X_test, missing=0.0)
            preds = models[target].predict(dtest)
            print(f"Predictions generated for {target}")
            test_predictions[target] = preds

    return test_predictions

In [34]:
preds_df = get_predictions()
preds_df

Predictions generated for Tg
Predictions generated for FFV
Predictions generated for Tc
Predictions generated for Density
Predictions generated for Rg


Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,162.38446,0.377759,0.198537,1.112411,22.824816
1,1422188626,152.104111,0.378421,0.219827,1.098118,19.953814
2,2032016830,142.973251,0.351451,0.270223,1.116518,21.739464


In [35]:
# preds_df.to_csv('submission.csv', index=False)