In [1]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3


In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

from sklearn.ensemble import HistGradientBoostingRegressor,ExtraTreesRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


import networkx as nx
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops
from rdkit import Chem

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

In [3]:
class CFG:
    TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
    SEED = 42
    FOLDS = 5

### Read Main Files

In [4]:
train=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
ss=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv')
ID=test['id'].copy()

### Read Extra Files

In [5]:
tc_smiles =pd.read_csv('/kaggle/input/tc-smiles/Tc_SMILES.csv')
tg_smiles =pd.read_csv('/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv')
ktg_smiles =pd.read_excel('/kaggle/input/smiles-extra-data/data_tg3.xlsx')
de_smiles =pd.read_excel('/kaggle/input/smiles-extra-data/data_dnst1.xlsx')

In [6]:
train.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.35547,,,


In [7]:
tc_smiles.head()

Unnamed: 0,TC_mean,SMILES
0,0.2445,*CC(*)C
1,0.225333,*CC(*)CC
2,0.246333,*CC(*)CCC
3,0.1868,*CC(*)C(C)C
4,0.230667,*CC(*)CCCC


In [8]:
tg_smiles.head()

Unnamed: 0.1,Unnamed: 0,SMILES,BigSMILES,Tg (C)
0,0,*C1COC2C1OCC2Oc1ccc(cc1)CNC(=O)CCCCCCC(=O)NCc1...,{<Oc1ccc(cc1)CNC(=O)CCCCCCC(=O)NCc2ccc(cc2)OC3...,21.581731
1,1,*OC(CCC(OC(=O)Nc1ccc(cc1)Cc1ccc(cc1)NC(=O)*)C)C,{<OC(C)CCC(C)OC(=O)Nc1ccc(cc1)Cc2ccc(cc2)NC(=O)>},63.589338
2,2,*OC(=O)c1ccc(cc1)C(=O)OCCCC(=O)NCc1ccc(cc1)CNC...,{<CCCC(=O)NCc1ccc(cc1)CNC(=O)CCCOC(=O)c2ccc(cc...,53.557261
3,3,*OC(=O)NCCNC(=O)OCC*,{<CCOC(=O)NCCNC(=O)O>},5.896093
4,4,*SCCCCC*,{<CCCCCS>},-55.37861


In [9]:
ktg_smiles.head()

Unnamed: 0,SMILES,Tg [K]
0,CC1(C)OC[C@H](CO)O1,282.0
1,COC(=O)[C@H](C)O,286.0
2,FC(F)C(F)C(F)(F)F,95.0
3,FCC(F)(F)F,71.0
4,CC(F)(Cl)Cl,41.0


In [10]:
de_smiles.head()

Unnamed: 0,SMILES,uSMILES,std_name,density(g/cm3),density_std_err(g/cm3),abbreviations,synonyms,tradenames
0,NC(=O)OC(=C1)SC(=C1),NC(=O)OC1=CC=CS1,Polyurethane,1.58,0.24,,,
1,*OCCOC(=O)Nc2ccc(Cc1ccc(NC(*)=O)cc1)cc2,O(CCOC(=O)NC1=CC=C(CC2=CC=C(NC(=O)[*])C=C2)C=C...,Polyurethane,1.37,0.12,PU;PUR,"Poly[(diethylene glycol)-alt-(1,6-hexamethylen...",
2,NC(=O)OC(=S),NC(=O)OC=S,Polyurethane,1.65,0.47,,,
3,NC(=O)NC1=CC=C(S1),NC(=O)NC1=CC=CS1,Polyurea,1.76,0.3,,,
4,NC(=O),NC=O,Polyurea,1.47,0.31,,,


In [11]:
def make_smile_canonical(smile): # To avoid duplicates, for example: canonical '*C=C(*)C' == '*C(=C*)C'
    try:
        mol = Chem.MolFromSmiles(smile)
        canon_smile = Chem.MolToSmiles(mol, canonical=True)
        return canon_smile
    except:
        return np.nan

train['SMILES'] = train['SMILES'].progress_apply(lambda s: make_smile_canonical(s))
test['SMILES'] = test['SMILES'].progress_apply(lambda s: make_smile_canonical(s))

100%|██████████| 7973/7973 [00:05<00:00, 1556.71it/s]
100%|██████████| 3/3 [00:00<00:00, 922.10it/s]


In [12]:
ktg_smiles.rename(columns={'Tg [K]': 'Tg'}, inplace=True)
tg_smiles.rename(columns={'Tg (C)': 'Tg'}, inplace=True)
tc_smiles.rename(columns={'TC_mean': 'Tc'}, inplace=True)
de_smiles.rename(columns={'density(g/cm3)': 'Density'}, inplace=True)

In [13]:
de_smiles['SMILES'] = de_smiles['SMILES'].progress_apply(lambda s: make_smile_canonical(s))
de_smiles = de_smiles[(de_smiles['SMILES'].notnull())&(de_smiles['Density'].notnull())&(de_smiles['Density'] != 'nylon')]
de_smiles['Density'] = de_smiles['Density'].astype('float64')
de_smiles['Density'] -= 0.118

ktg_smiles['Tg'] = ktg_smiles['Tg'] - 273.15

 67%|██████▋   | 524/787 [00:00<00:00, 5233.58it/s][07:38:10] SMILES Parse Error: syntax error while parsing: *O[Si](*)([R])[R]
[07:38:10] SMILES Parse Error: check for mistakes around position 12:
[07:38:10] *O[Si](*)([R])[R]
[07:38:10] ~~~~~~~~~~~^
[07:38:10] SMILES Parse Error: Failed parsing SMILES '*O[Si](*)([R])[R]' for input: '*O[Si](*)([R])[R]'
[07:38:10] SMILES Parse Error: syntax error while parsing: *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4
[07:38:10] SMILES Parse Error: check for mistakes around position 28:
[07:38:10] c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=
[07:38:10] ~~~~~~~~~~~~~~~~~~~~^
[07:38:10] SMILES Parse Error: Failed parsing SMILES '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4' for input: '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4'
[07:38:10] SMILES Parse Error: syntax error while parsing: O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[07:38:10] SMILES Parse Error: check for mistakes around position 7:
[07:38:10] O=C=N[R1]N=C=O.O[R2]O.O[R3]O


In [14]:
useless_cols = [   
    
    'MaxPartialCharge', 
    # Nan data
    'BCUT2D_MWHI',
    'BCUT2D_MWLOW',
    'BCUT2D_CHGHI',
    'BCUT2D_CHGLO',
    'BCUT2D_LOGPHI',
    'BCUT2D_LOGPLOW',
    'BCUT2D_MRHI',
    'BCUT2D_MRLOW',

    # Constant data
    'NumRadicalElectrons',
    'SMR_VSA8',
    'SlogP_VSA9',
    'fr_barbitur',
    'fr_benzodiazepine',
    'fr_dihydropyridine',
    'fr_epoxide',
    'fr_isothiocyan',
    'fr_lactam',
    'fr_nitroso',
    'fr_prisulfonamd',
    'fr_thiocyan',

    # High correlated data >0.95
    'MaxEStateIndex',
    'HeavyAtomMolWt',
    'ExactMolWt',
    'NumValenceElectrons',
    'Chi0',
    'Chi0n',
    'Chi0v',
    'Chi1',
    'Chi1n',
    'Chi1v',
    'Chi2n',
    'Kappa1',
    'LabuteASA',
    'HeavyAtomCount',
    'MolMR',
    'Chi3n',
    'BertzCT',
    'Chi2v',
    'Chi4n',
    'HallKierAlpha',
    'Chi3v',
    'Chi4v',
    'MinAbsPartialCharge',
    'MinPartialCharge',
    'MaxAbsPartialCharge',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'Phi',
    'Kappa3',
    'fr_nitrile',
    'SlogP_VSA6',
    'NumAromaticCarbocycles',
    'NumAromaticRings',
    'fr_benzene',
    'VSA_EState6',
    'NOCount',
    'fr_C_O',
    'fr_C_O_noCOO',
    'NumHDonors',
    'fr_amide',
    'fr_Nhpyrrole',
    'fr_phenol',
    'fr_phenol_noOrthoHbond',
    'fr_COO2',
    'fr_halogen',
    'fr_diazo',
    'fr_nitro_arom',
    'fr_phos_ester'
]

In [15]:
def preprocessing(df):
    desc_names = [desc[0] for desc in Descriptors.descList if desc[0] not in useless_cols]
    descriptors = [compute_all_descriptors(smi) for smi in df['SMILES'].tolist()]

    graph_feats = {'graph_diameter': [], 'avg_shortest_path': [], 'num_cycles': []}

    for smile in df['SMILES']:
         compute_graph_features(smile, graph_feats)
        
    result = pd.concat(
        [
            pd.DataFrame(descriptors, columns=desc_names),
            pd.DataFrame(graph_feats)
        ],
        axis=1
    )

    result = result.replace([-np.inf, np.inf], np.nan)
    return result

### Adding features from extra data

In [16]:
def add_extra_data(df_train, df_extra, target):
    n_samples_before = len(df_train[df_train[target].notnull()])
    
    df_extra['SMILES'] = df_extra['SMILES'].apply(lambda s: make_smile_canonical(s))
    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()
    cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])
    unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])

    # Make priority target value from competition's df
    for smile in df_train[df_train[target].notnull()]['SMILES'].tolist():
        if smile in cross_smiles:
            cross_smiles.remove(smile)

    # Imput missing values for competition's SMILES
    for smile in cross_smiles:
        df_train.loc[df_train['SMILES']==smile, target] = df_extra[df_extra['SMILES']==smile][target].values[0]
    
    df_train = pd.concat([df_train, df_extra[df_extra['SMILES'].isin(unique_smiles_extra)]], axis=0).reset_index(drop=True)

    n_samples_after = len(df_train[df_train[target].notnull()])
    print(f'\nFor target "{target}" added {n_samples_after-n_samples_before} new samples!')
    print(f'New unique SMILES: {len(unique_smiles_extra)}')
    return df_train

train = add_extra_data(train, tc_smiles, 'Tc')
train = add_extra_data(train, tg_smiles, 'Tg')
train = add_extra_data(train, ktg_smiles, 'Tg')
train = add_extra_data(train, de_smiles, 'Density')


For target "Tc" added 129 new samples!
New unique SMILES: 129

For target "Tg" added 151 new samples!
New unique SMILES: 136

For target "Tg" added 499 new samples!
New unique SMILES: 499

For target "Density" added 634 new samples!
New unique SMILES: 524


In [17]:
def compute_all_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(desc_names)
    return [desc[1](mol) for desc in Descriptors.descList if desc[0] not in useless_cols]

def compute_graph_features(smiles, graph_feats):
    mol = Chem.MolFromSmiles(smiles)
    adj = rdmolops.GetAdjacencyMatrix(mol)
    G = nx.from_numpy_array(adj)

    graph_feats['graph_diameter'].append(nx.diameter(G) if nx.is_connected(G) else 0)
    graph_feats['avg_shortest_path'].append(nx.average_shortest_path_length(G) if nx.is_connected(G) else 0)
    graph_feats['num_cycles'].append(len(list(nx.cycle_basis(G))))

train = pd.concat([train, preprocessing(train)], axis=1)
test = pd.concat([test, preprocessing(test)], axis=1)

all_features = train.columns[7:].tolist()
features = {}
for target in CFG.TARGETS:
    const_descs = []
    for col in train.columns.drop(CFG.TARGETS):
        if train[train[target].notnull()][col].nunique() == 1:
            const_descs.append(col)
    features[target] = [f for f in all_features if f not in const_descs]

print(train.shape)
train['Ipc']=np.log10(train['Ipc'])  
for n in train.columns[7:]:
    train[n]=train[n].replace(-np.inf,np.nan)
    train[n]=train[n].replace(np.inf,np.nan)    
    train[n].fillna(train[n].mean())
  
print(train.shape)
test['Ipc']=np.log10(test['Ipc'])
for n in test.columns[7:]:
    train[n]=train[n].replace(-np.inf,np.nan)
    train[n]=train[n].replace(np.inf,np.nan)      
    test[n].fillna(train[n].mean())

(9261, 158)
(9261, 158)


In [18]:
# We'll separate train to be one model for each target variable.
t_1=train[['SMILES','Tg']].copy()
t_2=train[['SMILES','FFV']].copy()
t_3=train[['SMILES','Tc']].copy()
t_4=train[['SMILES','Density']].copy()
t_5=train[['SMILES','Rg']].copy()

# We will drop the rows with missing values related to that target after separation.
#This is important , dropping them beforehand would result Null for all data.
t_1.dropna(inplace=True)
t_2.dropna(inplace=True)
t_3.dropna(inplace=True)
t_4.dropna(inplace=True)
t_5.dropna(inplace=True)

In [19]:
train=train.drop(['id','Tg','FFV','Tc','Density','Rg'],axis=1)
test=test.drop(['id','SMILES'],axis=1)

In [20]:
tg=t_1.merge(train,on='SMILES',how='left')
ffv=t_2.merge(train,on='SMILES',how='left')
tc=t_3.merge(train,on='SMILES',how='left')
density=t_4.merge(train,on='SMILES',how='left')
rg=t_5.merge(train,on='SMILES',how='left')

In [21]:
for i in (tg,tc,density,ffv,rg):
    i.drop('SMILES',axis=1,inplace=True)
    i.dropna(inplace=True)

### Preparing model

In [22]:
def model(train_d,test_d,model,target,submission=False):
    # We divide the data into training and validation sets for model evaluation
    X=train_d.drop(target,axis=1)
    y=train_d[target].copy()
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)

    Model=model()
    if submission==False:
       Model.fit(X_train,y_train)
       y_pred=Model.predict(X_test)
       return mean_absolute_error(y_pred,y_test)         # We assess our model performance using MAE metric
    if submission==True:
       Model.fit(X,y)
       submission=Model.predict(test_d)
       return submission

### Model evaluation

In [23]:
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [24]:
models_to_compare = {
    'RandomForest': lambda: RandomForestRegressor(),
    'ExtraTrees': lambda: ExtraTreesRegressor(),
    'GradientBoosting': lambda: GradientBoostingRegressor(),
    'HistGradientBoosting': lambda: HistGradientBoostingRegressor(),
    'XGBoost': lambda: XGBRegressor(verbosity=0),
    'LightGBM': lambda: LGBMRegressor(verbose=-1),
    'CatBoost': lambda: CatBoostRegressor(verbose=0),
}

for name, model_init in models_to_compare.items():
    try:
        score = model(tg, test, model_init, 'Tg', submission=False)
        print(f'{name:<20}: MAE = {score:.5f}')
    except Exception as e:
        print(f'{name:<20}: Failed with error: {e}')

RandomForest        : MAE = 35.52949
ExtraTrees          : MAE = 35.41423
GradientBoosting    : MAE = 37.79259
HistGradientBoosting: MAE = 35.05668
XGBoost             : MAE = 39.22545
LightGBM            : MAE = 35.59063
CatBoost            : MAE = 35.26248


In [25]:
models_to_compare = {
    'RandomForest': lambda: RandomForestRegressor(),
    'ExtraTrees': lambda: ExtraTreesRegressor(),
    'GradientBoosting': lambda: GradientBoostingRegressor(),
    'HistGradientBoosting': lambda: HistGradientBoostingRegressor(),
    'XGBoost': lambda: XGBRegressor(verbosity=0),
    'LightGBM': lambda: LGBMRegressor(verbose=-1),
    'CatBoost': lambda: CatBoostRegressor(verbose=0),
}

for name, model_cls in models_to_compare.items():
    try:
        score = model(ffv, test, model_cls, 'FFV', submission=False)
        print(f'{name:<20}: MAE = {score:.5f}')
    except Exception as e:
        print(f'{name:<20}: Failed with error: {e}')

RandomForest        : MAE = 0.00758
ExtraTrees          : MAE = 0.00666
GradientBoosting    : MAE = 0.00995
HistGradientBoosting: MAE = 0.00781
XGBoost             : MAE = 0.00738
LightGBM            : MAE = 0.00772
CatBoost            : MAE = 0.00684


In [26]:
models_to_compare = {
    'RandomForest': lambda: RandomForestRegressor(),
    'ExtraTrees': lambda: ExtraTreesRegressor(),
    'GradientBoosting': lambda: GradientBoostingRegressor(),
    'HistGradientBoosting': lambda: HistGradientBoostingRegressor(),
    'XGBoost': lambda: XGBRegressor(verbosity=0),
    'LightGBM': lambda: LGBMRegressor(verbose=-1),
    'CatBoost': lambda: CatBoostRegressor(verbose=0),
}

for name, model_cls in models_to_compare.items():
    try:
        score = model(tc, test, model_cls, 'Tc', submission=False)
        print(f'{name:<20}: MAE = {score:.5f}')
    except Exception as e:
        print(f'{name:<20}: Failed with error: {e}')

RandomForest        : MAE = 0.03623
ExtraTrees          : MAE = 0.03731
GradientBoosting    : MAE = 0.03597
HistGradientBoosting: MAE = 0.03569
XGBoost             : MAE = 0.03571
LightGBM            : MAE = 0.03664
CatBoost            : MAE = 0.03451


In [27]:
models_to_compare = {
    'RandomForest': lambda: RandomForestRegressor(),
    'ExtraTrees': lambda: ExtraTreesRegressor(),
    'GradientBoosting': lambda: GradientBoostingRegressor(),
    'HistGradientBoosting': lambda: HistGradientBoostingRegressor(),
    'XGBoost': lambda: XGBRegressor(verbosity=0),
    'LightGBM': lambda: LGBMRegressor(verbose=-1),
    'CatBoost': lambda: CatBoostRegressor(verbose=0),
}

for name, model_cls in models_to_compare.items():
    try:
        score = model(density, test, model_cls, 'Density', submission=False)
        print(f'{name:<20}: MAE = {score:.5f}')
    except Exception as e:
        print(f'{name:<20}: Failed with error: {e}')

RandomForest        : MAE = 0.04799
ExtraTrees          : MAE = 0.03932
GradientBoosting    : MAE = 0.04752
HistGradientBoosting: MAE = 0.04533
XGBoost             : MAE = 0.04739
LightGBM            : MAE = 0.04467
CatBoost            : MAE = 0.03930


In [28]:
models_to_compare = {
    'RandomForest': lambda: RandomForestRegressor(),
    'ExtraTrees': lambda: ExtraTreesRegressor(),
    'GradientBoosting': lambda: GradientBoostingRegressor(),
    'HistGradientBoosting': lambda: HistGradientBoostingRegressor(),
    'XGBoost': lambda: XGBRegressor(verbosity=0),
    'LightGBM': lambda: LGBMRegressor(verbose=-1),
    'CatBoost': lambda: CatBoostRegressor(verbose=0),
}

for name, model_cls in models_to_compare.items():
    try:
        score = model(rg, test, model_cls, 'Rg', submission=False)
        print(f'{name:<20}: MAE = {score:.5f}')
    except Exception as e:
        print(f'{name:<20}: Failed with error: {e}')

RandomForest        : MAE = 1.80732
ExtraTrees          : MAE = 1.84876
GradientBoosting    : MAE = 1.92350
HistGradientBoosting: MAE = 2.02431
XGBoost             : MAE = 1.92072
LightGBM            : MAE = 1.97340
CatBoost            : MAE = 1.81393


### Submission

In [29]:
sub = {
    'id': ID,
    'Tg': model(tg, test, lambda: CatBoostRegressor(verbose=0), 'Tg', submission=True),
    'FFV': model(ffv, test, lambda: CatBoostRegressor(verbose=0), 'FFV', submission=True),
    'Tc': model(tc, test, lambda: CatBoostRegressor(verbose=0), 'Tc', submission=True),
    'Density': model(density, test, lambda: CatBoostRegressor(verbose=0), 'Density', submission=True),
    'Rg': model(rg, test, RandomForestRegressor, 'Rg', submission=True)
}

In [30]:
submission=pd.DataFrame(sub)

In [31]:
submission

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,171.132531,0.37506,0.210824,1.153362,20.557311
1,1422188626,207.932945,0.379103,0.236521,1.083875,20.98686
2,2032016830,114.399048,0.354266,0.24914,1.083284,20.603246


In [32]:
submission.to_csv('submission.csv',index=False)