# Load previous data

In [1]:
import pandas as pd

# Previously merged training table; the path is relative to this notebook.
TRAIN_CSV_PATH = '../from_natsume/train_Tc-only_merged.csv'
train = pd.read_csv(TRAIN_CSV_PATH)

# Summary statistics are rendered explicitly; the row preview is the cell's own output.
display(train.describe())
train.head()

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
count,8103.0,511.0,7030.0,867.0,613.0,614.0
mean,1062722000.0,96.452314,0.367212,0.256539,0.985484,16.419787
std,631567300.0,111.228279,0.029609,0.101271,0.146189,4.60864
min,15259.0,-148.029738,0.226992,0.0465,0.748691,9.728355
25%,509976200.0,13.674509,0.349549,0.186167,0.890243,12.540328
50%,1061550000.0,74.040183,0.364264,0.236,0.948193,15.052194
75%,1613181000.0,161.147595,0.38079,0.325,1.062096,20.411067
max,2147438000.0,472.25,0.777097,1.59,1.840999,34.672906


Unnamed: 0,SMILES,id,Tg,FFV,Tc,Density,Rg
0,*/C(=C(/*)c1ccccc1)c1ccccc1,218059466,206.569886,,,,
1,*/C(=C(\c1ccccc1)c1ccc(*)cc1)c1ccccc1,15862,,,0.338,,
2,*/C(F)=C(\F)C(F)(C(*)(F)F)C(F)(F)F,38242048,,,0.102,,
3,*/C=C(/*)C#CCCCCCCCCCCCCCCCCCCCCC(=O)O,686833175,,,0.4105,0.885737,15.064002
4,*/C=C(/*)CCCCCCCCCCCCCCCCCCCCC(=O)O,1620933064,,,0.403,0.868464,14.078555


In [2]:
TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
# Report how many rows carry a non-null label for each target column.
for t, n_labelled in train[TARGETS].count().items():
    print(f'"{t}": {n_labelled}')

"Tg": 511
"FFV": 7030
"Tc": 867
"Density": 613
"Rg": 614


# Load new data

In [3]:
import pandas as pd
from rdkit import Chem
import numpy as np

def make_smile_canonical(smile):
    """Return the RDKit canonical form of a SMILES string, or NaN if it cannot be parsed.

    Canonicalisation avoids duplicates caused by different spellings of the
    same structure, for example: canonical '*C=C(*)C' == '*C(=C*)C'.

    Parameters
    ----------
    smile : str
        SMILES string to canonicalise. Non-string input (e.g. a missing
        value read from a spreadsheet) yields NaN.

    Returns
    -------
    str or float
        The canonical SMILES, or ``np.nan`` when the input is not a string,
        RDKit cannot parse it, or RDKit raises while processing it.
    """
    if not isinstance(smile, str):
        return np.nan
    try:
        mol = Chem.MolFromSmiles(smile)
        # MolFromSmiles signals a parse failure by returning None, not by raising.
        if mol is None:
            return np.nan
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best-effort: any RDKit error maps to NaN. Unlike a bare `except:`,
        # this lets KeyboardInterrupt / SystemExit propagate.
        return np.nan

# Extra Tg data; the 'Tg (C)' column is renamed to match the training table.
# https://springernature.figshare.com/articles/dataset/dataset_with_glass_transition_temperature/24219958?file=42507037
data_tg2 = pd.read_csv('archive/JCIM_sup_bigsmiles.csv', usecols=['SMILES', 'Tg (C)'])
data_tg2 = data_tg2.rename(columns={'Tg (C)': 'Tg'})

# Extra Tg data given in Kelvin ('Tg [K]'); shifted by 273.15 to Celsius.
# https://www.sciencedirect.com/science/article/pii/S2590159123000377#ec0005
data_tg3 = pd.read_excel('archive/data_tg3.xlsx')
data_tg3 = data_tg3.rename(columns={'Tg [K]': 'Tg'})
data_tg3['Tg'] = data_tg3['Tg'] - 273.15

# Extra density data.
# https://github.com/Duke-MatSci/ChemProps
# NOTE(review): the rationale for this constant offset is not documented in the
# notebook -- presumably a calibration shift towards the training-set density
# scale; confirm before reuse.
DENSITY_OFFSET = 0.118

data_dnst = pd.read_excel('archive/data_dnst1.xlsx')
data_dnst = data_dnst.rename(columns={'density(g/cm3)': 'Density'})[['SMILES', 'Density']]
data_dnst['SMILES'] = data_dnst['SMILES'].apply(make_smile_canonical)
# Keep rows with a parsable SMILES and a usable density; rows whose density
# cell holds the literal string 'nylon' are dropped before the float cast.
density_rows_ok = (
    data_dnst['SMILES'].notnull()
    & data_dnst['Density'].notnull()
    & (data_dnst['Density'] != 'nylon')
)
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the unfiltered frame (SettingWithCopyWarning).
data_dnst = data_dnst[density_rows_ok].copy()
data_dnst['Density'] = data_dnst['Density'].astype('float64')
data_dnst['Density'] -= DENSITY_OFFSET

def add_extra_data(df_train, df_extra, target, canonicalize=None):
    """Merge externally sourced values of one target into the training table.

    The extra SMILES are canonicalised and duplicate SMILES are averaged.
    Then:

    * SMILES already present in ``df_train`` with a non-null ``target`` keep
      the training value (the competition data has priority);
    * SMILES present in ``df_train`` whose ``target`` is missing are filled
      with the averaged extra value;
    * SMILES absent from ``df_train`` are appended as new rows (all other
      columns are NaN for those rows).

    Neither input frame is modified; a new frame is returned.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training table with a 'SMILES' column and a ``target`` column.
    df_extra : pandas.DataFrame
        Extra data with a 'SMILES' column and a numeric ``target`` column.
    target : str
        Name of the target column to extend.
    canonicalize : callable, optional
        Function applied to every extra SMILES before matching. Defaults to
        ``make_smile_canonical``. Rows for which it returns NaN are dropped
        by the group-by step.

    Returns
    -------
    pandas.DataFrame
        Extended copy of ``df_train`` with a fresh 0..n-1 index.
    """
    if canonicalize is None:
        canonicalize = make_smile_canonical

    # Work on copies so the caller's frames are never mutated.
    df_train = df_train.copy()
    n_samples_before = int(df_train[target].notnull().sum())

    extra = df_extra[['SMILES', target]].copy()
    extra['SMILES'] = extra['SMILES'].apply(canonicalize)
    # groupby drops NaN keys, i.e. SMILES that could not be canonicalised.
    extra = extra.groupby('SMILES', as_index=False)[target].mean()

    train_smiles = set(df_train['SMILES'])
    extra_smiles = set(extra['SMILES'])
    unique_smiles_extra = extra_smiles - train_smiles

    # Give priority to target values from the competition's df: only SMILES
    # without any existing training value are candidates for filling.
    already_labelled = set(df_train.loc[df_train[target].notnull(), 'SMILES'])
    smiles_to_fill = (extra_smiles & train_smiles) - already_labelled

    # Impute missing values for the competition's SMILES in one vectorised pass.
    fill_mask = df_train['SMILES'].isin(smiles_to_fill)
    if fill_mask.any():
        extra_lookup = extra.set_index('SMILES')[target]
        df_train.loc[fill_mask, target] = df_train.loc[fill_mask, 'SMILES'].map(extra_lookup)

    new_rows = extra[extra['SMILES'].isin(unique_smiles_extra)]
    df_train = pd.concat([df_train, new_rows], axis=0).reset_index(drop=True)

    n_samples_after = int(df_train[target].notnull().sum())
    print(f'\nFor target "{target}" added {n_samples_after-n_samples_before} new samples!')
    print(f'New unique SMILES: {len(unique_smiles_extra)}')
    return df_train

# Fold each external dataset into the training table, one target at a time.
# Order matters: a value filled by an earlier source is kept over later ones.
extra_sources = (
    (data_tg2, 'Tg'),
    (data_tg3, 'Tg'),
    (data_dnst, 'Density'),
)
for extra_frame, extra_target in extra_sources:
    train = add_extra_data(train, extra_frame, extra_target)
train.describe()


[21:26:57] SMILES Parse Error: syntax error while parsing: *O[Si](*)([R])[R]
[21:26:57] SMILES Parse Error: check for mistakes around position 12:
[21:26:57] *O[Si](*)([R])[R]
[21:26:57] ~~~~~~~~~~~^
[21:26:57] SMILES Parse Error: Failed parsing SMILES '*O[Si](*)([R])[R]' for input: '*O[Si](*)([R])[R]'
[21:26:57] SMILES Parse Error: syntax error while parsing: *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4
[21:26:57] SMILES Parse Error: check for mistakes around position 28:
[21:26:57] c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=
[21:26:57] ~~~~~~~~~~~~~~~~~~~~^
[21:26:57] SMILES Parse Error: Failed parsing SMILES '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4' for input: '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4'
[21:26:57] SMILES Parse Error: syntax error while parsing: O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[21:26:57] SMILES Parse Error: check for mistakes around position 7:
[21:26:57] O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[21:26:57] ~~~~~~^
[21:26:57] SMILES Parse Error: F


For target "Tg" added 155 new samples!
New unique SMILES: 140

For target "Tg" added 499 new samples!
New unique SMILES: 499

For target "Density" added 634 new samples!
New unique SMILES: 524


Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
count,8103.0,1165.0,7030.0,867.0,1247.0,614.0
mean,1062722000.0,53.531782,0.367212,0.256539,1.071682,16.419787
std,631567300.0,123.146686,0.029609,0.101271,0.209673,4.60864
min,15259.0,-255.15,0.226992,0.0465,0.748691,9.728355
25%,509976200.0,-12.605837,0.349549,0.186167,0.912,12.540328
50%,1061550000.0,47.488886,0.364264,0.236,1.030703,15.052194
75%,1613181000.0,118.96,0.38079,0.325,1.192,20.411067
max,2147438000.0,472.25,0.777097,1.59,1.982,34.672906


In [4]:
TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
# Label counts per target after the external data has been merged in.
for t in TARGETS:
    n_labelled = train[t].notnull().sum()
    print(f'"{t}": {n_labelled}')

"Tg": 1165
"FFV": 7030
"Tc": 867
"Density": 1247
"Rg": 614


In [5]:
# Persist the augmented training table; the positional index is not written.
OUTPUT_CSV_PATH = 'host_tc-natsume_full-dmitry.csv'
train.to_csv(OUTPUT_CSV_PATH, index=False)