## ⚙️ Config

In [1]:
# Download P2MAT predictions on the test set
!gdown 1Bl7Cf1DDLdbaplKhWlCMRVXmcOMfK5zj
!pip install --quiet rdkit

Downloading...
From: https://drive.google.com/uc?id=1Bl7Cf1DDLdbaplKhWlCMRVXmcOMfK5zj
To: /kaggle/working/P2MAT-Baseline.csv
100%|██████████████████████████████████████| 15.2k/15.2k [00:00<00:00, 30.0MB/s]
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd

from tqdm.auto import tqdm
from rdkit import Chem
# Register tqdm with pandas so that `.progress_apply` (used below when
# canonicalizing the large external table) is available.
tqdm.pandas()

In [3]:
def canonicalize(smile):
    """Return the RDKit canonical SMILES for ``smile``.

    Parameters
    ----------
    smile : str
        SMILES string to normalise.

    Returns
    -------
    str or None
        The canonical SMILES, or ``None`` when RDKit cannot parse the input
        or raises while processing it (e.g. a non-string / missing value).
    """
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Deliberately best-effort: any RDKit/argument error maps to None.
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupts
        # still stop a long `apply` over many rows.
        return None

## ⛃ Competition Data

In [4]:
# Load the competition training set, keep only the raw SMILES and the target
# column, and add the RDKit-canonical form of every SMILES.
df_train = (
    pd.read_csv('/kaggle/input/melting-point/train.csv')
    .loc[:, ['SMILES', 'Tm']]
    .assign(canonical_smiles=lambda frame: frame['SMILES'].apply(canonicalize))
)
display(df_train.head())
df_train.shape

Unnamed: 0,SMILES,Tm,canonical_smiles
0,FC1=C(F)C(F)(F)C1(F)F,213.15,FC1=C(F)C(F)(F)C1(F)F
1,c1ccc2c(c1)ccc3Nc4ccccc4c23,407.15,c1ccc2c(c1)ccc1[nH]c3ccccc3c12
2,CCN1C(C)=Nc2ccccc12,324.15,CCn1c(C)nc2ccccc21
3,CC#CC(=O)O,351.15,CC#CC(=O)O
4,CCCCC(S)C,126.15,CCCCC(C)S


(2662, 3)

In [5]:
# Load the competition test set, keep the row id and the raw SMILES, and add
# the RDKit-canonical form of every SMILES.
df_test = (
    pd.read_csv('/kaggle/input/melting-point/test.csv')
    .loc[:, ['id', 'SMILES']]
    .assign(canonical_smiles=lambda frame: frame['SMILES'].apply(canonicalize))
)
display(df_test.head())
df_test.shape

Unnamed: 0,id,SMILES,canonical_smiles
0,1022,CCOC(=O)c1ccc(O)cc1,CCOC(=O)c1ccc(O)cc1
1,1146,CCCCCCc1ccc(O)cc1O,CCCCCCc1ccc(O)cc1O
2,79,ClCBr,ClCBr
3,2279,C=CCCCCCCCC,C=CCCCCCCCC
4,1342,Fc1ccc(cc1)C(F)(F)F,Fc1ccc(C(F)(F)F)cc1


(666, 3)

In [6]:
# Find molecules present in BOTH the train and the test set by comparing
# canonical SMILES, and pair each such test row with its training label.

# `canonicalize` yields None for SMILES it cannot parse. Drop those before
# intersecting: otherwise None would count as a "common molecule" and the
# merge below would pair unrelated, unparsable rows with each other.
common_smiles = (
    set(df_train['canonical_smiles'].dropna())
    & set(df_test['canonical_smiles'].dropna())
)

train_common = (
    df_train.loc[df_train['canonical_smiles'].isin(common_smiles)]
    .rename(columns={'SMILES': 'SMILES_train'})
    .loc[:, ['canonical_smiles', 'Tm', 'SMILES_train']]
)

# reset_index() exposes the test row's index label as a column so the match
# can be traced back to its row in df_test.
test_common = (
    df_test.loc[df_test['canonical_smiles'].isin(common_smiles)]
    .reset_index()
    .rename(columns={'index': 'test_index', 'SMILES': 'SMILES_test'})
    .loc[:, ['canonical_smiles', 'SMILES_test', 'test_index']]
)

merged = pd.merge(train_common, test_common, on='canonical_smiles', how='inner')
merged

Unnamed: 0,canonical_smiles,Tm,SMILES_train,SMILES_test,test_index
0,C1=CCCCC=CC1,220.15,C1=CCCCC=CC1,C1=CCC=CCCC1,426
1,CCC=CCCC,136.55,CCCC=CCC,CCC=CCCC,371


<p style="background-color:#ffe6f7; 
          padding:15px; 
          color:#111;
          font-size:16px;
          border-width:3px; 
          border-color:#f5dce9; 
          border-style:solid;
          border-radius:6px"> 📄 After converting the SMILES of both the train and the test set to their <code>canonical form</code>, we find <code>2</code> molecules that appear in both sets. Their training labels can be used directly as annotations for the corresponding test rows.

</p>


## 🔗 Load External Dataset

In [7]:
# Load the external SMILES / melting-point table. This takes some time.
# Malformed CSV lines are skipped instead of aborting the read.
df_combined = pd.read_csv(
    "/kaggle/input/melting-point-smiles/smiles_melting_point.csv",
    on_bad_lines='skip',
)

# Keep only the columns used below and give them short names.
df_combined = df_combined[
    ["SMILES", "NAME", "Melting Point {measured, converted}"]
].rename(
    columns={
        'Melting Point {measured, converted}': 'mp_k',
        'NAME': 'name',
    }
)

# Remove the row labelled 248849, which the author flagged as a duplicate.
df_combined = df_combined.drop(index=248849)

# Canonicalize every external SMILES (with a progress bar) so it can be
# compared with the canonical SMILES of the competition data.
df_combined['canonical_smiles'] = df_combined['SMILES'].progress_apply(canonicalize)
df_combined

  0%|          | 0/274977 [00:00<?, ?it/s]

[19:32:09] Explicit valence for atom # 13 Cl, 7, is greater than permitted
[19:32:10] Explicit valence for atom # 32 Cl, 5, is greater than permitted


Unnamed: 0,SMILES,name,mp_k,canonical_smiles
0,CCCCC1=NC=CC2=C(C=CC=C12)[N+]([O-])=O,1-n-butyl-5-nitro-isoquinoline,342.2,CCCCc1nccc2c([N+](=O)[O-])cccc12
1,NC1=C2C=C(Cl)N=CC2=CC=C1,3-chloro-5-amino-isoquinoline,449.5,Nc1cccc2cnc(Cl)cc12
2,CC1=CC2=C(C=CC=C2C(Cl)=N1)[N+]([O-])=O,1-chloro-3-methyl-5-nitro-isoquinoline,385.0,Cc1cc2c([N+](=O)[O-])cccc2c(Cl)n1
3,CCCCCCCCCCCCC1=CC(=O)C=CC1=S,2-dodecylthio-p-benzoquinone,404.5,CCCCCCCCCCCCC1=CC(=O)C=CC1=S
4,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4...,"11,18-dihydroxy-pregna-1,4-diene-3,20-dione 18...",436.0,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4...
...,...,...,...,...
274973,CS(=O)(=O)C1CCN(CC2=CC=C(COC3=C4CN(C5CCC(=O)NC...,3-(4-((4-((4-(methylsulfonyl)piperidin-1-yl)me...,447.0,CS(=O)(=O)C1CCN(Cc2ccc(COc3cccc4c3CN(C3CCC(=O)...
274974,CCOC(=O)OC1=C(C(=O)NC1CC1CCS(=O)(CC1)=NC(=O)C(...,"carbonic acid 4-(2,5-dimethyl-phenyl)-5-oxo-2-...",361.0,CCOC(=O)OC1=C(c2cc(C)ccc2C)C(=O)NC1CC1CCS(=O)(...
274975,CCCCCCCCCCCCCCCC(=O)OCC(CSC[C@H](NC(=O)OCC1=CC...,"N-Fluorenylmethoxycarbonyl-S-[2,3-bis(palmitoy...",326.0,CCCCCCCCCCCCCCCC(=O)OCC(CSC[C@H](NC(=O)OCc1ccc...
274976,CCCCCCCCCCCCCCCC(=O)OCC(CSC[C@H](NC(=O)OCC1=CC...,"N-Fluorenylmethoxycarbonyl-S-[2,3-bis(palmitoy...",356.0,CCCCCCCCCCCCCCCC(=O)OCC(CSC[C@H](NC(=O)OCc1ccc...


## ⛓️‍💥 Matching Annotations

In [8]:
# Annotate every test molecule with a melting point looked up in the external
# table (`df_combined`), trying three keys in order:
#   1. the raw SMILES string,
#   2. the raw SMILES string reversed character by character,
#   3. the RDKit-canonical SMILES.
# Molecules shared with the training set (`merged`) are applied afterwards.

# Build the lookup tables once instead of scanning the whole external frame
# three times per test molecule. Each table maps a key to the melting point
# of the FIRST external row carrying that key.
# Rows without a melting point cannot annotate anything, so they are dropped
# up front; otherwise they would be counted as matches while leaving Tm empty.
external = df_combined.dropna(subset=['mp_k'])
mp_by_smiles = (
    external.dropna(subset=['SMILES'])
    .drop_duplicates(subset='SMILES')
    .set_index('SMILES')['mp_k']
    .to_dict()
)
mp_by_canonical = (
    external.dropna(subset=['canonical_smiles'])
    .drop_duplicates(subset='canonical_smiles')
    .set_index('canonical_smiles')['mp_k']
    .to_dict()
)

count = 0
count_canonical = 0
count_reversal = 0
Tm = []

# Add annotations from External data
# =================================
for row in tqdm(df_test.itertuples(index=False), total=len(df_test)):
    # Exact Match Lookup
    smile = row.SMILES.strip()
    if smile in mp_by_smiles:
        count += 1
        Tm.append(mp_by_smiles[smile])
        continue

    # Reverse Match: sometimes a SMILES is written in reverse,
    # "CC=CCCC" is the same as "CCCC=CC".
    # NOTE(review): reversing characters only yields the same molecule for
    # simple unbranched strings; a hit here still requires the reversed string
    # to literally exist in the external table.
    smile_reversed = smile[::-1]
    if smile_reversed in mp_by_smiles:
        count_reversal += 1
        Tm.append(mp_by_smiles[smile_reversed])
        continue

    # Canonical LookUp. `canonical_smiles` is None when RDKit could not parse
    # the SMILES, so guard before calling string methods on it.
    canonical_smiles = row.canonical_smiles
    if isinstance(canonical_smiles, str):
        canonical_smiles = canonical_smiles.strip()
        if canonical_smiles in mp_by_canonical:
            count_canonical += 1
            Tm.append(mp_by_canonical[canonical_smiles])
            continue

    Tm.append(None)


# Add annotations from training data
# =================================
# Test molecules whose canonical SMILES also occur in the training set take
# the training label. The pairs are read from `merged` rather than being
# hard-coded; when this notebook was written they were:
#  MP_K     SMILES Test      Test Index
#  220.15   C1=CCC=CCCC1     426
#  136.55   CCC=CCCC         371
# ================================
count_train = 0
for test_index, tm_train in zip(merged['test_index'], merged['Tm']):
    # `test_index` is an index label of df_test; Tm is a positional list.
    position = df_test.index.get_loc(test_index)
    if Tm[position] is None:
        # Count only rows not already matched above, so the total below is
        # not inflated by counting the same test row twice.
        count_train += 1
    Tm[position] = tm_train


# Optional: show counts
total_count = count + count_canonical + count_reversal + count_train
print("Canonical matches:", count_canonical)
print("Direct matches:", count)
print("Reversal matches:", count_reversal)
print("Total matches:", total_count)
perc = (total_count / df_test.shape[0]) * 100
print(f"Percentage of Matches: {perc :.3f} %")
print(f"N° without annotations: {df_test.shape[0] - total_count}")
df_test['Tm'] = Tm

  0%|          | 0/666 [00:00<?, ?it/s]

Canonical matches: 407
Direct matches: 154
Reversal matches: 36
Total matches: 599
Percentage of Matches: 89.940 %
N° without annotations: 67


In [9]:
# ======================================
# These are annotations collected from Trusted Sources:
# * PubChem
# * ChemSpider ...
# ======================================

# Melting points keyed by the test-set `id`.
TM_BY_ID = {
    2626: 321.65,
    311: 315.15,
    1665: 223.15,
    873: 243.15,
    2156: 227.6,
    3213: 356.15,
    2332: 307.15,
    3050: 338.65,
    1762: 243.15,
    1022: 390.15,
    79: 185.25,
    1342: 231.45,
    2082: 337.85,
    29: 198.15,
    2309: 287.65,
    553: 230.15,
    2344: 325.3,
    2060: 330.65,
    1748: 265.35,
    2578: 295.15,
    1313: 134.65,
    3208: 254.15,
    1745: 328.15,
    2026: 313.15,
    3255: 198.95,
}

# Melting points keyed by the raw test-set SMILES string.
TM_BY_SMILES = {
    "BrC(Br)C": 210.15,
    "CCON(=O)=O": 178.55,
    "ClOCl": 152.55,
    "C=C(F)F": 129.261111111111,
    "NN(C)C": 215.15,
    "BrC=CBr": 266.65,
    "Cl[Si](C)(C)C": 233.15,
    "Cl[Si](Cl)(C)C": 257.15,
    "BrC(Br)(F)F": 163.05,
    "Nc1ccc(cc1)N=Nc2ccccc2": 401.15,
    "Cl[Si](Cl)(Cl)C": 183.15,
    "C=CS(=O)(=O)C=C": 247.15,
    "CC(C)C": 134.85,
}

unmatched = []

# Apply the id-keyed annotations first, then the SMILES-keyed ones, so a row
# addressed by both ends up with the SMILES-keyed value.
for test_id, tm_value in TM_BY_ID.items():
    id_mask = df_test['id'] == test_id
    if not id_mask.any():
        unmatched.append(test_id)
    df_test.loc[id_mask, 'Tm'] = tm_value

# Compare against stripped SMILES, consistent with the lookup cell above,
# so stray whitespace in the CSV cannot silently defeat a manual annotation.
smiles_stripped = df_test['SMILES'].str.strip()
for smiles, tm_value in TM_BY_SMILES.items():
    smiles_mask = smiles_stripped == smiles
    if not smiles_mask.any():
        unmatched.append(smiles)
    df_test.loc[smiles_mask, 'Tm'] = tm_value

# A key that matches no test row is almost certainly a typo; surface it
# instead of letting the assignment be a silent no-op.
if unmatched:
    print("Manual annotations that matched no test row:", unmatched)

## 🤖 Annotation Completion via External Predictions


<div style="
    background-color: #fff6e4; 
    color: #111;
    font-size: 16px;
    padding: 15px;
    border: 3px solid #f5ecda;
    border-radius: 6px;
">🚀 <code>P2MAT</code> is a state-of-the-art machine learning model for predicting compound melting points; it achieves a score of <code>23.92180</code> on the leaderboard. For the test molecules that are still missing an annotation, we load the P2MAT predictions.
</div>

In [10]:
# Fill every test row that still lacks a melting point with the P2MAT
# prediction for the same `id`, then write the submission file.
df_p2mat = pd.read_csv("/kaggle/working/P2MAT-Baseline.csv")
submission = df_test[['id', 'Tm']].copy()

# One vectorised lookup instead of a filtered scan per row. If an id appears
# more than once in the P2MAT file, its first prediction is used.
p2mat_by_id = df_p2mat.drop_duplicates(subset='id').set_index('id')['Tm']
submission['Tm'] = submission['Tm'].fillna(submission['id'].map(p2mat_by_id))

# Refuse to write a submission that still has holes (an id absent from the
# P2MAT file, or a missing prediction) rather than failing later at scoring.
still_missing = submission[submission['Tm'].isna()]
if not still_missing.empty:
    raise ValueError(
        f"No melting point available for test ids: {still_missing['id'].tolist()}"
    )

submission.to_csv('submission.csv', index=False)

  0%|          | 0/666 [00:00<?, ?it/s]

In [11]:
# Preview the first lines of the submission file written above.
!head submission.csv

id,Tm
1022,390.15
1146,339.5
79,185.25
2279,206.8
1342,231.45
2082,337.85
29,198.15
515,316.8
2309,287.65
