In [None]:
import json
import time
from urllib.parse import quote

import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, PandasTools, Draw
from rdkit.Chem.Draw import IPythonConsole

# Read the hydrocarbon table from the repo-relative CSV, then preview the first rows.
df = pd.read_csv(filepath_or_buffer="data/hydrocarbons.csv")
df.head()

* Take the IUPAC names and generate a dataset with SMILES, melting point (mp), boiling point (bp), etc.

In [None]:
def fetch_smiles_by_name(name, max_retries=5, backoff=1.0, timeout=30):
    """Look up the canonical SMILES for a compound name via PubChem PUG REST.

    Parameters
    ----------
    name : str
        Compound name (e.g. an IUPAC name). It is percent-encoded before being
        placed in the URL path, so characters such as ``/``, ``#``, ``?`` or
        spaces cannot change the request that is sent.
    max_retries : int, default 5
        Maximum number of *additional* requests made while the server keeps
        answering 503. After that the 503 is reported as an error instead of
        retrying forever.
    backoff : float, default 1.0
        Base delay in seconds between retries; the wait before retry ``k``
        (0-based) is ``backoff * 2**k``.
    timeout : float, default 30
        Per-request timeout in seconds, forwarded to ``requests.get``.

    Returns
    -------
    tuple
        ``(smiles, None)`` on success, or ``('', {name: status_code})`` when no
        SMILES could be obtained (non-200 status, retries exhausted, or a 200
        response whose payload does not contain a SMILES entry).

    Raises
    ------
    requests.RequestException
        Network-level failures (connection errors, timeouts) are not caught
        here and propagate to the caller.
    """
    baseurl = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'
    # safe='' so that '/' inside a name is encoded too instead of splitting the path.
    url = baseurl + quote(str(name), safe='') + '/property/CanonicalSMILES/JSON'

    r = requests.get(url, timeout=timeout)
    # 503 is treated as "try again"; bound the attempts and back off exponentially
    # rather than recursing immediately without limit.
    for attempt in range(max_retries):
        if r.status_code != 503:
            break
        time.sleep(backoff * 2 ** attempt)
        r = requests.get(url, timeout=timeout)

    if r.status_code == 200:
        try:
            props = r.json()['PropertyTable']['Properties'][0]
            # NOTE(review): newer PubChem responses reportedly label this field
            # 'ConnectivitySMILES' even when 'CanonicalSMILES' is requested —
            # confirm against the current PUG REST documentation.
            smiles = props.get('CanonicalSMILES') or props['ConnectivitySMILES']
        except (KeyError, IndexError, TypeError, AttributeError, ValueError):
            # Unexpected or non-JSON payload: fall through to the error path so
            # one odd response does not abort a whole batch of lookups.
            pass
        else:
            return smiles, None

    print(f"problem retrieving data for {name}")
    return '', {name: r.status_code}

In [None]:
# Resolve every IUPAC name to a SMILES string, one lookup per row.
# `smiles_set` keeps one entry per row (in row order) so it can be attached as
# a column afterwards; `errors` collects the failure records that
# fetch_smiles_by_name reports.
smiles_set, errors = [], {}
for name in df['IUPAC name']:
    print(f"getting PubChem data for {name}...")
    smiles, error = fetch_smiles_by_name(name)
    smiles_set.append(smiles)
    if not error:
        continue
    errors.update(error)

In [None]:
# Attach the fetched SMILES strings as a new column; `assign` hands back a fresh
# frame, so no separate copy step is needed before rebinding `df`.
df = df.assign(**{'Canonical SMILES': smiles_set})
df.head()

* Create RDKit `Mol` objects from the SMILES strings

In [None]:
# Parse each SMILES string into an RDKit molecule, keeping row order.
# NOTE(review): rows whose lookup failed carry '' in this column — confirm how
# the descriptor step downstream is expected to treat those entries.
mol_list = [Chem.MolFromSmiles(smiles) for smiles in df['Canonical SMILES']]

# Spot-check a single parsed molecule (row 183) by displaying it.
mol_list[183]

* Calculate the Autocorr2D descriptors for each molecule
* Add them to the dataframe, one column per descriptor

In [None]:
# One Autocorr2D descriptor vector per molecule, in the same order as the rows of `df`.
autocorr2d_set = [Chem.rdMolDescriptors.CalcAUTOCORR2D(mol) for mol in mol_list]

# Build all descriptor columns in a single frame and attach them with one concat.
# Inserting the columns one at a time fragments the frame (pandas warns about it),
# and copying `df` inside the loop to silence that warning re-copies the whole
# table once per descriptor.
autocorr2d_cols = [f'Autocorr2D-{i+1}' for i in range(len(autocorr2d_set[0]))]
autocorr2d_df = pd.DataFrame(
    [list(values) for values in autocorr2d_set],
    index=df.index,          # align positionally with the existing rows
    columns=autocorr2d_cols,
)

# Drop any descriptor columns left over from a previous run of this cell so that
# re-running it replaces them instead of appending duplicates.
df = pd.concat(
    [df.drop(columns=autocorr2d_cols, errors='ignore'), autocorr2d_df],
    axis=1,
)

df.head()

* Set up an ML model to predict the melting point (mp) or boiling point (bp) from the Autocorr2D descriptors

In [None]:
# Keep only the rows that have a melting point, since that column is the regression target.
newdf = df.dropna(axis=0, subset=['Melting point'])

# Select the feature columns by name rather than by position: a positional
# slice silently shifts if a column is added to or removed from the CSV,
# whereas the 'Autocorr2D-' prefix always picks exactly the descriptor columns.
autocorr2d_features = [col for col in newdf.columns if str(col).startswith('Autocorr2D-')]
X = newdf[autocorr2d_features]
Y = newdf['Melting point']

In [None]:
# Fit an ordinary least-squares model of melting point on the descriptor columns.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, Y)
model

* View the results and determine which descriptors are the best predictors