In [None]:
# | default_exp data


In [None]:
# |hide

from nbdev.showdoc import *


In [None]:
# |export

import os

import pandas as pd
from gpt3forchem.helpers import HashableDataFrame
from collections import Counter
# NOTE: `os.path.abspath("")` resolves to the current working directory, so this
# is the parent of the working directory at import time -- not the directory that
# contains this file, despite the name.
_THIS_DIR = os.path.abspath(os.path.dirname(os.path.abspath("")))


# Data

> Helpers for loading data


## Polymers


In [None]:
# |export

# Bracketed tokens that each have their own group of feature names.
_POLYMER_TOKENS = ("[W]", "[Tr]", "[Ta]", "[R]")

# Feature names in a fixed order: a `num_`/`max_` pair per token, then the bare
# token names, then the two sequence-level entries.
POLYMER_FEATURES = [
    *(
        f"{prefix}_{token}"
        for token in _POLYMER_TOKENS
        for prefix in ("num", "max")
    ),
    *_POLYMER_TOKENS,
    "rel_shannon",
    "length",
]


In [None]:
# |export
def get_polymer_data(
    datadir="../data" # path to folder with data files
):
    """Read `polymers.csv` from `datadir` and return it wrapped in a `HashableDataFrame`."""
    csv_path = os.path.join(datadir, "polymers.csv")
    polymers = pd.read_csv(csv_path)
    return HashableDataFrame(polymers)


In [None]:
df = get_polymer_data("../data/")


In [None]:
df.head()


Unnamed: 0.1,Unnamed: 0,smiles,string,deltaGmin,A2_normalized,deltaGmin_cat,A2_normalized_cat,num_[W],max_[W],num_[Tr],...,[W],[W].1,[Tr],[Tr].1,[Ta],[Ta].1,[R],[R].1,rel_shannon,length
0,0,[W][Ta][Tr][W][W][Ta][Ta][Ta][R][W][Tr][Tr][R]...,W-A-B-W-W-A-A-A-R-W-B-B-R-R-B-R,-7.535286,-0.109726,very large,very small,0.25,2,0.25,...,4.0,0.25,4.0,0.25,4.0,0.25,4.0,0.25,0.5,16
1,1,[R][W][W][R][R][Tr][Tr][Tr][Ta][Ta][Ta][W][W][...,R-W-W-R-R-B-B-B-A-A-A-W-W-A-R-B,-7.270527,0.580595,very large,very large,0.4,2,0.2,...,4.0,0.25,4.0,0.25,4.0,0.25,4.0,0.25,0.5,16
2,2,[Ta][R][Ta][W][Tr][W][Ta][R][Tr][W][Ta][Tr][Tr...,A-R-A-W-B-W-A-R-B-W-A-B-B-R-W-R,-6.416311,0.95632,very large,very large,0.0,0,1.0,...,4.0,0.25,4.0,0.25,4.0,0.25,4.0,0.25,0.5,16
3,3,[W][Ta][R][Ta][Tr][Tr][Tr][W][Ta][W][Tr][R][Ta...,W-A-R-A-B-B-B-W-A-W-B-R-A-W-R-R,-6.684816,1.129924,very large,very large,0.0,0,0.5,...,4.0,0.25,4.0,0.25,4.0,0.25,4.0,0.25,0.5,16
4,4,[R][R][Tr][Tr][W][R][Ta][W][R][W][Ta][Tr][Ta][...,R-R-B-B-W-R-A-W-R-W-A-B-A-A-W-B,-6.606492,-0.496439,very large,very small,0.0,0,0.333333,...,4.0,0.25,4.0,0.25,4.0,0.25,4.0,0.25,0.5,16


## Photoswitch


In [None]:
df_photoswitches = pd.read_csv('../data/photoswitches.csv')

In [None]:
# Tally how often each SMILES string occurs, most frequent first; any entry with
# a count above 1 is a molecule that appears more than once in the raw file.
Counter(df_photoswitches['SMILES']).most_common()

[('C[N]1C=CC(=N1)N=NC2=CC=CC=C2', 2),
 ('[H]C1=CC([N+]([O-])=O)=CC([H])=C1/N=N/C2=CC([H])=C(C=C2OC)N(CC)CC', 2),
 ('C[N]1C=NC(=N1)N=NC2=CC=CC=C2', 1),
 ('C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2', 1),
 ('C[N]1C=C(C=N1)N=NC2=CC=CC=C2', 1),
 ('C[N]1N=C(C)C(=C1C)N=NC2=CC=CC=C2', 1),
 ('C[N]1N=CC=C1N=NC2=CC=CC=C2', 1),
 ('C[N]1N=CC(=C1N=NC2=CC=CC=C2)C', 1),
 ('CC(N(C)C(C)=C1)=C1/N=N/C2=CC=CC=C2', 1),
 ('CC(N(C)C(C)=C1C)=C1/N=N/C2=CC=CC=C2', 1),
 ('CN1C(/N=N/C2=CC=CC=C2)=C(C)C=C1C', 1),
 ('CN1C(/N=N/C2=CC=CC=C2)=CC=C1', 1),
 ('CN1C(/N=N/C2=CC=CC=C2)=NC=C1', 1),
 ('CN(C=N1)C=C1/N=N/C2=CC=CC=C2', 1),
 ('CN1C(/N=N/C2=CC=CC=C2)=CN=C1', 1),
 ('NC(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('COC(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('CC(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('C1(/N=N/C2=CC=NN2)=CC=NN1', 1),
 ('[H]C(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('BrC(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('FC1=CC=CC=C1N=NC2=NNC=C2', 1),
 ('O=C(O)C(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('O=C(C1=CC=C(/N=N/C2=NNC=C2)C=C1)OCC', 1),
 ('CC(C(C=C1)=CC=C1N=NC2=NNC=C2

In [None]:
# |export
def get_photoswitch_data(
    datadir="../data" # path to folder with data files
):
    """Read `photoswitches.csv` from `datadir` and return a cleaned `HashableDataFrame`.

    Rows without an E isomer pi-pi* transition wavelength are dropped, then only
    the first row of every repeated SMILES string is kept.
    """
    csv_path = os.path.join(datadir, "photoswitches.csv")
    photoswitches = (
        pd.read_csv(csv_path)
        .dropna(subset=['E isomer pi-pi* wavelength in nm'])
        # It is unclear how (or whether) the initial work handled duplicates, but
        # the raw file certainly has some, e.g. C[N]1C=CC(=N1)N=NC2=CC=CC=C2.
        .drop_duplicates(subset=['SMILES'])
        # No `drop=True`: the previous row labels are kept as an `index` column.
        .reset_index()
    )
    return HashableDataFrame(photoswitches)

In [None]:
df = get_photoswitch_data("../data/")


In [None]:
df.head()


Unnamed: 0,SMILES,rate of thermal isomerisation from Z-E in s-1,Solvent used for thermal isomerisation rates,Z PhotoStationaryState,E PhotoStationaryState,E isomer pi-pi* wavelength in nm,Extinction,E isomer n-pi* wavelength in nm,Extinction coefficient in M-1 cm-1,Z isomer pi-pi* wavelength in nm,...,CAM-B3LYP/6-31G** DFT E isomer n-pi* wavelength in nm,CAM-B3LYP/6-31G** DFT Z isomer pi-pi* wavelength in nm,CAM-B3LYP/6-31G** DFT Z isomer n-pi* wavelength in nm,BHLYP/6-31G* DFT E isomer pi-pi* wavelength in nm,BHLYP/6-31G* DFT E isomer n-pi* wavelength in nm,BHLYP/6-31G* Z isomer pi-pi* wavelength in nm,BHLYP/6-31G* DFT Z isomer n-pi* wavelength in nm,name,selfies,wavelength_cat
0,C[N]1N=NC(=N1)N=NC2=CC=CC=C2,2.1e-07,MeCN,76.0,72.0,310.0,1.67,442.0,0.0373,290.0,...,,,,,,,,,[C][NH0][N][=N][C][=Branch1][Ring2][=N][Ring1]...,0.0
1,C[N]1C=NC(=N1)N=NC2=CC=CC=C2,3.8e-07,MeCN,90.0,84.0,310.0,1.87,438.0,0.0505,272.0,...,,,,,,,,,[C][NH0][C][=N][C][=Branch1][Ring2][=N][Ring1]...,0.0
2,C[N]1C=CC(=N1)N=NC2=CC=CC=C2,1.1e-07,MeCN,98.0,97.0,320.0,1.46,425.0,0.0778,272.0,...,,,,,,,,,[C][NH0][C][=C][C][=Branch1][Ring2][=N][Ring1]...,0.0
3,C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2,1.5e-06,MeCN,96.0,87.0,325.0,1.74,428.0,0.0612,286.0,...,,,,,,,,,[C][NH0][C][=C][Branch1][C][C][C][=Branch1][Ri...,0.0
4,C[N]1C=C(C=N1)N=NC2=CC=CC=C2,7.6e-09,MeCN,98.0,70.0,328.0,1.66,417.0,0.064,275.0,...,427.0,256.0,401.0,,,,,,[C][NH0][C][=C][Branch1][Branch1][C][=N][Ring1...,0.0


## MOF

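Let's generate the MOF dataset we'll use: QMOF entries that have a MOFid, merged with the CSD metadata and saved to `../data/mof.csv`.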

In [None]:
from mofdscribe.datasets import QMOFDataset
import pandas as pd

In [None]:
data = QMOFDataset(drop_nan=False)

# Keep only entries that have a MOFid. The explicit `.copy()` makes `df` an
# independent frame, so adding a column below no longer raises pandas'
# SettingWithCopyWarning and cannot write through to the dataset's own frame.
# NOTE(review): `_df` is a private attribute of QMOFDataset -- confirm there is
# no public accessor that should be used instead.
df = data._df.dropna(subset=['info.mofid.mofid']).copy()

# Keep the part before the first ";" and strip the " MOFid-v1" tag from it.
df_clean_mofid = [
    s.split(";")[0].replace(" MOFid-v1", "") for s in df['info.mofid.mofid']
]

df['info.mofid.mofid_clean'] = df_clean_mofid

2022-09-04 17:58:04.992 | DEBUG    | mofdscribe.datasets.qmof_dataset:__init__:257 - Dropped 0 duplicate basenames. New length 15042
2022-09-04 17:58:05.145 | DEBUG    | mofdscribe.datasets.qmof_dataset:__init__:263 - Dropped 136 duplicate graphs. New length 14906
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['info.mofid.mofid_clean'] = df_clean_mofid


In [None]:
df_csd = pd.read_csv('../data/csd_meta.csv')

In [None]:
# Join key: the first six characters of each CSD entry name. The vectorised
# `.str` accessor replaces a Python-level `apply` and yields NaN (instead of
# raising) for a missing name.
df_csd['basename'] = df_csd['name'].str[:6]

In [None]:
# Join the QMOF rows to the CSD metadata where `info.basename` equals `basename`.
# No `how=` is passed, so this is pandas' default inner join: rows without a
# match on either side are dropped.
df_merged = df.merge(df_csd, left_on='info.basename', right_on='basename')

In [None]:
# Number of merged rows with no CH4 Henry coefficient (compare with len(df_merged)).
df_merged['outputs.CH4-henry_coefficient-mol--kg--Pa'].isna().sum()

3525

In [None]:
len(df_merged)

3627

In [None]:
# Save the merged table without the index column; this is the `mof.csv` file
# that `get_mof_data` reads from its default `datadir`.
df_merged.to_csv('../data/mof.csv', index=False)

In [None]:
# | export 

def get_mof_data(
    datadir="../data" # path to folder with data files
):
    """Read `mof.csv` from `datadir` and return it wrapped in a `HashableDataFrame`."""
    csv_path = os.path.join(datadir, "mof.csv")
    mofs = pd.read_csv(csv_path)
    return HashableDataFrame(mofs)

In [None]:
get_mof_data()

  return pd.read_csv(os.path.join(datadir, "mof.csv"))


Unnamed: 0,flavor.all,flavor.gcmc,flavor.csd,flavor.csd-gcmc,outputs.pbe.energy_total,outputs.pbe.energy_vdw,outputs.pbe.energy_elec,outputs.pbe.net_magmom,outputs.pbe.bandgap,outputs.pbe.cbm,...,features.amd_all_mean_91,features.amd_all_mean_92,features.amd_all_mean_93,features.amd_all_mean_94,features.amd_all_mean_95,features.amd_all_mean_96,features.amd_all_mean_97,features.amd_all_mean_98,features.amd_all_mean_99,info.mofid.mofid_clean
0,True,False,False,False,-3372.0,-35.840,-3336.0,2.0,0.2255,0.17100,...,6.450,6.470,6.490,6.508,6.535,6.562,6.582,6.600,6.620,[Cu][Cu].[O-]C(=O)c1ccc(cc1)N(c1ccc(cc1)C(=O)O...
1,True,False,True,False,-3322.0,-27.750,-3294.0,0.0,2.8700,0.07135,...,7.492,7.516,7.543,7.566,7.594,7.620,7.650,7.680,7.707,O=C(Nc1ccc(cc1)C(=O)[O-])Nc1ccc(cc1)C(=O)[O-]....
2,True,False,False,False,-3314.0,-36.750,-3276.0,0.0,3.1910,2.53700,...,6.320,6.348,6.375,6.400,6.426,6.450,6.473,6.496,6.520,[O-]C(=O)c1ccc(cc1)Oc1ccc(cc1)C(=O)[O-].[Zn].n...
3,True,True,False,False,-3298.0,-37.840,-3260.0,0.0,2.0530,2.34400,...,6.050,6.074,6.098,6.120,6.145,6.168,6.190,6.210,6.234,[Nd].[O-]C(=O)c1ccc(s1)C(=O)[O-].rob.cat0
4,True,True,False,False,-3272.0,-33.030,-3238.0,104.0,0.2050,-1.45700,...,6.906,6.940,6.960,6.984,6.996,7.023,7.050,7.070,7.090,O[Fe][O]([Fe]O)[Fe]O.[O-]C(=O)c1cccc(c1)C(=O)[...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3511,True,False,False,False,-160.5,-2.016,-158.5,0.0,3.2100,2.76400,...,6.160,6.190,6.223,6.254,6.290,6.305,6.324,6.348,6.363,O.[Na].[O]n1nnnc1c1nnnn1[O].hcb.cat0
3512,True,False,False,False,-158.8,-2.113,-156.6,0.0,2.9360,2.03700,...,6.190,6.220,6.250,6.280,6.305,6.320,6.344,6.363,6.380,[Na].[O-]C(=O)C=CC(=O)O.fsc.cat0
3513,True,True,True,True,-156.8,-2.445,-154.2,0.0,2.3100,2.41000,...,6.434,6.445,6.465,6.480,6.496,6.516,6.543,6.570,6.600,Cc1c[n+]([O-])c(c[n+]1[O-])C.[Cd].[S]C#N.sql.cat0
3514,True,False,False,False,-140.9,-2.174,-138.6,0.0,4.5700,4.13000,...,6.234,6.250,6.270,6.290,6.316,6.340,6.367,6.395,6.410,[Ga].[OH2][Ga](F)(F)[OH2].[O]P(=O)(CCP(=O)([O]...
