In [None]:
# | default_exp data


In [10]:
# |hide

from nbdev.showdoc import *


In [11]:
# |export

import os
from collections import Counter

import pandas as pd

from gpt3forchem.helpers import HashableDataFrame

_THIS_DIR = os.path.abspath(os.path.dirname(os.path.abspath("")))
import numpy as np 


# Data

> Helpers for loading data


In [12]:
# |export


def discretize(
    df: pd.DataFrame, col: str, n_bins: int = 5, new_name: str = None, labels=None
) -> None:
    """Adds a new column to the dataframe with the discretized values of the column."""
    if new_name is None:
        new_name = col + "_cat"
    if labels is None:
        labels = ["very small", "small", "medium", "large", "very large"]

    df[new_name] = pd.cut(
        df[col],
        n_bins,
        labels=labels,
    )


## Polymers


In [13]:
# |export

POLYMER_FEATURES = [
    "num_[W]",
    "max_[W]",
    "num_[Tr]",
    "max_[Tr]",
    "num_[Ta]",
    "max_[Ta]",
    "num_[R]",
    "max_[R]",
    "[W]",
    "[Tr]",
    "[Ta]",
    "[R]",
    "rel_shannon",
    "length",
]


In [14]:
# |export
def get_polymer_data(datadir="../data"):  # path to folder with data files
    return HashableDataFrame(pd.read_csv(os.path.join(datadir, "polymers.csv")))


In [15]:
df = get_polymer_data("../data/")


In [16]:
df.head()


Unnamed: 0.1,Unnamed: 0,smiles,string,deltaGmin,A2_normalized,deltaGmin_cat,A2_normalized_cat,num_[W],max_[W],num_[Tr],...,[W],[W].1,[Tr],[Tr].1,[Ta],[Ta].1,[R],[R].1,rel_shannon,length
0,0,[W][Ta][Tr][W][W][Ta][Ta][Ta][R][W][Tr][Tr][R]...,W-A-B-W-W-A-A-A-R-W-B-B-R-R-B-R,-7.535286,-0.109726,very large,very small,0.25,2,0.25,...,4.0,0.25,4.0,0.25,4.0,0.25,4.0,0.25,0.5,16
1,1,[R][W][W][R][R][Tr][Tr][Tr][Ta][Ta][Ta][W][W][...,R-W-W-R-R-B-B-B-A-A-A-W-W-A-R-B,-7.270527,0.580595,very large,very large,0.4,2,0.2,...,4.0,0.25,4.0,0.25,4.0,0.25,4.0,0.25,0.5,16
2,2,[Ta][R][Ta][W][Tr][W][Ta][R][Tr][W][Ta][Tr][Tr...,A-R-A-W-B-W-A-R-B-W-A-B-B-R-W-R,-6.416311,0.95632,very large,very large,0.0,0,1.0,...,4.0,0.25,4.0,0.25,4.0,0.25,4.0,0.25,0.5,16
3,3,[W][Ta][R][Ta][Tr][Tr][Tr][W][Ta][W][Tr][R][Ta...,W-A-R-A-B-B-B-W-A-W-B-R-A-W-R-R,-6.684816,1.129924,very large,very large,0.0,0,0.5,...,4.0,0.25,4.0,0.25,4.0,0.25,4.0,0.25,0.5,16
4,4,[R][R][Tr][Tr][W][R][Ta][W][R][W][Ta][Tr][Ta][...,R-R-B-B-W-R-A-W-R-W-A-B-A-A-W-B,-6.606492,-0.496439,very large,very small,0.0,0,0.333333,...,4.0,0.25,4.0,0.25,4.0,0.25,4.0,0.25,0.5,16


## Photoswitch


In [None]:
df_photoswitches = pd.read_csv("../data/photoswitches.csv")


In [None]:
Counter(df_photoswitches["SMILES"]).most_common()


[('C[N]1C=CC(=N1)N=NC2=CC=CC=C2', 2),
 ('[H]C1=CC([N+]([O-])=O)=CC([H])=C1/N=N/C2=CC([H])=C(C=C2OC)N(CC)CC', 2),
 ('C[N]1C=NC(=N1)N=NC2=CC=CC=C2', 1),
 ('C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2', 1),
 ('C[N]1C=C(C=N1)N=NC2=CC=CC=C2', 1),
 ('C[N]1N=C(C)C(=C1C)N=NC2=CC=CC=C2', 1),
 ('C[N]1N=CC=C1N=NC2=CC=CC=C2', 1),
 ('C[N]1N=CC(=C1N=NC2=CC=CC=C2)C', 1),
 ('CC(N(C)C(C)=C1)=C1/N=N/C2=CC=CC=C2', 1),
 ('CC(N(C)C(C)=C1C)=C1/N=N/C2=CC=CC=C2', 1),
 ('CN1C(/N=N/C2=CC=CC=C2)=C(C)C=C1C', 1),
 ('CN1C(/N=N/C2=CC=CC=C2)=CC=C1', 1),
 ('CN1C(/N=N/C2=CC=CC=C2)=NC=C1', 1),
 ('CN(C=N1)C=C1/N=N/C2=CC=CC=C2', 1),
 ('CN1C(/N=N/C2=CC=CC=C2)=CN=C1', 1),
 ('NC(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('COC(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('CC(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('C1(/N=N/C2=CC=NN2)=CC=NN1', 1),
 ('[H]C(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('BrC(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('FC1=CC=CC=C1N=NC2=NNC=C2', 1),
 ('O=C(O)C(C=C1)=CC=C1N=NC2=NNC=C2', 1),
 ('O=C(C1=CC=C(/N=N/C2=NNC=C2)C=C1)OCC', 1),
 ('CC(C(C=C1)=CC=C1N=NC2=NNC=C2

In [None]:
# |export
def get_photoswitch_data(datadir="../data"):  # path to folder with data files
    """By default we drop the rows without E isomer pi-pi* transition wavelength."""
    df = pd.read_csv(os.path.join(datadir, "photoswitches.csv"))
    df.dropna(subset=["E isomer pi-pi* wavelength in nm"], inplace=True)
    df.drop_duplicates(
        subset=["SMILES"], inplace=True
    )  # not sure how and if they did this in the initial work. There are certainly duplicates, e.g. C[N]1C=CC(=N1)N=NC2=CC=CC=C2 (see top)
    df.reset_index(inplace=True)
    return HashableDataFrame(df)


In [None]:
df = get_photoswitch_data("../data/")


In [None]:
df.head()


Unnamed: 0,SMILES,rate of thermal isomerisation from Z-E in s-1,Solvent used for thermal isomerisation rates,Z PhotoStationaryState,E PhotoStationaryState,E isomer pi-pi* wavelength in nm,Extinction,E isomer n-pi* wavelength in nm,Extinction coefficient in M-1 cm-1,Z isomer pi-pi* wavelength in nm,...,CAM-B3LYP/6-31G** DFT E isomer n-pi* wavelength in nm,CAM-B3LYP/6-31G** DFT Z isomer pi-pi* wavelength in nm,CAM-B3LYP/6-31G** DFT Z isomer n-pi* wavelength in nm,BHLYP/6-31G* DFT E isomer pi-pi* wavelength in nm,BHLYP/6-31G* DFT E isomer n-pi* wavelength in nm,BHLYP/6-31G* Z isomer pi-pi* wavelength in nm,BHLYP/6-31G* DFT Z isomer n-pi* wavelength in nm,name,selfies,wavelength_cat
0,C[N]1N=NC(=N1)N=NC2=CC=CC=C2,2.1e-07,MeCN,76.0,72.0,310.0,1.67,442.0,0.0373,290.0,...,,,,,,,,,[C][NH0][N][=N][C][=Branch1][Ring2][=N][Ring1]...,0.0
1,C[N]1C=NC(=N1)N=NC2=CC=CC=C2,3.8e-07,MeCN,90.0,84.0,310.0,1.87,438.0,0.0505,272.0,...,,,,,,,,,[C][NH0][C][=N][C][=Branch1][Ring2][=N][Ring1]...,0.0
2,C[N]1C=CC(=N1)N=NC2=CC=CC=C2,1.1e-07,MeCN,98.0,97.0,320.0,1.46,425.0,0.0778,272.0,...,,,,,,,,,[C][NH0][C][=C][C][=Branch1][Ring2][=N][Ring1]...,0.0
3,C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2,1.5e-06,MeCN,96.0,87.0,325.0,1.74,428.0,0.0612,286.0,...,,,,,,,,,[C][NH0][C][=C][Branch1][C][C][C][=Branch1][Ri...,0.0
4,C[N]1C=C(C=N1)N=NC2=CC=CC=C2,7.6e-09,MeCN,98.0,70.0,328.0,1.66,417.0,0.064,275.0,...,427.0,256.0,401.0,,,,,,[C][NH0][C][=C][Branch1][Branch1][C][=N][Ring1...,0.0


## MOF


Let's generate the data we'll use


In [None]:
import pandas as pd
from mofdscribe.datasets import CoREDataset, QMOFDataset


In [None]:
data = QMOFDataset(drop_nan=False)

df = data._df
df = df.dropna(subset=["info.mofid.mofid"])

df_clean_mofid = [
    s.split(";")[0].replace(" MOFid-v1", "") for s in df["info.mofid.mofid"]
]


df["info.mofid.mofid_clean"] = df_clean_mofid


2022-09-04 17:58:04.992 | DEBUG    | mofdscribe.datasets.qmof_dataset:__init__:257 - Dropped 0 duplicate basenames. New length 15042
2022-09-04 17:58:05.145 | DEBUG    | mofdscribe.datasets.qmof_dataset:__init__:263 - Dropped 136 duplicate graphs. New length 14906
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['info.mofid.mofid_clean'] = df_clean_mofid


In [None]:
df_csd = pd.read_csv("../data/csd_meta.csv")


In [None]:
df_csd["basename"] = df_csd["name"].apply(lambda x: x[:6])


In [None]:
df_merged = df.merge(df_csd, left_on="info.basename", right_on="basename")


In [None]:
df_merged["outputs.CH4-henry_coefficient-mol--kg--Pa"].isna().sum()


3525

In [None]:
len(df_merged)


3627

In [None]:
df_merged.to_csv("../data/mof.csv", index=False)


In [8]:
# | export


def get_mof_data(datadir="../data"):  # path to folder with data files
    df =  HashableDataFrame(pd.read_csv(os.path.join(datadir, "mof.csv")))


    return df

In [None]:
get_mof_data()


  return pd.read_csv(os.path.join(datadir, "mof.csv"))


Unnamed: 0,flavor.all,flavor.gcmc,flavor.csd,flavor.csd-gcmc,outputs.pbe.energy_total,outputs.pbe.energy_vdw,outputs.pbe.energy_elec,outputs.pbe.net_magmom,outputs.pbe.bandgap,outputs.pbe.cbm,...,features.amd_all_mean_91,features.amd_all_mean_92,features.amd_all_mean_93,features.amd_all_mean_94,features.amd_all_mean_95,features.amd_all_mean_96,features.amd_all_mean_97,features.amd_all_mean_98,features.amd_all_mean_99,info.mofid.mofid_clean
0,True,False,False,False,-3372.0,-35.840,-3336.0,2.0,0.2255,0.17100,...,6.450,6.470,6.490,6.508,6.535,6.562,6.582,6.600,6.620,[Cu][Cu].[O-]C(=O)c1ccc(cc1)N(c1ccc(cc1)C(=O)O...
1,True,False,True,False,-3322.0,-27.750,-3294.0,0.0,2.8700,0.07135,...,7.492,7.516,7.543,7.566,7.594,7.620,7.650,7.680,7.707,O=C(Nc1ccc(cc1)C(=O)[O-])Nc1ccc(cc1)C(=O)[O-]....
2,True,False,False,False,-3314.0,-36.750,-3276.0,0.0,3.1910,2.53700,...,6.320,6.348,6.375,6.400,6.426,6.450,6.473,6.496,6.520,[O-]C(=O)c1ccc(cc1)Oc1ccc(cc1)C(=O)[O-].[Zn].n...
3,True,True,False,False,-3298.0,-37.840,-3260.0,0.0,2.0530,2.34400,...,6.050,6.074,6.098,6.120,6.145,6.168,6.190,6.210,6.234,[Nd].[O-]C(=O)c1ccc(s1)C(=O)[O-].rob.cat0
4,True,True,False,False,-3272.0,-33.030,-3238.0,104.0,0.2050,-1.45700,...,6.906,6.940,6.960,6.984,6.996,7.023,7.050,7.070,7.090,O[Fe][O]([Fe]O)[Fe]O.[O-]C(=O)c1cccc(c1)C(=O)[...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3511,True,False,False,False,-160.5,-2.016,-158.5,0.0,3.2100,2.76400,...,6.160,6.190,6.223,6.254,6.290,6.305,6.324,6.348,6.363,O.[Na].[O]n1nnnc1c1nnnn1[O].hcb.cat0
3512,True,False,False,False,-158.8,-2.113,-156.6,0.0,2.9360,2.03700,...,6.190,6.220,6.250,6.280,6.305,6.320,6.344,6.363,6.380,[Na].[O-]C(=O)C=CC(=O)O.fsc.cat0
3513,True,True,True,True,-156.8,-2.445,-154.2,0.0,2.3100,2.41000,...,6.434,6.445,6.465,6.480,6.496,6.516,6.543,6.570,6.600,Cc1c[n+]([O-])c(c[n+]1[O-])C.[Cd].[S]C#N.sql.cat0
3514,True,False,False,False,-140.9,-2.174,-138.6,0.0,4.5700,4.13000,...,6.234,6.250,6.270,6.290,6.316,6.340,6.367,6.395,6.410,[Ga].[OH2][Ga](F)(F)[OH2].[O]P(=O)(CCP(=O)([O]...


In [9]:
# | export 

def preprocess_mof_data(mof_data, n_bins=None, labels=None): 
    if n_bins is None:
        n_bins = 3
    if labels is None:
        labels = ["low", "medium", "high"]
    features = [
        "outputs.Xe-henry_coefficient-mol--kg--Pa",
        "outputs.Kr-henry_coefficient-mol--kg--Pa",
        "outputs.H2O-henry_coefficient-mol--kg--Pa",
        "outputs.H2S-henry_coefficient-mol--kg--Pa",
        "outputs.CO2-henry_coefficient-mol--kg--Pa",
        "outputs.CH4-henry_coefficient-mol--kg--Pa",
        "outputs.O2-henry_coefficient-mol--kg--Pa",
    ]

    for feature in features:
        mof_data[feature + '_log'] = np.log10(mof_data[feature] + 1e-40)

    for feature in features:

        discretize(
            mof_data, f"{feature}_log", n_bins=n_bins, labels=labels
        )

In [None]:
core_data = CoREDataset(drop_nan=False)._df


2022-09-12 11:29:39.718 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:134 - Dropped 3227 duplicate basenames. New length 2166
2022-09-12 11:29:39.908 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:140 - Dropped 62 duplicate graphs. New length 2104


In [None]:
df_core_merged = core_data.merge(df_csd, left_on="info.basename", right_on="basename")


In [None]:
len(df_core_merged)


2189

In [None]:
mofid_data = pd.read_csv(
    "/Users/kevinmaikjablonka/git/kjappelbaum/gpt3forchem/legacy/src/gpt3forchem/mofs/data/data.csv"
)


In [None]:
mofid_data["clean_mofid"] = mofid_data["mofid.mofid"].apply(
    lambda x: x.split(";")[0].replace("MOFid-v1.", "")
)


In [None]:
df_core_merged = df_core_merged.merge(
    mofid_data, left_on="info.basename", right_on="clean_name"
)


  df_core_merged= df_core_merged.merge(mofid_data, left_on='info.basename', right_on='clean_name')


In [None]:
keep = [True if not "no_mof" in f else False for f in df_core_merged["clean_mofid"]]


In [None]:
df_core_merged[keep].to_csv("../data/core_mof.csv", index=False)


In [None]:
# | export


def get_core_mof_data(datadir="../data"):  # path to folder with data files
    return HashableDataFrame(pd.read_csv(os.path.join(datadir, "core_mof.csv")))


In [None]:
get_core_mof_data()


Unnamed: 0,features.phstats_C-H-N-O_dim1_birth_min,features.phstats_C-H-N-O_dim1_birth_max,features.phstats_C-H-N-O_dim1_birth_mean,features.phstats_C-H-N-O_dim1_birth_std,features.phstats_C-H-N-O_dim1_death_min,features.phstats_C-H-N-O_dim1_death_max,features.phstats_C-H-N-O_dim1_death_mean,features.phstats_C-H-N-O_dim1_death_std,features.phstats_C-H-N-O_dim1_persistence_min,features.phstats_C-H-N-O_dim1_persistence_max,...,color_y.1,habit_y.1,mp_y.1,name_y.1,formula_y.1,density_y.1,r_factor_y.1,title_y.1,logKH_CO2_cat,clean_mofid
0,0.7036,3.656,1.670,0.3909,1.128,3.682,1.7590,0.3848,6.600000e-07,1.1180,...,purple,Block,,XULDOZ,"(C42 H46 Co5 O28)n,2(H2 O1)",1.478,8.69,,4,[Co][O]([Co][O]([Co])[Co])[Co].[O-]C(=O)c1cc(O...
1,0.6940,3.008,1.580,0.3232,1.144,3.027,1.7100,0.2808,2.130000e-05,1.3450,...,purple,block,,YOKKIU,"(C40 H32 Co4 N6 O14)n,n(C1 H4 O1),n(H2 O1)",1.586,7.47,,2,C1=CC2=NN=N[C]2C=C1.[CH]1C=CC2=NN=NC2=C1.[Co]....
2,1.4440,4.164,1.772,0.5260,1.495,4.168,1.8980,0.5312,3.900000e-06,0.6470,...,colorless,polyhedra,,IQEWOQ,"(C12 H12 Mn6 O24)n,2n(C4 H8 O2)",1.910,5.07,,0,[Mn].[O-]C=O met.cat0
3,0.6914,6.550,1.762,0.6724,1.317,6.590,1.9350,0.7886,2.900000e-06,4.5940,...,colorless,block,,VAGBUD,"(C36 H28 Cu4 Gd1 I4 N3 O8)n,2.5n(C4 H8 O2)",1.839,3.92,,1,I[Cu]12[Cu]3([Cu]1([Cu]23I)I)I.[Gd][Gd].[O-]C(...
4,0.6960,3.926,1.717,0.6420,1.316,3.947,1.8550,0.6400,6.000000e-08,2.4300,...,colorless,block,,XADDAJ,"(C132 H116 N4 O68 Tb8)n,10n(C3 H7 N1 O1),6n(H2...",1.762,4.17,,4,[O-]C(=O)c1ccc(cc1)c1cc(cc(c1)C(=O)[O-])C(=O)[...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1929,0.6963,7.152,1.713,0.5130,1.229,7.160,1.8090,0.4932,3.190000e-05,2.2730,...,blue,octahedron,,HUYKUH,"(C58 H62 N6 Ni3 O22)n,n(C3 H8 O3)",0.831,8.34,,0,[Ni].[O-]C(=O)c1cc(cc(c1)C(=O)[O-])C(=O)[O-] s...
1930,0.6953,3.959,1.801,0.5815,1.244,3.963,1.8870,0.5470,2.000000e-07,0.6943,...,colorless,plate,,MAKXAZ,"(C40 H28 N2 O8 Zn2)n,2n(C3 H7 N1 O1)",1.507,5.30,,2,[O-]C(=O)c1ccc(cc1)c1ccc(cc1)C(=O)[O-].[Zn].n1...
1931,1.2200,2.380,1.540,0.2683,1.252,2.621,1.6455,0.3298,1.600000e-06,1.2560,...,blue,polyhedra,,WEVRUK,"(C24 H36 Cu3 O36 Pr2)n,12n(H2 O1)",1.809,3.05,,0,[Cu].[O-]C(=O)COCC(=O)[O-].[Pr] UNKNOWN.cat0
1932,1.2510,7.777,2.246,1.9270,1.260,7.780,2.4240,1.9800,8.637000e-05,3.0700,...,colorless,,,DOYBEA,"(C8 H6 Al2 O10)n,8n(H2 O1)",1.543,4.80,,3,[Al].[O-]C(=O)C=CC(=O)[O-].[OH] rna.cat0


### Gas data


In [5]:
# | export
gas_features = pd.DataFrame(
    [
        {
            "name": "carbon_dioxide",
            "formula": "CO2",
            "critical_temperature": 304.19,
            "critical_pressure": 7382000,
            "accentric_factor": 0.228,
            "radius": 1.525,
            "polar": False,
            "related_column": "outputs.CO2-henry_coefficient-mol--kg--Pa_log_cat"
        },
        {
            "name": "xenon",
            "formula": "Xe",
            "critical_temperature": 289.74,
            "critical_pressure": 5840000,
            "accentric_factor": 0,
            "radius": 1.985,
            "polar": False,
            "related_column": "outputs.Xe-henry_coefficient-mol--kg--Pa_log_cat"
        },
        {
            "name": "krypton",
            "formula": "Kr",
            "critical_temperature": 209.35,
            "critical_pressure": 5502000,
            "accentric_factor": 0,
            "radius": 1.83,
            "polar": False,
            "related_column": "outputs.Kr-henry_coefficient-mol--kg--Pa_log_cat"
        },
        {
            "name": "hydrogen disulfide",
            "formula": "H2S",
            "critical_temperature": 373.53,
            "critical_pressure": 8963000,
            "accentric_factor": 0.0942,
            "radius": 1.74,
            "polar": True,
            "related_column": "outputs.H2S-henry_coefficient-mol--kg--Pa_log_cat"
        },
        {
            "name": "water",
            "formula": "H2O",
            "critical_temperature": 647.16,
            "critical_pressure": 22055000,
            "accentric_factor": 0.3449,
            "radius": 1.58,
            "polar": True,
            "related_column": "outputs.H2O-henry_coefficient-mol--kg--Pa_log_cat"
        },
        {
            "name": "methane",
            "formula": "CH4",
            "critical_temperature": 190.56,
            "critical_pressure": 4599000,
            "accentric_factor": 0.012,
            "radius": 1.865,
            "polar": False,
            "related_column": "outputs.CH4-henry_coefficient-mol--kg--Pa_log_cat"
        },
        {
            "name": "oxygen",
            "formula": "O2",
            "critical_temperature": 154.58,
            "critical_pressure": 5043000,
            "accentric_factor": 0.0222,
            "radius": 1.51,
            "polar": False,
            "related_column": "outputs.O2-henry_coefficient-mol--kg--Pa_log_cat"
        },
        {
            "name": "nitrogen",
            "formula": "N2",
            "critical_temperature": 126.20,
            "critical_pressure": 3460000,
            "accentric_factor": 0.0377,
            "radius": 1.655,
            "polar": False,
            "related_column": "outputs.N2-henry_coefficient-mol--kg--Pa_log_cat"
        },
    ]
)


In [6]:
gas_features

Unnamed: 0,name,formula,critical_temperature,critical_pressure,accentric_factor,radius,polar,related_column
0,carbon_dioxide,CO2,304.19,7382000,0.228,1.525,False,outputs.CO2-henry_coefficient-mol--kg--Pa_log_cat
1,xenon,Xe,289.74,5840000,0.0,1.985,False,outputs.Xe-henry_coefficient-mol--kg--Pa_log_cat
2,krypton,Kr,209.35,5502000,0.0,1.83,False,outputs.Kr-henry_coefficient-mol--kg--Pa_log_cat
3,hydrogen disulfide,H2S,373.53,8963000,0.0942,1.74,True,outputs.H2S-henry_coefficient-mol--kg--Pa_log_cat
4,water,H2O,647.16,22055000,0.3449,1.58,True,outputs.H2O-henry_coefficient-mol--kg--Pa_log_cat
5,methane,CH4,190.56,4599000,0.012,1.865,False,outputs.CH4-henry_coefficient-mol--kg--Pa_log_cat
6,oxygen,O2,154.58,5043000,0.0222,1.51,False,outputs.O2-henry_coefficient-mol--kg--Pa_log_cat
7,nitrogen,N2,126.2,3460000,0.0377,1.655,False,outputs.N2-henry_coefficient-mol--kg--Pa_log_cat


### OPV


In [None]:
from collections import Counter


In [None]:
df = pd.read_csv(os.path.join("../data", "opv.csv"))


  df = pd.read_csv(os.path.join("../data", "opv.csv"))


In [None]:
Counter(df["SMILES"]).most_common()


[('CC1=CC(CCCCCCCCCCCCCC)=C(C2=NC(SC(C3=C(CCCCCCCCCCCCCC)C=C(C4=CC(CC(CCCCCC)CCCCCCCC)=C(C5=NC(SC(C6=C(CC(CCCCCCCC)CCCCCC)C=C(C)S6)=N7)=C7S5)S4)S3)=N8)=C8S2)S1',
  6),
 ('O=C(OCC(CC)CCCC)C1=C(F)C2=C(C)SC(C3=CC4=C(C(C5=CC=C(CC(CC)CCCC)S5)=C(C=C(C)S6)C6=C4C7=CC=C(CC(CC)CCCC)S7)S3)=C2S1',
  4),
 ('CC1=CC2=C(C(CCC(CCCCCC)CCCC)=C(C=C(C3=CC=C(C4=C(F)C(F)=C(C5=CC=C(C)S5)C6=NN(CCC(CCCCCC)CCCC)N=C64)S3)S7)C7=C2CCC(CCCCCC)CCCC)S1',
  4),
 ('CC1=C(CC(CCCCCCCC)CCCCCCCCCC)C=C(C2=C(F)C(F)=C(C3=CC(CC(CCCCCCCCCC)CCCCCCCC)=C(C4=CC=C(C5=CC=C(C)S5)S4)S3)C6=NSN=C62)S1',
  3),
 ('O=C1N(CCCCCCCC)C(C2=C(C)SC(C(S3)=CC4=C3C(OC[C@@H](CC)CCCC)=C5C(SC(C)=C5)=C4OCC(CC)CCCC)=C21)=O',
  3),
 ('CC1=CC2=C(C(C=C(C(C3=CC=C(CCCCCC)C=C3)(C4=CC=C(CCCCCC)C=C4)C5=C6[Se]C(C7=C(F)C(F)=C(C)C8=NSN=C87)=C5)C6=C9)=C9C2(C%10=CC=C(CCCCCC)C=C%10)C%11=CC=C(CCCCCC)C=C%11)[Se]1',
  3),
 ('CC1=CC(CCCCCCCCCCCC)=C(C2=CC=C(C3=CC=C(C4=C(CCCCCCCCCCCC)C=C(C)S4)S3)S2)S1',
  2),
 ('CC1=C(C(F)=C(C(OCC(CC)CCCC)=O)S2)C2=C(C3=CC4=C(OCC(CCCC)CC)C(SC(

In [None]:
df[
    df["SMILES"]
    == "CC1=CC(CCCCCCCCCCCCCC)=C(C2=NC(SC(C3=C(CCCCCCCCCCCCCC)C=C(C4=CC(CC(CCCCCC)CCCCCCCC)=C(C5=NC(SC(C6=C(CC(CCCCCCCC)CCCCCC)C=C(C)S6)=N7)=C7S5)S4)S3)=N8)=C8S2)S1"
]


Unnamed: 0,ID,Nickname,Ref.No,PCE_max(%),PCE_ave(%),Voc(V),Jsc(mA\tcm^2),FF,Mw(kgmol^-1),Mn(kgmol^-1),...,ecpf_1055,ecpf_1056,ecpf_1057,ecpf_1058,ecpf_1059,ecpf_1060,ecpf_1061,ecpf_1062,ecpf_1063,pce_bin
183,186,PTzBT-14HD,S83,5.4,5.2,0.82,9.9,0.66,290.0,33.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
189,192,PTzBT-14HD,S84,3.1,3.0,0.92,6.1,0.56,18.0,13.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
190,193,PTzBT-14HD,S84,4.6,4.0,0.88,7.8,0.67,60.0,20.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
191,194,PTzBT-14HD,S84,5.7,5.3,0.84,10.6,0.64,300.0,33.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
192,195,PTzBT-14HD,S84,5.3,4.8,0.85,9.2,0.68,1450.0,73.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
193,196,PTzBT-14HD,S84,3.2,3.2,0.92,6.1,0.56,18.0,13.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


Again, there are duplicated entries we need to take care of. For fun, let's just check what the impact would be


In [None]:
df_mean = df.groupby(["SMILES", "selfies"]).mean().reset_index()


In [None]:
features = [f for f in df_mean.columns if "ecpf" in f]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np


In [None]:
mean_r2_score_no_duplicates = []
mean_r2_score_all = []

for i in range(20):
    rf = RandomForestRegressor(n_estimators=500)
    X_train, X_test, y_train, y_test = train_test_split(
        df_mean[features], df_mean["PCE_ave(%)"], test_size=0.2
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    train_len = len(X_train)
    mean_r2_score_no_duplicates.append(r2_score(y_test, y_pred))
    rf = RandomForestRegressor(n_estimators=100, max_depth=10)
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df["PCE_ave(%)"], test_size=0.2
    )
    X_train, y_train = X_train[:train_len], y_train[:train_len]
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mean_r2_score_all.append(r2_score(y_test, y_pred))


In [None]:
print(
    f"No duplicates: {np.mean(mean_r2_score_no_duplicates)} +- {np.std(mean_r2_score_no_duplicates)}"
)
print(f"All: {np.mean(mean_r2_score_all)} +- {np.std(mean_r2_score_all)}")


No duplicates: 0.40703354329729147 +- 0.05160336407361533
All: 0.3862491109445428 +- 0.053969712309947945


In [None]:
# | export


def get_opv_data(datadir="../data"):  # path to folder with data files
    """Load the OPV dataset."""
    df = pd.read_csv(os.path.join(datadir, "opv.csv"))
    df = df.groupby(["SMILES", "selfies"]).mean().reset_index()
    return HashableDataFrame(df)


In [None]:
get_opv_data()


  df = pd.read_csv(os.path.join(datadir, "opv.csv"))


Unnamed: 0,SMILES,selfies,ID,PCE_max(%),PCE_ave(%),Voc(V),Jsc(mA\tcm^2),FF,Mw(kgmol^-1),Monomer(gmol^-1),...,ecpf_1055,ecpf_1056,ecpf_1057,ecpf_1058,ecpf_1059,ecpf_1060,ecpf_1061,ecpf_1062,ecpf_1063,pce_bin
0,C/C=C/C1=CC(N(C2=CC=C(OCCCCCCCC)C=C2)C3=C4C=CC...,[C][/C][=C][/C][=C][C][Branch2][=Branch1][#C][...,28.0,0.30,0.30,0.55,0.859,0.484,7.8,840.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C/C=C/C1=CC(N(C2=CC=C(OCCCCCCCC)C=C2)C3=C4C=CC...,[C][/C][=C][/C][=C][C][Branch2][Branch2][Branc...,29.0,0.20,0.20,0.50,0.761,0.432,8.2,1003.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C/C=C/C1=CC(N(C2=CC=C(OCCCCCCCC)C=C2)C3=C4C=CC...,[C][/C][=C][/C][=C][C][Branch2][Ring2][=C][N][...,27.0,0.40,0.40,0.70,1.076,0.445,5.7,533.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C/C=C/C1=CC(N(C2=CC=C(OCCCCCCCC)C=C2)C3=C4C=CC...,[C][/C][=C][/C][=C][C][Branch2][Branch2][=C][N...,30.0,0.80,0.80,0.80,1.558,0.546,4.3,1061.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CC(C1=C2N=C(C3=CC=C(OCCCCCCCC)C=C3)C(C4=CC=C(O...,[C][C][Branch2][Branch1][=Branch1][C][=C][N][=...,603.0,1.90,1.90,0.77,5.000,0.503,660.9,1011.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1093,O=C1N(CCCCCCCC)C(C2=CC=C(C)S2)=C3C1=C(C4=CC=C(...,[O][=C][N][Branch1][=Branch2][C][C][C][C][C][C...,51.0,3.20,3.20,0.80,8.600,0.470,63.0,956.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1094,O=C1N(CCCCCCCC)C(C2=CC=C(C)S2)=C3C1=C(C4=CC=C(...,[O][=C][N][Branch1][=Branch2][C][C][C][C][C][C...,57.0,0.88,0.88,0.74,2.510,0.470,27.1,941.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1095,O=C1N(CCCCCCCCCC)C(C2=CC=C(C)S2)=C3C1=C(C4=CC=...,[O][=C][N][Branch1][O][C][C][C][C][C][C][C][C]...,52.0,3.80,3.80,0.77,9.100,0.550,56.7,1068.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1096,O=C1N(C[C@@H](CCCC)CC)C(C2=CC=C(C)S2)=C3C1=C(C...,[O][=C][N][Branch1][O][C][C@@H1][Branch1][Bran...,58.0,1.12,1.12,0.44,4.470,0.570,14.5,968.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
