#### Import required libs

In [1]:
import sys
sys.path.append('/home/jupyter/app')

import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error

from src.io_utils import load_dataframe

#### Define constants

In [2]:
# Name of the regression target column in the train/validation frames.
TARGET = "homolumogap"
# Cloud Storage (gs://) folder that is passed to the loader together with a file name.
GS_DIR = "gs://pcqm4mv2/data/raw"

#### Load train set

In [3]:
# Read the training split through the project helper from src.io_utils.
# NOTE(review): presumably the helper joins the file name with GS_DIR and sets
# `idx` as the index (the recorded output shows an `idx` index) — confirm in src/io_utils.
df_train = load_dataframe("train.csv", GS_DIR)

# Report the frame size first, then preview the first rows (SMILES string + target).
print("df_train shape:", df_train.shape)
df_train.head()

df_train shape: (3378606, 2)


  mask |= (ar1 == a)


Unnamed: 0_level_0,smiles,homolumogap
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0,O=C1[N]c2ccncc2[CH][C@@H]1c1ccc(cc1)C,3.047675
1,COc1cc(OC)ccc1/C=C/N(C(=O)C)C,4.410966
2,C=CCN(C(=O)C)/C=C/c1ccccc1C,4.639541
3,C=CCN(C(=O)C)/C=C/c1ccccc1F,4.4926
4,C=CCN(C(=O)C)/C=C/c1ccccc1Cl,4.61233


In [4]:
# Column dtypes and memory footprint of the training frame.
# NOTE(review): unlike the validation cell, the recorded output here has no
# "Non-Null Count" column, so it does not confirm that the target is free of
# missing values — consider an explicit null check on the target column.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3378606 entries, 0 to 3378605
Data columns (total 2 columns):
 #   Column       Dtype  
---  ------       -----  
 0   smiles       object 
 1   homolumogap  float64
dtypes: float64(1), object(1)
memory usage: 77.3+ MB


#### Load validation set

In [5]:
# Read the validation split with the same helper and location as the training split.
# Per the recorded outputs, its index starts at 3378606, right after the last
# training index (3378605), so the two index ranges are disjoint.
df_val = load_dataframe("valid.csv", GS_DIR)

# Report the frame size first, then preview the first rows.
print("df_val shape:", df_val.shape)
df_val.head()

df_val shape: (73545, 2)


Unnamed: 0_level_0,smiles,homolumogap
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
3378606,COc1ccccc1N[C@H](/C(=N\C(=N)O)/O)C,4.58784
3378607,COc1ccccc1N[C@H](/C(=N\C(=N)O)/O)C,4.97152
3378613,CC(/N=C(\N/N=C/1\C[C@H]2[C@@H]1CC=C2)/S)C,5.4967
3378614,CC(/N=C(\N/N=C/1\C[C@H]2[C@@H]1CC=C2)/S)C,5.485815
3378624,C/N=C(\c1cc2c(s1)ccc(c2)F)/O,4.748387


In [6]:
# Column dtypes, per-column non-null counts and memory footprint of the validation frame.
df_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73545 entries, 3378606 to 3746612
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   smiles       73545 non-null  object 
 1   homolumogap  73545 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.7+ MB


#### Create baseline model

In [7]:
class BaselineModel:
    """Constant-prediction baseline for a regression target.

    Training stores the mean of the target column; prediction returns that
    single value once for every row of the input frame, ignoring all features.

    Attributes:
        target: Name of the column whose mean is learned.
        mean: Mean of the target column seen in ``train``; ``None`` until
            ``train`` has been called.
    """

    def __init__(self, target: str = TARGET):
        """Create an untrained baseline.

        Args:
            target: Name of the target column. Defaults to the notebook-level
                ``TARGET`` constant.
        """
        # Honour the caller-supplied column name rather than always falling
        # back to the module-level constant.
        self.target = target
        self.mean = None

    def train(self, dataframe: pd.DataFrame) -> None:
        """Learn the mean of the target column.

        Args:
            dataframe: Frame that contains the ``self.target`` column.

        Raises:
            KeyError: If ``self.target`` is not a column of ``dataframe``.
        """
        self.mean = dataframe[self.target].mean()

    def predict(self, dataframe: pd.DataFrame) -> np.ndarray:
        """Return the learned mean repeated once per row of ``dataframe``.

        Args:
            dataframe: Frame to predict for; only its length is used.

        Returns:
            1-D array of length ``len(dataframe)`` filled with the training mean.

        Raises:
            RuntimeError: If called before ``train``.
        """
        # Without this guard np.repeat(None, n) would silently yield an
        # object array of None values.
        if self.mean is None:
            raise RuntimeError("BaselineModel.predict() called before train()")
        return np.repeat(self.mean, len(dataframe))

In [8]:
# Baseline bound to the notebook's target column (same value as the constructor default).
baseline_model = BaselineModel(target=TARGET)

#### Train baseline model

In [9]:
# "Training" only stores the mean of the target column of the training frame.
baseline_model.train(dataframe=df_train)

#### Get predictions for validation set

In [10]:
# Constant prediction: one copy of the stored training mean per validation row;
# only the length of df_val is used.
y_val_pred = baseline_model.predict(dataframe=df_val)
y_val_pred

array([5.68945901, 5.68945901, 5.68945901, ..., 5.68945901, 5.68945901,
       5.68945901])

#### Evaluate MAE metric for baseline model

In [11]:
# Ground-truth target values of the validation rows as a NumPy array,
# in the same row order the predictions were produced for.
y_val = df_val.loc[:, TARGET].to_numpy()
y_val

array([4.58783952, 4.97152005, 5.49669978, ..., 4.95519322, 8.17974235,
       3.3143467 ])

In [12]:
# Mean absolute error of the constant prediction against the true validation targets.
mae_result = mean_absolute_error(y_true=y_val, y_pred=y_val_pred)

print(
    "MAE for validation set with Baseline Model:",
    mae_result,
)

MAE for validation set with Baseline Model: 1.006081692517224
