## Summary

### Data split
- Train and validate datasets were merged - cross dataset
- Train and validate datasets were made using the cross dataset.
- It was split into 4 parts and each time the new part was used as validation dsataset.

### Results
- Test rmse: 0.582178 +/- 0.016978
- Test r2: 0.896924 +/- 0.006035

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.neural_network import MLPRegressor
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error, r2_score
from warnings import filterwarnings
from sklearn.model_selection import train_test_split
filterwarnings('ignore')

In [2]:
data_train = pd.read_csv('../../../data/3_final_data/split_data/logP_wo_parameters_train.csv', index_col=0)
data_valid = pd.read_csv('../../../data/3_final_data/split_data/logP_wo_parameters_validation.csv', index_col=0)
data_test = pd.read_csv('../../../data/3_final_data/split_data/logP_wo_parameters_test.csv', index_col=0)

In [3]:
data_cross = pd.concat([data_train, data_valid])

In [4]:
def get_morgan_count_fps(data, bits=4096, radius=2):
    X = [AllChem.GetHashedMorganFingerprint(m, radius, nBits=bits) for m in data]
    X_list = []
    for x in X:
        array = np.zeros((0,), dtype=np.int64)
        DataStructs.ConvertToNumpyArray(x, array)
        X_list.append(array)
    X = pd.DataFrame(X_list)
    return X

In [5]:
X_test_mol = [Chem.MolFromSmiles(x) for x in data_test.smiles]
X_test = get_morgan_count_fps(X_test_mol)
y_test = data_test.logP

In [6]:
rmse_values = []
r2_values = []

In [7]:
part_size = len(data_cross) // 4

In [8]:
len(data_cross)

10732

In [9]:
for i in range(4):
    print(f"Fold {i} is started")
    valid_indices = pd.Series([i * part_size <= j < (i + 1) * part_size for j in range(len(data_cross))])
    print(valid_indices)
    train_data = data_cross[~valid_indices]
    valid_data = data_cross[valid_indices]
    X_train_mol = [Chem.MolFromSmiles(smi) for smi in train_data.smiles]
    y_train = train_data.logP
    X_valid_mol = [Chem.MolFromSmiles(smi) for smi in valid_data.smiles]
    y_valid = valid_data.logP
    
    X_train = get_morgan_count_fps(X_train_mol)
    X_valid = get_morgan_count_fps(X_valid_mol)

    print("Counted fingerprints")
    regr = MLPRegressor(random_state=10, max_iter=7)
    regr.fit(X_train, y_train)
    test_rmse = mean_squared_error(y_test, regr.predict(X_test), squared=False)
    test_r2 = r2_score(y_test, regr.predict(X_test))
    rmse_values.append(test_rmse)
    r2_values.append(test_r2)


Fold 0 is started
0         True
1         True
2         True
3         True
4         True
         ...  
10727    False
10728    False
10729    False
10730    False
10731    False
Length: 10732, dtype: bool
Counted fingerprints
Fold 1 is started
0        False
1        False
2        False
3        False
4        False
         ...  
10727    False
10728    False
10729    False
10730    False
10731    False
Length: 10732, dtype: bool
Counted fingerprints
Fold 2 is started
0        False
1        False
2        False
3        False
4        False
         ...  
10727    False
10728    False
10729    False
10730    False
10731    False
Length: 10732, dtype: bool
Counted fingerprints
Fold 3 is started
0        False
1        False
2        False
3        False
4        False
         ...  
10727     True
10728     True
10729     True
10730     True
10731     True
Length: 10732, dtype: bool
Counted fingerprints


In [10]:
rmse_values = pd.Series(rmse_values)
r2_values = pd.Series(r2_values)

In [11]:
rmse_values

0    0.603630
1    0.587504
2    0.571763
3    0.565816
dtype: float64

In [12]:
rmse_values.describe()

count    4.000000
mean     0.582178
std      0.016978
min      0.565816
25%      0.570276
50%      0.579634
75%      0.591536
max      0.603630
dtype: float64

In [13]:
r2_values

0    0.889259
1    0.895097
2    0.900643
3    0.902699
dtype: float64

In [14]:
r2_values.describe()

count    4.000000
mean     0.896924
std      0.006035
min      0.889259
25%      0.893637
50%      0.897870
75%      0.901157
max      0.902699
dtype: float64