## Data splitting into Train/Validation/Test datasets

### Loading our clean data

In [5]:
import pandas as pd 
import numpy as np
# Read the cleaned heat-capacity table from disk; the bare `data` expression on
# the last line makes the notebook render the loaded frame as the cell output.
path = r'C:\Users\HP\Documents\Cleaned Cp\cp_data_cleaned.csv'
data = pd.read_csv(filepath_or_buffer=path)
data

Unnamed: 0,Formula,Temperature,Heat Capacity
0,B2O3,1400.0,134.3060
1,B2O3,1300.0,131.2940
2,B2O3,1200.0,128.0720
3,B2O3,1100.0,124.5160
4,B2O3,1000.0,120.6250
...,...,...,...
4568,Zr1,450.0,26.2460
4569,Zr1,400.0,25.9350
4570,Zr1,350.0,25.6060
4571,Zr1,300.0,89.4475


### Separating our variables into Inputs/Target

In [9]:
# Inputs are the 'Formula' and 'Temperature' columns; the target to predict is
# the 'Heat Capacity' column.
input_columns = ['Formula', 'Temperature']
target_column = 'Heat Capacity'

X = data.loc[:, input_columns]
Y = data.loc[:, target_column]

### Splitting Data

In [10]:
from sklearn.model_selection import train_test_split

# Naive row-wise split: 20% of the rows go to the test set, with a fixed seed
# so the split is repeatable.
# Caveat: the dataset contains only 244 distinct formulae, so we must ensure
# that each formula ends up in exactly one dataset rather than in several.
# A plain row-wise split like this one does not guarantee that.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) shape of the training inputs, then the test inputs.
for split_inputs in (X_train, X_test):
    print(split_inputs.shape)


(3658, 2)
(915, 2)


In [12]:
# Pull out the formula column of each split once; it is inspected several times below.
train_formula_col = X_train['Formula']
test_formula_col = X_test['Formula']

# Contrast the number of training rows with the number of distinct formulae they cover.
num_rows = X_train.shape[0]
print(f'There are in total {num_rows} rows in the X_train DataFrame.')

num_unique_formulae = len(train_formula_col.unique())
print(f'But there are only {num_unique_formulae} unique Formulae!\n')

# Rows per formula in each split; a formula listed in both tables is shared
# between the training and test data.
print('Unique Formulae and their number of occurances in the X_train DataFrame:')
print(train_formula_col.value_counts(), '\n')
print('Unique Formulae and their number of occurances in the X_test DataFrame:')
print(test_formula_col.value_counts())

There are in total 3658 rows in the X_train DataFrame.
But there are only 244 unique Formulae!

Unique Formulae and their number of occurances in the X_train DataFrame:
W1         38
N1Ti1      35
B1Ti1      33
N1Zr1      33
O2Zr1      29
           ..
I4Mo1       3
Be1H2O2     3
I4Ti1       3
N2O4        2
I4Si1       2
Name: Formula, Length: 244, dtype: int64 

Unique Formulae and their number of occurances in the X_test DataFrame:
W1          11
Be2O4Si1    10
N1Ti1        9
K2O3Si1      9
O3Ti2        9
            ..
Li2O4S1      1
O1Pb1        1
Cl2Fe1       1
H2O2Sr1      1
N5P3         1
Name: Formula, Length: 234, dtype: int64


We see that some formulae appear in both the training and testing datasets. This is a problem: because instances of the same chemical compound appear in both, the model can cheat. In essence, it can just memorize the training data and, during testing, look up the nearby values it has already seen for that compound!
### To fix this, we first group the data by chemical formula and then split by formula, so that each formula lands in only one dataset.

In [16]:
# Collect every distinct formula found in the input table and report how many there are.
unique_f = X['Formula'].unique()
num_unique = len(unique_f)
print(f'There exist {num_unique} unique formulae in the dataset: \n\n{unique_f} ')

There exist 244 unique formulae in the dataset: 

['B2O3' 'Be1I2' 'Be1F3Li1' 'Al1Cl4K1' 'Al2Be1O4' 'B2H4O4' 'B2Mg1' 'Be1F2'
 'B1H4Na1' 'Br2Ca1' 'Al1N1' 'Al1Cl6Na3' 'Ba1H2O2' 'Al1Br3' 'Br3Zr1'
 'Br2Ti1' 'B1Ti1' 'Be2O4Si1' 'Br2Pb1' 'Al1' 'Br2Hg2' 'B1H3O3' 'Br3Ti1'
 'C1Cu1N1' 'B1' 'Al1F6Na3' 'Ca1H2O2' 'B2Be3O6' 'Al1Cl4Na1' 'Al1Cl6K3'
 'C0.98Nb1' 'Br2Hg1' 'Al1Cl1O1' 'Cl1H4N1O4' 'Be1F4Li2' 'C1Mg1O3' 'Br1H4N1'
 'Ca1I2' 'Al1F6Li3' 'Br4Mo1' 'Ba1' 'Br4Ti1' 'Ba1Br2' 'Be1O4S1' 'Ba1F2'
 'Ba1I2' 'Cl2Fe1' 'C1K1N1' 'Be1H2O2' 'Cs1' 'Al1H4Li1' 'C1Be2' 'Cr1'
 'Cs2O4S1' 'Cl1Cu1' 'Cu1F2' 'Al2O3' 'B1N1' 'Co1O4S1' 'Cu1O1' 'Br1Na1'
 'Cr2O3' 'Cs1F1' 'Cr2N1' 'Cl1Li1' 'Fe0.877S1' 'Cl1Na1' 'F2Hg1' 'Fe1H2O2'
 'Cs1H1O1' 'Br3Mo1' 'Br2Sr1' 'Cl2Hg2' 'Fe1O1' 'Co1' 'Cl1Cs1' 'Cu1H2O2'
 'Al1Li1O2' 'Co1F2' 'Br2Fe1' 'Fe1I2' 'Ga1' 'Cl1Li1O4' 'Cl2Cu1' 'Fe0.947O1'
 'Be1Cl2' 'Cl1K1' 'F1Na1' 'H3O4P1' 'Fe3O4' 'H1Na1O1' 'Fe2O12S3' 'H1Na1'
 'Cl1Na1O4' 'B1F4K1' 'Cu1O4S1' 'H1Li1' 'F2H1K1' 'B1H4Li1' 'Hg1O1' 'Be3N2'
 'Fe1' 'I2Mo1' 'Cu

In [17]:
# Split the unique formulae (not the individual rows) into validation, test and
# training groups, so that every formula belongs to exactly one group.

# Set a random seed to ensure reproducibility across runs
np.random.seed(seed=42)

# Working pool of formulae; chosen formulae are removed from it step by step
all_formulae = unique_f.copy()

# Define the proportional size of the dataset split
val_size = 0.20
test_size = 0.10
train_size = 1 - val_size - test_size

# Calculate the number of formulae in each dataset split.
# Note: the training group receives whatever remains after the validation and
# test draws, so num_train_samples is only the nominal (rounded) target size.
num_val_samples = int(round(val_size * len(unique_f)))
num_test_samples = int(round(test_size * len(unique_f)))
num_train_samples = int(round(train_size * len(unique_f)))

# Randomly choose the formulae for the validation dataset, and remove those from
# the pool. A set gives constant-time membership checks while the list
# comprehension keeps the original order of the remaining formulae.
val_formulae = np.random.choice(all_formulae, size=num_val_samples, replace=False)
val_lookup = set(val_formulae)
all_formulae = [f for f in all_formulae if f not in val_lookup]

# Randomly choose the formulae for the test dataset, and remove those from the pool
test_formulae = np.random.choice(all_formulae, size=num_test_samples, replace=False)
test_lookup = set(test_formulae)
all_formulae = [f for f in all_formulae if f not in test_lookup]

# The remaining formulae will be used for the training dataset
train_formulae = all_formulae.copy()

# Sanity checks: the three groups must not overlap and must together account
# for every unique formula.
train_lookup = set(train_formulae)
assert val_lookup.isdisjoint(test_lookup), 'validation and test formulae overlap'
assert train_lookup.isdisjoint(val_lookup), 'training and validation formulae overlap'
assert train_lookup.isdisjoint(test_lookup), 'training and test formulae overlap'
assert len(train_formulae) + len(val_formulae) + len(test_formulae) == len(unique_f), \
    'the split does not cover every unique formula exactly once'

print('Number of training formulae:', len(train_formulae))
print('Number of validation formulae:', len(val_formulae))
print('Number of testing formulae:', len(test_formulae))

Number of training formulae: 171
Number of validation formulae: 49
Number of testing formulae: 24


In [18]:
# Route every row of the original dataset to the split whose formula list
# contains that row's formula.
formula_column = data['Formula']

data_train = data.loc[formula_column.isin(train_formulae)]
data_val = data.loc[formula_column.isin(val_formulae)]
data_test = data.loc[formula_column.isin(test_formulae)]

# Report the (rows, columns) shape of each resulting dataset.
print(f'train dataset shape: {data_train.shape}')
print(f'validation dataset shape: {data_val.shape}')
print(f'test dataset shape: {data_test.shape}\n')

train dataset shape: (3221, 3)
validation dataset shape: (982, 3)
test dataset shape: (370, 3)



### Save split datasets to csv


In [19]:
# Write each split to its own CSV file, leaving the DataFrame index out of the output.
for split_frame, csv_path in (
    (data_train, r'C:\Users\HP\Documents\Cleaned Cp\cp_data_train.csv'),
    (data_val, r'C:\Users\HP\Documents\Cleaned Cp\cp_data_val.csv'),
    (data_test, r'C:\Users\HP\Documents\Cleaned Cp\cp_data_test.csv'),
):
    split_frame.to_csv(csv_path, index=False)