# Workflow for model training



## Libraries and read in cleaned data

Data cleaning was done by Yvonne. The following steps were taken:
- removing rows with nan in RT
- removing rows with nan in concentration
- removing calibration graphs with only 1 or 2 calibration points

The data set contains 3860 rows and no NaN values.


In [45]:
# libraries
import pandas as pd
import numpy as np
from plotnine import *

# data
# file_path = "C:/Users/loma5202/OneDrive - Kruvelab/PhD/courses/machine_learning/project/ML_calibration_graph_linearity/0_data/data_ready_addfeatures_231122.csv"
file_path = "C:/Users/yvkr1259/Documents/data_ready_addfeatures_231122.csv"

# Pre-computed normalized columns shipped with the CSV; they are removed right
# after loading so that only the raw measurements remain in the frame.
drop_columns = ['abs_residuals_norm1', 'abs_residuals_norm2','c_real_M_norm1','c_real_M_norm2','peak_area_norm1',
'peak_area_norm2','residuals_norm1','residuals_norm2','rf_error_norm1','rf_error_norm2','rf_norm1','rf_norm2']

# Read the cleaned data and drop the normalized columns in one chained step.
df_calibrations = pd.read_csv(file_path).drop(columns=drop_columns)

# Summary of the remaining columns, non-null counts and dtypes.
df_calibrations.info()

## load data to google colab
#from google.colab import files
#uploaded = files.upload()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3860 entries, 0 to 3859
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   lab            3860 non-null   object 
 1   compound       3860 non-null   object 
 2   sample_type    3860 non-null   object 
 3   RT             3860 non-null   float64
 4   sample         3860 non-null   object 
 5   peak_area      3860 non-null   float64
 6   note           3860 non-null   object 
 7   c_real_M       3860 non-null   float64
 8   rf             3860 non-null   float64
 9   rf_error       3860 non-null   float64
 10  slope          3860 non-null   float64
 11  intercept      3860 non-null   float64
 12  residuals      3860 non-null   float64
 13  abs_residuals  3860 non-null   float64
dtypes: float64(9), object(5)
memory usage: 422.3+ KB


In [None]:
#file_path = "data_ready_addfeatures_231122.csv"
#df_calibrations = pd.read_csv(file_path)
#df_calibrations.info()

## Select features and data splitting

All points of one calibration graph (one compound measured in one lab) belong together and must end up in the same split. Therefore an individual id is introduced for each lab–compound pair, and the train/test split is performed on these ids rather than on single rows.

In [48]:
# One id per lab/compound pair, built as "<lab>_<compound>".
df_calibrations['id'] = df_calibrations['lab'].str.cat(df_calibrations['compound'], sep='_')

In [49]:
import sklearn
from sklearn.model_selection import train_test_split

# The split is drawn over the unique lab/compound ids, not over single rows,
# so every row sharing an id lands in the same partition.
np.random.seed(123)
unique_ids = df_calibrations['id'].unique()
train_ids, test_ids = train_test_split(
    unique_ids,
    test_size=0.2,    # 80% of the ids for training, 20% for testing
    random_state=42,
)

# Row masks selecting the rows that belong to each set of ids.
train_mask = df_calibrations['id'].isin(train_ids)
test_mask = df_calibrations['id'].isin(test_ids)

df_train = df_calibrations[train_mask]
df_test = df_calibrations[test_mask]

# Features are every column except 'note'; the target is 'note', kept as a
# one-column DataFrame.
X_train, y_train = df_train.drop(columns='note'), df_train[['note']]
X_test, y_test = df_test.drop(columns='note'), df_test[['note']]

# One shape per line, in the order X_train, y_train, X_test, y_test.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep='\n')

# Shapes recorded from the original run:
# (3093, 14)
# (3093, 1)
# (767, 14)
# (767, 1)


(3093, 14)
(3093, 1)
(767, 14)
(767, 1)


### Normalization

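In general we decided to try out two different normalisation strategies. Both are applied per calibration graph (i.e. within each lab–compound group), separately for the training and the test set: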

**norm1**
$$
\text{norm1} = \frac{\text{x}}{\max(\text{x})}
$$

**norm2**
$$
\text{norm2} = \frac{\text{x} - \min(\text{x})}{\max(\text{x}) - \min(\text{x})}
$$

#### normalization strategy 1

In [51]:
# norm1 = x / max(x), computed within each lab/compound group.
# The same six columns are normalized in the train set first, then in the
# test set; each frame only uses its own group maxima.
norm1_columns = ['peak_area', 'c_real_M', 'rf', 'rf_error', 'residuals', 'abs_residuals']

for frame in (X_train, X_test):
    for col in norm1_columns:
        frame[f'{col}_norm1'] = (
            frame.groupby(['lab', 'compound'])[col]
            .transform(lambda x: x / x.max())
        )


#### normalization strategy 2

In [52]:
from sklearn.preprocessing import MinMaxScaler

columns_to_scale = ['peak_area', 'c_real_M', 'rf', 'rf_error', 'residuals', 'abs_residuals']

scaler = MinMaxScaler()


def _minmax_1d(values):
    """Min-max scale the values of one group and return them as a flat array.

    The shared ``scaler`` is refitted on every call, so each group is scaled
    with its own minimum and maximum.
    """
    column = values.to_numpy().reshape(-1, 1)  # scikit-learn expects 2-D input
    return scaler.fit_transform(column).ravel()


def scale_columns(group):
    """Return a copy of ``group`` with a ``<col>_norm2`` column for every
    entry of ``columns_to_scale``.

    The input frame is not modified; mutating the group that pandas hands to
    ``groupby.apply`` is not a supported pattern.
    """
    scaled = group.copy()
    for col in columns_to_scale:
        scaled[f'{col}_norm2'] = _minmax_1d(group[col])
    return scaled


def add_norm2_columns(frame):
    """Return a copy of ``frame`` with per lab/compound min-max scaled
    ``<col>_norm2`` columns added.

    ``groupby(...).transform`` is used instead of ``groupby(...).apply``:
    the result of ``transform`` always carries the original index in the
    original row order. With ``apply`` the outcome depends on the pandas
    version (pandas 2.x prepends the group keys to the index and returns the
    rows in group order), which would break the positional alignment between
    X_train / X_test and y_train / y_test and make 'lab' / 'compound' both
    index levels and columns.
    """
    result = frame.copy()
    for col in columns_to_scale:
        result[f'{col}_norm2'] = (
            frame.groupby(['lab', 'compound'])[col].transform(_minmax_1d)
        )
    return result


X_train = add_norm2_columns(X_train)
X_test = add_norm2_columns(X_test)

In [34]:
## Decide on features for modelling
# Alternative feature sets that were tried (kept for reference):
#features = ['peak_area','c_real_M']
#features = ['RT','peak_area','c_real_M']
#features = ['RT','peak_area','c_real_M', 'rf', 'rf_error']
#features = ['RT','peak_area','c_real_M', 'rf', 'rf_error', 'slope', 'intercept', 'residuals', 'abs_residuals']
#features = ['RT','peak_area_norm2','c_real_M_norm2', 'rf_norm2', 'rf_error_norm2', 'slope', 'intercept', 'residuals_norm2', 'abs_residuals_norm2']

# Selected set (best features): norm1-normalized values plus RT, slope and intercept.
features = [
    'RT',
    'peak_area_norm1',
    'c_real_M_norm1',
    'rf_norm1',
    'rf_error_norm1',
    'slope',
    'intercept',
    'residuals_norm1',
    'abs_residuals_norm1',
]

## Modeling 