## Preprocessing Data

In the following steps, the raw data is preprocessed so that it can be used by various regression models. To avoid biasing the later model comparison, only model-independent preprocessing steps are applied here.

### Setup

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LinearRegression       
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

### Import Data

In [2]:
# Load the raw data set (raw_data_all_features.csv).
# The path is given relative to the notebook, following the same '../data/...'
# convention as the export cell, so the notebook is not tied to one user's
# absolute Windows path and can be re-run from any checkout of the repository.
df = pd.read_csv('../data/raw/raw_data_all_features.csv')
df.head()  # Display the first 5 rows of the DataFrame

Unnamed: 0,product,isoelectric_point,protein_format,molecular_weight_da,formulation_title,product_conc_mg_ml,tm_c,ph,kcl_conc,fructose_conc,succinate_conc,l-lysine_conc,mannitol_conc,ps50_conc,ps80_conc,citrate_conc
0,MAB5410990,6.162,IGG3,391603.56,F01,172.8,139.411938,4.0,67.5,0.0,15.0,67.5,0.0,0.4,0.0,0.0
1,MAB5410990,6.162,IGG3,391603.56,F02,172.8,138.958645,4.0,0.0,0.0,15.0,135.0,0.0,0.4,0.0,0.0
2,MAB5410990,6.162,IGG3,391603.56,F03,172.8,143.647805,4.0,0.0,0.0,15.0,67.5,135.0,0.4,0.0,0.0
3,MAB5410990,6.162,IGG3,391603.56,F04,172.8,140.801897,4.0,135.0,0.0,15.0,0.0,0.0,0.4,0.0,0.0
4,MAB5410990,6.162,IGG3,391603.56,F05,172.8,151.632333,4.0,0.0,0.0,15.0,0.0,270.0,0.4,0.0,0.0


In [3]:
# Summarise the raw DataFrame: column names, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1210 entries, 0 to 1209
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   product              1210 non-null   object 
 1   isoelectric_point    1151 non-null   float64
 2   protein_format       1210 non-null   object 
 3   molecular_weight_da  1210 non-null   float64
 4   formulation_title    1210 non-null   object 
 5   product_conc_mg_ml   1210 non-null   float64
 6   tm_c                 1210 non-null   float64
 7   ph                   1210 non-null   float64
 8   kcl_conc             1210 non-null   float64
 9   fructose_conc        1210 non-null   float64
 10  succinate_conc       1210 non-null   float64
 11  l-lysine_conc        1210 non-null   float64
 12  mannitol_conc        1210 non-null   float64
 13  ps50_conc            1210 non-null   float64
 14  ps80_conc            1210 non-null   float64
 15  citrate_conc         1210 non-null   f

### Data Corrections

In [4]:
# Keep only the variables of interest for the analysis: the pure identifier
# columns carry no predictive information and are removed.
id_columns = ['product', 'formulation_title']
df = df.drop(columns=id_columns)

In [5]:
# Store the protein format as a categorical column so pandas can handle it
# more efficiently than plain object strings.
df['protein_format'] = df['protein_format'].astype('category')

In [6]:
# Normalise the column labels in one pass: spaces are replaced by underscores,
# then every name is converted to lower case.
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [7]:
# Re-check column names, dtypes and non-null counts after the corrections
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1210 entries, 0 to 1209
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   isoelectric_point    1151 non-null   float64 
 1   protein_format       1210 non-null   category
 2   molecular_weight_da  1210 non-null   float64 
 3   product_conc_mg_ml   1210 non-null   float64 
 4   tm_c                 1210 non-null   float64 
 5   ph                   1210 non-null   float64 
 6   kcl_conc             1210 non-null   float64 
 7   fructose_conc        1210 non-null   float64 
 8   succinate_conc       1210 non-null   float64 
 9   l-lysine_conc        1210 non-null   float64 
 10  mannitol_conc        1210 non-null   float64 
 11  ps50_conc            1210 non-null   float64 
 12  ps80_conc            1210 non-null   float64 
 13  citrate_conc         1210 non-null   float64 
dtypes: category(1), float64(13)
memory usage: 124.4 KB


### Handling Missing Values

In [8]:
# Count the missing entries per column ...
missing_values = df.isna().sum()
# ... and show the per-column totals as the cell output
missing_values

isoelectric_point      59
protein_format          0
molecular_weight_da     0
product_conc_mg_ml      0
tm_c                    0
ph                      0
kcl_conc                0
fructose_conc           0
succinate_conc          0
l-lysine_conc           0
mannitol_conc           0
ps50_conc               0
ps80_conc               0
citrate_conc            0
dtype: int64

In [9]:
# Mean imputation for the gaps in column 'isoelectric_point'

# Flag the rows without an isoelectric point once and derive both counts from the mask
pi_missing_mask = df['isoelectric_point'].isna()
available_count = (~pi_missing_mask).sum()
missing_count = pi_missing_mask.sum()
print(f"Available data points: {available_count} rows")
print(f"Missing values to be filled: {missing_count} rows")

# Without gaps there is nothing to do; otherwise every gap is replaced by the
# mean of the observed values (Series.mean() skips NaN by default).
if missing_count == 0:
    print("No missing values – nothing to fill.")
else:
    mean_value = df['isoelectric_point'].mean()
    df['isoelectric_point'] = df['isoelectric_point'].fillna(mean_value)
    print(f"Missing values were filled using the mean: {mean_value:.2f}")


Available data points: 1151 rows
Missing values to be filled: 59 rows
Missing values were filled using the mean: 6.01


### Encoding Categorical Variables

In [10]:
# One-hot encode 'protein_format'; the first category is dropped and therefore
# serves as the implicit baseline level.
df = pd.get_dummies(df, columns=['protein_format'], drop_first=True)

# Collect the newly created indicator columns via their shared prefix
is_dummy = df.columns.str.startswith('protein_format_')
dummy_cols = list(df.columns[is_dummy])

# get_dummies returns bool or uint8 indicators depending on the pandas version,
# so cast exactly these columns to plain integers (0/1).
df[dummy_cols] = df[dummy_cols].astype(int)

# The category labels contain capitals (e.g. 'IGG3'), so lower-case the column names again
df.columns = df.columns.str.lower()


In [11]:
# Verify the result of the encoding: column names, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1210 entries, 0 to 1209
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   isoelectric_point         1210 non-null   float64
 1   molecular_weight_da       1210 non-null   float64
 2   product_conc_mg_ml        1210 non-null   float64
 3   tm_c                      1210 non-null   float64
 4   ph                        1210 non-null   float64
 5   kcl_conc                  1210 non-null   float64
 6   fructose_conc             1210 non-null   float64
 7   succinate_conc            1210 non-null   float64
 8   l-lysine_conc             1210 non-null   float64
 9   mannitol_conc             1210 non-null   float64
 10  ps50_conc                 1210 non-null   float64
 11  ps80_conc                 1210 non-null   float64
 12  citrate_conc              1210 non-null   float64
 13  protein_format_igg3       1210 non-null   int64  
 14  protein_

### Export Preprocessed Data

In [12]:
# Write the preprocessed DataFrame to CSV without the index column;
# an existing file at this location is overwritten.
output_path = '../data/processed/preprocessed_data.csv'
df.to_csv(output_path, index=False)