## Build Features

### Setup

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LinearRegression       
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

### Import Data

In [2]:
# im port the CSV file data\raw_data.csv
df = pd.read_csv('../data/raw/raw_data.csv')
df.head() #Display the first 5 rows of the DataFramed

Unnamed: 0,product,isoelectric_point,protein_format,molecular_weight_da,formulation_title,composition,product_conc_mg_ml,tm_c
0,MAB5410990,6.162,IGG3,391603.56,F01,15 mM Succinate + 67.5 mM L-Lysine + 67.5 mM K...,172.8,139.411938
1,MAB5410990,6.162,IGG3,391603.56,F02,15 mM Succinate + 135 mM L-Lysine + 0.4 mg/mL ...,172.8,138.958645
2,MAB5410990,6.162,IGG3,391603.56,F03,15 mM Succinate + 67.5 mM L-Lysine + 135 mM Ma...,172.8,143.647805
3,MAB5410990,6.162,IGG3,391603.56,F04,"15 mM Succinate + 135 mM KCl + 0.4 mg/mL PS50,...",172.8,140.801897
4,MAB5410990,6.162,IGG3,391603.56,F05,15 mM Succinate + 270 mM Mannitol + 0.4 mg/mL ...,172.8,151.632333


### Preprocessing String Data into Numerical Features

In [3]:
#Create new column ph
df['ph'] = df['composition'].str.split(',').str[-1].str.extract(r'(\d+)').astype(float)

Create a numeric feature for the concentration of every excipient in the 'composition'

In [4]:
#Create new column Excipients
df['composition_without_ph'] = df['composition'].str.split(',').str[0]
df.drop('composition', axis=1, inplace=True)

In [5]:
# Create a list to hold unique excipients 
excipients = []

# For every value in the column 'composition_without_ph', split the string by '+' and strip whitespace and put them into a list
for composition_without_ph in df['composition_without_ph']:
    list_conc_excipient = [c.strip() for c in composition_without_ph.split('+')] 
    # For every value in the list, split the string by ' ' and take the last part as the excipient name
    for conc_excipient in list_conc_excipient:
        parts = conc_excipient.split()
        if parts:  # Ignore empty strings
            excipients.append(parts[-1])

unique_excipients = list(set(excipients))  # Get unique excipients
print(unique_excipients)

['Mannitol', 'Fructose', 'L-Lysine', 'PS80', 'PS50', 'KCl', 'Succinate', 'Citrate']


In [6]:
# Function to extract the concentration of a specific excipient from a string
def extract_value(excipient_str,excipient):
    # Suche nach dem KCl-Teil in der Zeichenkette
    list_conc_excipient = [c.strip() for c in excipient_str.split('+')]
    for conc_excipient in list_conc_excipient:
        if excipient in conc_excipient:
            # Extrahiere den numerischen Wert
            match = re.search(r'(\d+(\.\d+)?)', conc_excipient)
            if match:
                return float(match.group(1))  
    return 0  # Wenn kein Wert gefunden wurde, None zurückgeben

In [7]:
# Create new columns for each unique excipient and extract their concentrations
for excipient in unique_excipients:
    new_column_name = excipient + '_conc' 
    # replace all uppercase letters with lowercase letters in the new column name
    new_column_name = new_column_name.lower()
    # Apply the extract_value function to the 'composition_without_ph' column for each excipient
    df[new_column_name] = df['composition_without_ph'].apply(lambda x: extract_value(x, excipient))
# Drop the 'composition_without_ph' column as it is no longer needed
df.drop('composition_without_ph', axis=1, inplace=True)


pd.set_option('display.max_colwidth', None)
df.head()  # Display the first 5 rows of the DataFrame with new columns for excipients



Unnamed: 0,product,isoelectric_point,protein_format,molecular_weight_da,formulation_title,product_conc_mg_ml,tm_c,ph,mannitol_conc,fructose_conc,l-lysine_conc,ps80_conc,ps50_conc,kcl_conc,succinate_conc,citrate_conc
0,MAB5410990,6.162,IGG3,391603.56,F01,172.8,139.411938,4.0,0.0,0.0,67.5,0.0,0.4,67.5,15.0,0.0
1,MAB5410990,6.162,IGG3,391603.56,F02,172.8,138.958645,4.0,0.0,0.0,135.0,0.0,0.4,0.0,15.0,0.0
2,MAB5410990,6.162,IGG3,391603.56,F03,172.8,143.647805,4.0,135.0,0.0,67.5,0.0,0.4,0.0,15.0,0.0
3,MAB5410990,6.162,IGG3,391603.56,F04,172.8,140.801897,4.0,0.0,0.0,0.0,0.0,0.4,135.0,15.0,0.0
4,MAB5410990,6.162,IGG3,391603.56,F05,172.8,151.632333,4.0,270.0,0.0,0.0,0.0,0.4,0.0,15.0,0.0


### Export Preprocessed Data

In [8]:
#export the DataFrame to a new CSV file and overwrite the existing one 
df.to_csv('../data/processed/raw_data_preprocessed.csv', index=False)