In [None]:
import pandas as pd
import re

In [None]:

df = pd.read_csv('Coded_FS_Data.csv') #Import the CSV file
df.head() #Display the first 5 rows of the DataFramed

Unnamed: 0,product,isoelectic_point,protein_format,molecular_weight_da,formulation_title,composition,product_conc_mg_ml,tm_c
0,MAB5410990,6.162,IGG3,391603.56,F01,"15 mM Succinate + 67.5 mM L-Lysine + 67.5 mM KCl + 0.4 mg/mL PS50, pH 4.5",172.8,139.411938
1,MAB5410990,6.162,IGG3,391603.56,F02,"15 mM Succinate + 135 mM L-Lysine + 0.4 mg/mL PS50, pH 4.5",172.8,138.958645
2,MAB5410990,6.162,IGG3,391603.56,F03,"15 mM Succinate + 67.5 mM L-Lysine + 135 mM Mannitol + 0.4 mg/mL PS50, pH 4.5",172.8,143.647805
3,MAB5410990,6.162,IGG3,391603.56,F04,"15 mM Succinate + 135 mM KCl + 0.4 mg/mL PS50, pH 4.5",172.8,140.801897
4,MAB5410990,6.162,IGG3,391603.56,F05,"15 mM Succinate + 270 mM Mannitol + 0.4 mg/mL PS50, pH 4.5",172.8,151.632333


In [167]:
# Find all missing values in the DataFrame
missing_values = df.isnull().sum()
# Display the missing values
missing_values

product                 0
isoelectic_point       59
protein_format          0
molecular_weight_da     0
formulation_title       0
composition             0
product_conc_mg_ml      0
tm_c                    0
dtype: int64

In [168]:
# Handling the missing numeric values in the DataFrame by linear regression
from sklearn.linear_model import LinearRegression       
from sklearn.model_selection import train_test_split
# Select numeric columns for regression
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns        
# Drop rows with missing values in numeric columns
df_numeric = df[numeric_columns].dropna()   
# Split the data into features and target variable
X = df_numeric.dropna(axis=1)  # Features   
y = df_numeric[numeric_columns[0]]  # Target variable (first numeric column)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and fit the linear regression model
model = LinearRegression()  
model.fit(X_train, y_train)
# Predict missing values in the test set
y_pred = model.predict(X_test)
# Fill missing values in the original DataFrame with predictions
df[numeric_columns[0]].fillna(pd.Series(y_pred), inplace=True)
# Save the DataFrame with filled missing values to a new CSV file
df.to_csv('Coded_FS_Data_Filled.csv', index=False)
# Display the DataFrame after filling missing values
df.head()   

ModuleNotFoundError: No module named 'sklearn'

In [None]:
#Create new column ph
df['ph'] = df['composition'].str.split(',').str[-1].str.extract('(\d+)').astype(float)

In [None]:
#Create new column Excipients
df['composition_without_ph'] = df['composition'].str.split(',').str[0]
df.drop('composition', axis=1, inplace=True)

df.head() #Display the first 5 rows of the DataFrame with new columns

Unnamed: 0,product,isoelectic_point,protein_format,molecular_weight_da,formulation_title,product_conc_mg_ml,tm_c,ph,composition_without_ph
0,MAB5410990,6.162,IGG3,391603.56,F01,172.8,139.411938,4.0,15 mM Succinate + 67.5 mM L-Lysine + 67.5 mM KCl + 0.4 mg/mL PS50
1,MAB5410990,6.162,IGG3,391603.56,F02,172.8,138.958645,4.0,15 mM Succinate + 135 mM L-Lysine + 0.4 mg/mL PS50
2,MAB5410990,6.162,IGG3,391603.56,F03,172.8,143.647805,4.0,15 mM Succinate + 67.5 mM L-Lysine + 135 mM Mannitol + 0.4 mg/mL PS50
3,MAB5410990,6.162,IGG3,391603.56,F04,172.8,140.801897,4.0,15 mM Succinate + 135 mM KCl + 0.4 mg/mL PS50
4,MAB5410990,6.162,IGG3,391603.56,F05,172.8,151.632333,4.0,15 mM Succinate + 270 mM Mannitol + 0.4 mg/mL PS50


In [None]:
# Create a list to hold unique excipients 
excipients = []

# For every value in the column 'composition_without_ph', split the string by '+' and strip whitespace and put them into a list
for composition_without_ph in df['composition_without_ph']:
    list_conc_excipient = [c.strip() for c in composition_without_ph.split('+')] 
    # For every value in the list, split the string by ' ' and take the last part as the excipient name
    for conc_excipient in list_conc_excipient:
        parts = conc_excipient.split()
        if parts:  # Ignore empty strings
            excipients.append(parts[-1])

unique_excipients = list(set(excipients))  # Get unique excipients
print(unique_excipients)

['L-Lysine', 'PS50', 'Succinate', 'PS80', 'Mannitol', 'Fructose', 'KCl', 'Citrate']


In [None]:

# Function to extract the concentration of a specific excipient from a string
def extract_value(excipient_str,excipient):
    # Suche nach dem KCl-Teil in der Zeichenkette
    list_conc_excipient = [c.strip() for c in excipient_str.split('+')]
    for conc_excipient in list_conc_excipient:
        if excipient in conc_excipient:
            # Extrahiere den numerischen Wert
            match = re.search(r'(\d+(\.\d+)?)', conc_excipient)
            if match:
                return float(match.group(1))  
    return 0  # Wenn kein Wert gefunden wurde, None zurückgeben

In [None]:

# Create new columns for each unique excipient and extract their concentrations
for excipient in unique_excipients:
    new_column_name = excipient + '_conc' 
    df[new_column_name] = df['composition_without_ph'].apply(lambda x: extract_value(x, excipient))
# Drop the 'composition_without_ph' column as it is no longer needed
df.drop('composition_without_ph', axis=1, inplace=True)


pd.set_option('display.max_colwidth', None)
df.head()  # Display the first 5 rows of the DataFrame with new columns for excipients



Unnamed: 0,product,isoelectic_point,protein_format,molecular_weight_da,formulation_title,product_conc_mg_ml,tm_c,ph,L-Lysine_conc,PS50_conc,Succinate_conc,PS80_conc,Mannitol_conc,Fructose_conc,KCl_conc,Citrate_conc
0,MAB5410990,6.162,IGG3,391603.56,F01,172.8,139.411938,4.0,67.5,0.4,15.0,0.0,0.0,0.0,67.5,0.0
1,MAB5410990,6.162,IGG3,391603.56,F02,172.8,138.958645,4.0,135.0,0.4,15.0,0.0,0.0,0.0,0.0,0.0
2,MAB5410990,6.162,IGG3,391603.56,F03,172.8,143.647805,4.0,67.5,0.4,15.0,0.0,135.0,0.0,0.0,0.0
3,MAB5410990,6.162,IGG3,391603.56,F04,172.8,140.801897,4.0,0.0,0.4,15.0,0.0,0.0,0.0,135.0,0.0
4,MAB5410990,6.162,IGG3,391603.56,F05,172.8,151.632333,4.0,0.0,0.4,15.0,0.0,270.0,0.0,0.0,0.0


In [None]:
#export the DataFrame to a new CSV file and overwrite the existing one 
df.to_csv('Coded_FS_Data_processed.csv', index=False)