In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
import os

In [2]:
# Curve fitted to each feature: a scaled power of x plus a constant offset
def model(x, w1, w2, beta):
    """Evaluate ``w1 * x**w2 + beta``.

    Args:
        x: Input value(s); anything that supports ``**`` with ``w2``.
        w1: Multiplicative scale applied to the power term.
        w2: Exponent applied to ``x``.
        beta: Additive offset.

    Returns:
        The model value, of whatever type the arithmetic on ``x`` yields.
    """
    scaled_power = w1 * x**w2
    return scaled_power + beta

def _fit_model_params(x_fit, y_fit):
    """Fit ``model`` to one set of points; return the parameters, or None if no fit is possible."""
    # With fewer points than the three model parameters the default
    # curve_fit solver raises TypeError rather than RuntimeError/ValueError,
    # so skip the fit up front instead of letting it abort the caller.
    if len(y_fit) < 3:
        return None
    try:
        popt, _ = curve_fit(model, x_fit, y_fit, p0=[1, 0.1, 0.1], maxfev=10000)
    except (RuntimeError, ValueError, TypeError):
        return None
    return popt


def transform(dataset, feature, data_dir='data'):
    """Map one feature column through curves fitted to the lower and upper targets.

    Reads ``features_sorted.csv`` and ``target.csv`` from
    ``<data_dir>/<dataset>/``. ``model`` is fitted once against
    ``min.log.lambda`` (rows equal to ``-inf`` excluded) and once against
    ``max.log.lambda`` (rows equal to ``+inf`` excluded); the full feature
    column is then evaluated under each fitted curve.

    Rows of the two files are matched by index label
    (assumes both files list the same sequences in the same order — TODO confirm).

    Args:
        dataset: Name of the dataset sub-directory.
        feature: Column name in ``features_sorted.csv`` to transform.
        data_dir: Root folder that contains the dataset directories.

    Returns:
        A ``(lower, upper)`` pair of Series. If the feature has zero standard
        deviation, or either fit cannot be performed (too few usable points,
        or ``curve_fit`` fails), the untransformed feature column is returned
        for both elements.
    """
    dataset_dir = os.path.join(data_dir, dataset)

    x_data = pd.read_csv(os.path.join(dataset_dir, 'features_sorted.csv'))[feature]
    if x_data.std() == 0:
        # Constant column: leave it untransformed.
        return x_data, x_data

    # Parse the target file once and take both bounds from it.
    targets = pd.read_csv(os.path.join(dataset_dir, 'target.csv'))
    y1_data = targets['min.log.lambda']
    y2_data = targets['max.log.lambda']

    # Lower bound: drop rows whose target is -inf before fitting.
    mask_lower = y1_data != -np.inf
    popt_lower = _fit_model_params(x_data[mask_lower], y1_data[mask_lower])
    if popt_lower is None:
        return x_data, x_data

    # Upper bound: drop rows whose target is +inf before fitting.
    mask_upper = y2_data != np.inf
    popt_upper = _fit_model_params(x_data[mask_upper], y2_data[mask_upper])
    if popt_upper is None:
        return x_data, x_data

    return model(x_data, *popt_lower), model(x_data, *popt_upper)

In [3]:
# Each sub-directory of the data folder is treated as one dataset;
# plain files sitting next to them are ignored.
folder_path = 'data'
datasets = [
    entry
    for entry in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, entry))
]

In [4]:
# For every dataset, build a table keyed by sequenceID holding the lower- and
# upper-curve transform of features X1..X20, then write it next to the inputs.
feature_names = [f'X{i}' for i in range(1, 21)]

for dataset in datasets:
    # Start from the ID column only; transformed columns are appended below.
    df = pd.read_csv(f'data/{dataset}/features_sorted.csv')[['sequenceID']]

    for feature in feature_names:
        transformed_X_lower, transformed_X_upper = transform(dataset, feature)
        df[f'{feature}.lower'] = transformed_X_lower
        df[f'{feature}.upper'] = transformed_X_upper

    df.to_csv(f'data/{dataset}/features_sorted_transformed.csv', index=False)

  popt_lower, _ = curve_fit(model, x_lower_data, y1_data, p0=[1, 0.1, 0.1], maxfev=10000)
  popt_upper, _ = curve_fit(model, x_upper_data, y2_data, p0=[1, 0.1, 0.1], maxfev=10000)
  popt_lower, _ = curve_fit(model, x_lower_data, y1_data, p0=[1, 0.1, 0.1], maxfev=10000)
  popt_upper, _ = curve_fit(model, x_upper_data, y2_data, p0=[1, 0.1, 0.1], maxfev=10000)
  popt_lower, _ = curve_fit(model, x_lower_data, y1_data, p0=[1, 0.1, 0.1], maxfev=10000)
  popt_upper, _ = curve_fit(model, x_upper_data, y2_data, p0=[1, 0.1, 0.1], maxfev=10000)
  popt_lower, _ = curve_fit(model, x_lower_data, y1_data, p0=[1, 0.1, 0.1], maxfev=10000)
  popt_upper, _ = curve_fit(model, x_upper_data, y2_data, p0=[1, 0.1, 0.1], maxfev=10000)
  popt_lower, _ = curve_fit(model, x_lower_data, y1_data, p0=[1, 0.1, 0.1], maxfev=10000)
  popt_upper, _ = curve_fit(model, x_upper_data, y2_data, p0=[1, 0.1, 0.1], maxfev=10000)
  popt_lower, _ = curve_fit(model, x_lower_data, y1_data, p0=[1, 0.1, 0.1], maxfev=10000)
  popt_upp