In [7]:
import os
import joblib
print(joblib.__version__)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

1.3.2


In [6]:
def scale(X):
    """
    Scales (standardizes) the input data.

    Args:
    - X (pd.DataFrame): Input data to be scaled.

    Returns:
    - np.ndarray: Scaled (standardized) data.
    """
    scaler = StandardScaler()
    return scaler.fit_transform(X)

In [8]:
def convert_age_to_weeks(age_str):
    if 'pcw' in age_str:
        # Extract the number and use it directly
        return float(age_str.split(' ')[0])
    elif 'mos' in age_str:
        # Convert months to weeks
        return float(age_str.split(' ')[0]) * 4.345
    elif 'yrs' in age_str or 'years' in age_str:
        # Convert years to weeks
        return float(age_str.split(' ')[0]) * 52
    else:
        raise ValueError(f"Unknown age format: {age_str}")

In [10]:
def preprocess_data(data_path, file_type='csv'):
    """
    Preprocess the data for regression and return the train-test split.
    """

    if file_type == 'csv':
        data = pd.read_csv(data_path, index_col=0)
    elif file_type == 'excel':
        data = pd.read_excel(data_path, index_col=0)
    elif file_type == 'txt':
        data = pd.read_csv(data_path, sep='\t', index_col=0)
    else:
        raise ValueError("Unsupported file type")
    print("Data loaded successfully.")


    data['age'] = data['age'].apply(convert_age_to_weeks)

    X = data.select_dtypes(include=[np.number])
    X = X.drop(columns=['age'])
    y = data['age']

    X, y = shuffle(X, y, random_state=0)
    X_scaled = scale(X)

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    print("Data preprocessed for regression and split into training and test sets.")
    return X_train, X_test, y_train, y_test

In [11]:
def save_data_splits(X_train, X_test, y_train, y_test, output_dir):
    """
    Saves the train-test data splits to the specified directory using joblib.

    Args:
    - X_train (pd.DataFrame or np.ndarray): Training data features.
    - X_test (pd.DataFrame or np.ndarray): Testing data features.
    - y_train (pd.Series or np.ndarray): Training data labels.
    - y_test (pd.Series or np.ndarray): Testing data labels.
    - output_dir (str): Directory path where the data splits will be saved.

    Note:
    - If the output directory does not exist, it will be created.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    joblib.dump(X_train, os.path.join(output_dir, 'X_train_reg.pkl'))
    joblib.dump(X_test, os.path.join(output_dir, 'X_test_reg.pkl'))
    joblib.dump(y_train, os.path.join(output_dir, 'y_train_reg.pkl'))
    joblib.dump(y_test, os.path.join(output_dir, 'y_test_reg.pkl'))

In [12]:
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os

def train_evaluate_regressor(regressor, X_train, y_train, X_test, y_test, output_dir):
    """
    Trains the regressor on the training data, evaluates it on the test data,
    and saves the trained model and performance metrics to the specified directory.
    """
    regressor.fit(X_train, y_train)

    y_pred = regressor.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(os.path.join(output_dir, 'performance_metrics.txt'), 'w') as f:
        f.write(f"Mean Squared Error: {mse}\n")
        f.write(f"R-squared: {r2}\n")

    joblib.dump(regressor, os.path.join(output_dir, 'trained_model.pkl'))

    return y_pred, regressor

In [13]:
def main(data_paths):
    """
    The main pipeline to preprocess data, train multiple regressors, and evaluate their performance.
    """
    regressors = [
        RandomForestRegressor(random_state=42),
        LinearRegression(),
        SVR(),
        XGBRegressor(random_state=42),
        MLPRegressor(random_state=42)
    ]

    for data_path in data_paths:
        data_type = os.path.basename(data_path).split('_')[0]

        X_train, X_test, y_train, y_test = preprocess_data(data_path)
        save_data_splits(X_train, X_test, y_train, y_test, os.path.join('baseline_regressor_outputs', data_type))

        for regressor in regressors:
            regressor_name = regressor.__class__.__name__
            output_dir = os.path.join('baseline_regressor_outputs', data_type, regressor_name)

            print(f"Training regressor: {regressor_name}")
            y_pred, trained_regressor = train_evaluate_regressor(regressor, X_train, y_train, X_test, y_test, output_dir)

In [14]:
data_paths = [#'methylation_1.csv',
              '/content/drive/MyDrive/microRNA/microRNA_1.csv',
              '/content/drive/MyDrive/rnaseq/rnaseq_1.csv'
              ]
main(data_paths)

Data loaded successfully.
Data preprocessed for regression and split into training and test sets.
Training regressor: RandomForestRegressor
Mean Squared Error: 30778.063320717498
R-squared: 0.8269864262167625
Training regressor: LinearRegression
Mean Squared Error: 42267.57540447136
R-squared: 0.7624001159631904
Training regressor: SVR
Mean Squared Error: 182316.57196859733
R-squared: -0.024861159960096746
Data loaded successfully.
Data preprocessed for regression and split into training and test sets.
Training regressor: RandomForestRegressor
Mean Squared Error: 44037.538671959745
R-squared: 0.8997170833368429
Training regressor: LinearRegression
Mean Squared Error: 11128.806008920712
R-squared: 0.9746573228384434
Training regressor: SVR
Mean Squared Error: 572133.6497244224
R-squared: -0.30287097884594094


In [None]:
!zip -r baseline_model_outputs.zip baseline_model_outputs