# Final Pipeline Script for Trainval and Test Data

In [1]:
# Load libraries
from os import getcwd
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFpr, f_classif, f_regression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler    # scaling the data

In [2]:
# Import trainval and test data
# Paths are assembled with pathlib so the separators are valid on any OS
# (the previous hard-coded backslashes only resolved on Windows).
DATA_DIR = Path(getcwd()) / 'MLN Assessment Data-20240207'   # folder holding both CSV files

trainval_path = DATA_DIR / 'training_validation.csv'   # training and validation data
trainval = pd.read_csv(trainval_path)

test_path = DATA_DIR / 'test.csv'                      # test data
test = pd.read_csv(test_path)

In [3]:
# Check trainval shape is as expected.
# As the last expression of the cell, this displays the (rows, columns) tuple.
trainval.shape

(1339, 296)

In [4]:
# Split trainval into 2 training sets (one for stage one and one for stage two),
# stratifying on the combination of binned age and diagnosis.

# Bin the ages into equal-width intervals
num_bins = 6
age_bins = pd.cut(trainval['Age'], bins=num_bins).tolist()

# One stratification label per row: "<age interval>_<diagnosis>"
combined_age_diagnosis = [
    f"{age_bin}_{diagnosis}"
    for age_bin, diagnosis in zip(age_bins, trainval['Diagnosis'])
]

# A single shuffled split: 85% of rows for stage one, 15% for stage two
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=5)
stageone_ind, stagetwo_ind = next(sss.split(trainval, combined_age_diagnosis))

stage1_trainval = trainval.iloc[stageone_ind, :]
stage2_trainval = trainval.iloc[stagetwo_ind, :]

# Stage one: columns from position 4 onward (MRI features) -> Diagnosis
X1_trainval = stage1_trainval.iloc[:, 4:]
y1_trainval = stage1_trainval['Diagnosis']

# Stage two: columns from position 4 onward (MRI features) -> Age
# NOTE(review): both of these are taken from stage1_trainval; stage2_trainval
# is not referenced in this cell - confirm that is intended.
X2_trainval = stage1_trainval.iloc[:, 4:]
y2_trainval = stage1_trainval['Age']

In [5]:
# Create function that applies stage one and creates a new dataset to be used for stage two
def prepare_stage2_data(X_data, pipe):

    """
    Append stage one class probabilities to a feature table, ready for stage two.

    Calls ``pipe.predict_proba(X_data)`` and concatenates the predicted
    probabilities column-wise to the features, one column per entry of
    ``pipe.classes_``.

    Parameters:
    - X_data: Feature data accepted by ``pipe.predict_proba``. It is not
      modified by this function (its index is left untouched).
    - pipe: Trained stage one pipeline exposing ``predict_proba`` and ``classes_``.

    Returns:
    - stage2_data: DataFrame with a fresh 0..n-1 index containing the columns
      of X_data followed by the predicted-probability columns.
    """

    y1_pred = pipe.predict_proba(X_data)

    # Probabilities get a default 0..n-1 index, so no reset is needed here
    pred_probs_df = pd.DataFrame(y1_pred, columns=pipe.classes_)

    # reset_index returns a new frame; an in-place reset on a DataFrame built
    # from X_data can leak back to the caller's index on some pandas versions.
    features_df = pd.DataFrame(X_data).reset_index(drop=True)

    # Both frames now share the same positional index, so rows line up 1:1
    stage2_data = pd.concat([features_df, pred_probs_df], axis=1)

    return stage2_data

In [6]:
# Create stage one pipeline: scaling -> univariate feature selection -> random forest classifier
stage1_pipeline = Pipeline([('scaler', StandardScaler()),
                            ('selector', SelectFpr(alpha=0.025)),
                            ('classifier', RandomForestClassifier(max_depth=14, min_samples_split=4,
                                        n_estimators=150, random_state=3))])
stage1_pipeline.fit(X1_trainval, y1_trainval)

# Create X2 data ready for stage two.
# The stage two training rows come from the held-out stage2_trainval split, so the
# diagnosis probabilities fed to the regressor are out-of-sample. Predicting on the
# rows the classifier was fitted on would give optimistic probabilities that do not
# match what the regressor sees at test time.
X2_trainval = prepare_stage2_data(stage2_trainval.iloc[:, 4:], stage1_pipeline)
y2_trainval = stage2_trainval['Age'].reset_index(drop=True)   # same rows and order as X2_trainval

# NOTE(review): assumes `test` contains exactly the feature columns the pipeline
# was fitted on (no ID/Age/Diagnosis columns) - confirm against the test file.
X2_test = prepare_stage2_data(test, stage1_pipeline)


In [7]:
# Define and fit the stage two pipeline (age regression), then predict test ages.
# The selector scores features with f_regression because the target (Age) is
# continuous; SelectFpr's default score function, f_classif, is meant for
# classification targets only.
stage2_pipeline = Pipeline([('scaler', StandardScaler()),
                            ('selector', SelectFpr(score_func=f_regression, alpha=0.05)),
                            ('regressor', RandomForestRegressor(max_depth=10, min_samples_split=10,
                                                                n_estimators=150, random_state=0))])
stage2_pipeline.fit(X2_trainval, y2_trainval)

# Predicted age for every row of the stage two test data
y2_pred = stage2_pipeline.predict(X2_test)

In [8]:
# Wrap the predicted ages in a pandas Series so they can be written out
y2_pred_series = pd.Series(data=y2_pred)

# Write a single-column CSV headed "Age", without the row index
y2_pred_series.to_csv(
    'predictions.csv',
    header=['Age'],
    index=False,
)