In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

import os

# Load the two datasets
# Radiomics feature table, addressed relative to this notebook.
FEATURES_CSV = '../Results/radiomics_features.csv'
# Clinical data file (look in your Data/NSCLC-Radiomics folder).
# The default below is the original machine-specific absolute path, which only
# exists on that one computer. Set the NSCLC_CLINICAL_CSV environment variable
# to point at the file on any other machine instead of editing this cell.
CLINICAL_CSV = os.environ.get(
    'NSCLC_CLINICAL_CSV',
    'D:/Thesis_Project/Data/NSCLC-Radiomics-Lung1.clinical-version3-Oct-2019.csv',
)

features_df = pd.read_csv(FEATURES_CSV)
outcomes_df = pd.read_csv(CLINICAL_CSV)

print("Files loaded successfully.")

Files loaded successfully.


In [3]:
# --- Clean Features ---
# Keep every column whose name does not start with 'diagnostics_'
# (the PyRadiomics diagnostic columns are not wanted here).
feature_columns = [
    col
    for col in features_df.columns
    if not col.startswith('diagnostics_')
]
features_df = features_df[feature_columns]

# --- Clean Outcomes ---
# Rename 'case_id' to 'PatientID' (a no-op if that column is absent) so the
# key matches the feature table, then keep only the ID, the survival time and
# the dead/alive status.
outcome_fields = ['PatientID', 'Survival.time', 'deadstatus.event']
outcomes_df = outcomes_df.rename(columns={'case_id': 'PatientID'})[outcome_fields]

# --- Merge DataFrames ---
# Default (inner) join: only patients present in both tables are kept.
master_df = features_df.merge(outcomes_df, on='PatientID')

print("Successfully merged features and outcomes.")
print(f"Master dataset has {master_df.shape[0]} patients.")



Successfully merged features and outcomes.
Master dataset has 421 patients.


In [4]:
# Build the binary target 'Survival_2yr' from the follow-up columns:
#   1 -> Survival.time of at least 730 days (reached the 2-year mark)
#   0 -> death event recorded with Survival.time below 730 days
# Rows with fewer than 730 days and no recorded death event
# (deadstatus.event != 1) match neither rule; their 2-year outcome is unknown,
# so they are left out of the modeling table.
TWO_YEAR_CUTOFF_DAYS = 730

follow_up_days = master_df['Survival.time']
death_recorded = master_df['deadstatus.event'] == 1

# Non-survivor: died before the cutoff
died_early = death_recorded & (follow_up_days < TWO_YEAR_CUTOFF_DAYS)
# Survivor: followed up to or beyond the cutoff
survived_long = follow_up_days >= TWO_YEAR_CUTOFF_DAYS

# Every row starts as 0; rows that reached the cutoff are flipped to 1.
# Note the excluded (unknown-outcome) rows keep the default 0 in master_df.
master_df['Survival_2yr'] = 0
master_df.loc[survived_long, 'Survival_2yr'] = 1

# Modeling table: rows with a known outcome only, without the raw survival
# columns that the target was derived from.
modeling_df = (
    master_df[died_early | survived_long]
    .drop(columns=['Survival.time', 'deadstatus.event'])
    .copy()
)

print(f"Created binary target 'Survival_2yr'. Kept {modeling_df.shape[0]} patients for the analysis.")
print("\nOutcome distribution:")
print(modeling_df['Survival_2yr'].value_counts())


Created binary target 'Survival_2yr'. Kept 420 patients for the analysis.

Outcome distribution:
Survival_2yr
0    251
1    169
Name: count, dtype: int64


In [5]:
# Target vector (y) and feature matrix (X): every column except the patient
# ID and the target itself is passed on as a feature.
y = modeling_df['Survival_2yr']
X = modeling_df.drop(columns=['PatientID', 'Survival_2yr'])

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Data successfully split into training and testing sets.")
print(f"Training set: {X_train.shape[0]} patients")
print(f"Testing set: {X_test.shape[0]} patients")


Data successfully split into training and testing sets.
Training set: 336 patients
Testing set: 84 patients


In [6]:
import joblib

# Bundle the four pieces of the train/test split under fixed keys and write
# them to disk as a single joblib file.
split_names = ('X_train', 'X_test', 'y_train', 'y_test')
split_parts = (X_train, X_test, y_train, y_test)
data_for_modeling = dict(zip(split_names, split_parts))

joblib.dump(data_for_modeling, '../Results/split_data.joblib')
print("Split data saved to file: ../Results/split_data.joblib")


Split data saved to file: ../Results/split_data.joblib
