In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv("data.csv")

# Quick look
print(df.head())



  Patient_ID  Age Gender Cancer_Type  Tumor_Size (cm) Stage        Treatment  \
0      P0001   67      F  Pancreatic              5.2    II  Hormone Therapy   
1      P0002   86      M        Skin              2.0     I        Radiation   
2      P0003   88      F    Prostate              5.9   III     Chemotherapy   
3      P0004   22      M       Brain              1.0    II        Radiation   
4      P0005   34      M        Skin              5.8    IV        Radiation   

   Survival_Months   Outcome  
0               76     Alive  
1               92     Alive  
2               44     Alive  
3               78     Alive  
4               16  Deceased  


In [3]:
# Drop Patient_ID
df = df.drop(columns=["Patient_ID"])

# Map Stage to numbers
stage_mapping = {'I':1, 'II':2, 'III':3, 'IV':4}
df['Stage'] = df['Stage'].map(stage_mapping)

# One-hot encode other categorical features
df = pd.get_dummies(df, columns=['Gender', 'Cancer_Type', 'Treatment', 'Outcome'], drop_first=True)

print(df.head())


   Age  Tumor_Size (cm)  Stage  Survival_Months  Gender_M  Cancer_Type_Breast  \
0   67              5.2    2.0               76     False               False   
1   86              2.0    1.0               92      True               False   
2   88              5.9    3.0               44     False               False   
3   22              1.0    2.0               78      True               False   
4   34              5.8    4.0               16      True               False   

   Cancer_Type_Colon  Cancer_Type_Leukemia  Cancer_Type_Liver  \
0              False                 False              False   
1              False                 False              False   
2              False                 False              False   
3              False                 False              False   
4              False                 False              False   

   Cancer_Type_Lung  Cancer_Type_Ovarian  Cancer_Type_Pancreatic  \
0             False                False              

In [4]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Survival_Months'])
y = df['Survival_Months']

# Split into train and test (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train
rf.fit(X_train, y_train)

# Predict
y_pred = rf.predict(X_test)

# Evaluate
# R² Score (commonly called "accuracy" for regression)
accuracy = rf.score(X_test, y_test)
print("Model Accuracy (R²):", accuracy)



Model Accuracy (R²): 0.7739253769583848


In [8]:
# ----------------------------
# Test Real-time Prediction
# ----------------------------

# Example new patient data
new_patient = {
    "Age": 70,
    "Gender": "F",
    "Cancer_Type": "Pancreatic",
    "Tumor_Size (cm)": 4.5,
    "Stage": "II",
    "Treatment": "Hormone Therapy",
    "Outcome": "Alive"
}

def predict_survival(patient_data, model, training_columns):
    """
    Predict Survival_Months for a new patient.
    
    patient_data: dict with patient info
    model: trained regression model
    training_columns: columns used during model training
    """
    df_input = pd.DataFrame([patient_data])
    
    # Encode Stage ordinally
    stage_mapping = {'I':1, 'II':2, 'III':3, 'IV':4}
    df_input['Stage'] = df_input['Stage'].map(stage_mapping)
    
    # One-hot encode categorical features
    df_input = pd.get_dummies(df_input, columns=['Gender','Cancer_Type','Treatment','Outcome'], drop_first=True)
    
    # Add missing columns if any
    for col in training_columns:
        if col not in df_input.columns:
            df_input[col] = 0
    
    # Reorder columns to match training data
    df_input = df_input[training_columns]
    
    # Predict
    return model.predict(df_input)[0]

# Use the trained Random Forest model `rf` and training columns `X.columns`
predicted_months = predict_survival(new_patient, rf, X.columns)
print(f"Predicted Survival Months: {predicted_months:.1f} months")


Predicted Survival Months: 60.5 months


In [9]:
import joblib


joblib.dump(rf, "survival_model.pkl")

print("Model saved successfully as 'survival_model.pkl'")


Model saved successfully as 'survival_model.pkl'
