## Imports

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split # Importing train_test_split function
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import warnings # To ignore warnings
warnings.filterwarnings('ignore')

## Loading Dataset 2


In [61]:
ds2 = pd.read_csv('dataset2.csv')
ds2.head()

Unnamed: 0,BlindedIDs,PATIENT_ID,Stratagene,DC_STUDY_ID,MICROARRAY,SITE,TESTTYPE,IN_DC_STUDY,GENDER,AGE_AT_DIAGNOSIS,...,SMOKING_HISTORY,SURGICAL_MARGINS,PATHOLOGIC_N_STAGE,PATHOLOGIC_T_STAGE,MEDIAN_INTENSITY_UNNORMALIZED,PCT_ARRAY_OUTLIER,PCT_SINGLE_OUTLIER,WARNING,LABORATORY_BATCH,Histologic grade
0,None4TestSets,151.0,0,B-NCI_U133A_1L.CHP,NCI_U133A_1L,MSKCC,Test 1,1,Male,64.0,...,Currently smoking,ALL MARGINS PATHOLOGICALLY NEGATIVE,"N1, ACCORDING TO AJCC CRITERIA","T3, ACCORDING TO AJCC CRITERIA",191.0,2.536,0.143,,,POORLY DIFFERENTIATED
1,None4TestSets,152.0,0,B-NCI_U133A_2L.CHP,NCI_U133A_2L,MSKCC,Test 1,1,Male,62.0,...,Smoked in the past,ALL MARGINS PATHOLOGICALLY NEGATIVE,"N0, ACCORDING TO AJCC CRITERIA","T2, ACCORDING TO AJCC CRITERIA",193.0,4.573,0.288,,,Moderate Differentiation
2,None4TestSets,153.0,0,B-NCI_U133A_3L.CHP,NCI_U133A_3L,MSKCC,Test 1,1,Female,70.0,...,Smoked in the past,ALL MARGINS PATHOLOGICALLY NEGATIVE,"N0, ACCORDING TO AJCC CRITERIA","T1, ACCORDING TO AJCC CRITERIA",192.0,1.512,0.131,,,Moderate Differentiation
3,None4TestSets,154.0,0,B-NCI_U133A_4L.CHP,NCI_U133A_4L,MSKCC,Test 1,1,Male,67.0,...,Smoked in the past,ALL MARGINS PATHOLOGICALLY NEGATIVE,"N2, ACCORDING TO AJCC CRITERIA","T3, ACCORDING TO AJCC CRITERIA",192.0,1.252,0.074,,,POORLY DIFFERENTIATED
4,None4TestSets,155.0,0,B-NCI_U133A_5L.CHP,NCI_U133A_5L,MSKCC,Test 1,1,Female,56.0,...,Currently smoking,ALL MARGINS PATHOLOGICALLY NEGATIVE,"N1, ACCORDING TO AJCC CRITERIA","T2, ACCORDING TO AJCC CRITERIA",193.0,1.894,0.142,,,POORLY DIFFERENTIATED


## ElasticNet Regression  Modeling

In [62]:
# Removing rows where the target variable is missing
data_clean = ds2.dropna(subset=['VITAL_STATUS'])

# Preparing the data again
X_clean = data_clean.drop('VITAL_STATUS', axis=1)
y_clean = data_clean['VITAL_STATUS']

# Identifying categorical and numerical columns
categorical_cols_clean = X_clean.select_dtypes(include=['object', 'category']).columns
numerical_cols_clean = X_clean.select_dtypes(include=['int64', 'float64']).columns

# Creating transformers for numerical and categorical data
numerical_transformer = Pipeline(steps=[ 
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Updating the ColumnTransformer with the clean data
preprocessor_clean = ColumnTransformer(
    transformers=[ 
        ('num', numerical_transformer, numerical_cols_clean),
        ('cat', categorical_transformer, categorical_cols_clean)
    ])

# Creates a Pipeline that first applies the preprocessing steps and then fits an ElasticNet model.
elastic_net_model_clean = Pipeline(steps=[
    ('preprocessor', preprocessor_clean),
    ('model', ElasticNet(random_state=0))
])
from sklearn.preprocessing import LabelEncoder

# Encoding the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_clean)

# Splitting the data into training and testing sets with encoded target
X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(
    X_clean, y_encoded, test_size=0.2, random_state=0
)

# Training the ElasticNet model on the encoded data
elastic_net_model_clean.fit(X_train_encoded, y_train_encoded)

# Predicting and evaluating the model on the encoded test data
y_pred_encoded = elastic_net_model_clean.predict(X_test_encoded)
rmse_encoded = np.sqrt(mean_squared_error(y_test_encoded, y_pred_encoded))
r2_encoded = r2_score(y_test_encoded, y_pred_encoded)

# Root Mean Squared Error and R2 Score
# metrics indicating the model's prediction error and the proportion of variance in the target variable that is predictable from the features
rmse_encoded, r2_encoded

(0.5080512739514746, -0.03968441853916471)

### Skewed Confusion Matrix
Top-Left Cell (0): True Negatives (TN) - The number of negative instances (class 0) correctly classified as negative. In your case, there are 0 true negatives, meaning that no negative class instances were correctly identified.

Top-Right Cell (52): False Positives (FP) - The number of negative instances incorrectly classified as positive. Here, there are 52 instances that were actually negative but were incorrectly predicted as positive.

Bottom-Left Cell (0): False Negatives (FN) - The number of positive instances (class 1) incorrectly classified as negative. In your model, there are 0 false negatives, indicating that all positive instances were correctly identified as positive.

Bottom-Right Cell (44): True Positives (TP) - The number of positive instances correctly classified as positive. Your model correctly identified 44 instances as positive.


### Confusion Matrix

In [63]:
# Using 0.5 as threshold for classification
y_pred_class = (y_pred_encoded > 0.5).astype(int)
conf_matrix = confusion_matrix(y_test_encoded, y_pred_class)
conf_matrix

array([[ 0, 52],
       [ 0, 44]], dtype=int64)

### Classification Report

In [64]:
class_report = classification_report(y_test_encoded, y_pred_class, output_dict= True)
df_report = pd.DataFrame(class_report).transpose()
df_report

Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,52.0
1,0.458333,1.0,0.628571,44.0
accuracy,0.458333,0.458333,0.458333,0.458333
macro avg,0.229167,0.5,0.314286,96.0
weighted avg,0.210069,0.458333,0.288095,96.0


### ROC Curve and AUC
This value is a measure of the performance of your classification model. 


In [55]:
fpr, tpr, thresholds = roc_curve(y_test_encoded, y_pred_encoded)
roc_auc = roc_auc_score(y_test_encoded, y_pred_encoded)
roc_auc

0.5

### Cross-validation

In [65]:
cross_val_accuracy = cross_val_score(elastic_net_model_clean, X_clean, y_encoded, cv=5, scoring='accuracy')
cross_val_accuracy

array([nan, nan, nan, nan, nan])