In [None]:
# Step 1: Upload the dataset from the local device
from google.colab import files

# Prompt for a file from the local device through google.colab's upload helper
# and keep the helper's return value in `uploaded`.
# NOTE(review): the recorded run output shows Colab saving 'dataR2.csv' as
# 'dataR2 (1).csv', so the name on disk can differ from the name that was picked.
uploaded = files.upload()

# Step 2: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

# Step 3: Load the uploaded file into a pandas dataframe
# The file is expected to be the Breast Cancer Coimbra dataset ('dataR2.csv').
# Parse the bytes returned by files.upload() instead of re-opening a hard-coded
# path: when a file with the same name already exists, Colab stores the new
# upload under a different name (e.g. 'dataR2 (1).csv'), so reading
# 'dataR2.csv' from disk would silently load the older copy.
import io

if not uploaded:
    raise FileNotFoundError(
        "No file was uploaded; re-run the upload step and select 'dataR2.csv'."
    )

# Take the first uploaded entry (file name -> raw file content).
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))

# Step 4: Sanity-check the load by printing the top five rows
preview = df.head(5)
print("First 5 rows of the dataset:", preview, sep="\n")

# Step 5: Report how many entries are missing in each column
missing_per_column = df.isna().sum()
print("Missing values in each column:", missing_per_column, sep="\n")

# Fill any gaps with the mean of the corresponding column (simple imputation)
column_means = df.mean()
df = df.fillna(value=column_means)

# Step 6: Recode the target column: 1 (Healthy) -> 0, 2 (Patients) -> 1
label_recoding = {1: 0, 2: 1}
recoded_labels = df['Classification'].replace(label_recoding)
df['Classification'] = recoded_labels

# Step 7: The label column becomes y; every remaining column is a feature in X
y = df['Classification']
X = df.drop(columns='Classification')

# Step 8: Score every feature against the label with the Chi-Square test
# (run on the raw, not yet standardized, values)
chi_scores, p_values = chi2(X, y)

# Collect score and p-value per feature in one table, then print it with the
# highest-scoring features on top
feature_importance = pd.DataFrame(
    {
        'Feature': X.columns,
        'Chi2 Score': chi_scores,
        'p-value': p_values,
    }
)
ranked_features = feature_importance.sort_values(by='Chi2 Score', ascending=False)
print("Feature importance based on Chi-Square test:", ranked_features, sep="\n")

# Drop 'MCP.1', the feature chosen for removal after reading the table above
# NOTE(review): the column name is hard-coded; confirm it really is the
# lowest-ranked feature in the printed Chi-Square results.
features_to_drop = ['MCP.1']
X = X.drop(columns=features_to_drop)

# Step 9: Split the dataset into 80% training and 20% testing
# The split is done BEFORE scaling so the test rows never influence the
# scaler's mean/std; fitting the scaler on the full dataset would leak
# test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 10: Standardize the features using statistics of the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later code that still refers to X_scaled keeps working; it is
# now produced with the training-set mean/std as well.
X_scaled = scaler.transform(X)

# Step 11: Fit a baseline Decision Tree (library defaults, fixed seed)
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Step 12: Score the baseline on the held-out test split
y_pred = dt_classifier.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Report the confusion matrix first, then the scalar metrics
print(f"Confusion Matrix:\n{cm}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

# Step 13: Repeat the experiment with the "entropy" and "log_loss" split criteria

# criterion 'entropy': fit, predict, then score on the test split
dt_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt_entropy.fit(X_train, y_train)
y_pred_entropy = dt_entropy.predict(X_test)
accuracy_entropy = accuracy_score(y_true=y_test, y_pred=y_pred_entropy)
precision_entropy = precision_score(y_true=y_test, y_pred=y_pred_entropy)
recall_entropy = recall_score(y_true=y_test, y_pred=y_pred_entropy)

# criterion 'log_loss': same procedure
dt_logloss = DecisionTreeClassifier(criterion='log_loss', random_state=42)
dt_logloss.fit(X_train, y_train)
y_pred_logloss = dt_logloss.predict(X_test)
accuracy_logloss = accuracy_score(y_true=y_test, y_pred=y_pred_logloss)
precision_logloss = precision_score(y_true=y_test, y_pred=y_pred_logloss)
recall_logloss = recall_score(y_true=y_test, y_pred=y_pred_logloss)

# One summary line per criterion
print(f"Entropy - Accuracy: {accuracy_entropy}, Precision: {precision_entropy}, Recall: {recall_entropy}")
print(f"Log Loss - Accuracy: {accuracy_logloss}, Precision: {precision_logloss}, Recall: {recall_logloss}")

# Step 14: Tune hyper-parameters with a 5-fold cross-validated grid search
param_grid = dict(
    max_depth=[10, 100],
    min_samples_split=[4, 6, 8],
    max_features=['sqrt', 'log2', None],  # 'auto' is left out (deprecated)
)

base_tree = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(estimator=base_tree, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print(f"Best parameters: {grid_search.best_params_}")

# Step 15: Score the estimator selected by the grid search on the test split
best_dt = grid_search.best_estimator_
y_pred_best = best_dt.predict(X_test)

# Apply the three metric functions to the same predictions, in print order
best_accuracy, best_precision, best_recall = (
    metric(y_test, y_pred_best)
    for metric in (accuracy_score, precision_score, recall_score)
)

print(f"Best Model - Accuracy: {best_accuracy}, Precision: {best_precision}, Recall: {best_recall}")


Saving dataR2.csv to dataR2 (1).csv
First 5 rows of the dataset:
   Age        BMI  Glucose  Insulin      HOMA   Leptin  Adiponectin  Resistin  \
0   48  23.500000       70    2.707  0.467409   8.8071     9.702400   7.99585   
1   83  20.690495       92    3.115  0.706897   8.8438     5.429285   4.06405   
2   82  23.124670       91    4.498  1.009651  17.9393    22.432040   9.27715   
3   68  21.367521       77    3.226  0.612725   9.8827     7.169560  12.76600   
4   86  21.111111       92    3.549  0.805386   6.6994     4.819240  10.57635   

     MCP.1  Classification  
0  417.114               1  
1  468.786               1  
2  554.697               1  
3  928.220               1  
4  773.920               1  
Missing values in each column:
Age               0
BMI               0
Glucose           0
Insulin           0
HOMA              0
Leptin            0
Adiponectin       0
Resistin          0
MCP.1             0
Classification    0
dtype: int64
Feature importance based on Ch