In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import os

In [4]:
# Load the patient CSV from the "data" folder under the current working
# directory. The path is resolved at run time, so the notebook must be started
# from the directory that contains "data".
curr_dir = os.getcwd()
f_path = os.path.join(curr_dir, "data", "cancer patient data sets.csv")

# Every column of the file is kept here; the modelling columns are picked
# in a later cell.
df = pd.read_csv(f_path)

# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [5]:
# Label column and the predictor columns fed to the models.
# The order of `features` fixes the column order of X (and therefore of the
# feature-importance tables printed later), so it is kept as-is.
target = 'Level'
features = [
    'Age',
    'Alcohol use',
    'Smoking',
    'chronic Lung Disease',
    'Fatigue',
    'Coughing of Blood',
    'Chest Pain',
    'Genetic Risk',
    'OccuPational Hazards',
]

X, y = df[features], df[target]

# 70 % train / 30 % test, seeded so the split is reproducible.
# NOTE(review): the split is not stratified on `y`; consider `stratify=y` if
# class proportions should match between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Baseline gradient-boosting classifier: no hyperparameters are set apart from
# the seed. `fit` returns the estimator, so construction and training are
# chained into one expression.
basic_gbdt = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Bare name as the last expression so the cell still displays the fitted model.
basic_gbdt

In [7]:
# Score the baseline model on the held-out test split.
basic_pred = basic_gbdt.predict(X_test)
basic_accuracy = accuracy_score(y_true=y_test, y_pred=basic_pred)

print(f'Basic Model Accuracy: {basic_accuracy}')

Basic Model Accuracy: 1.0


In [8]:
# 5-fold cross-validated accuracy of the baseline configuration.
# NOTE(review): this runs on the full X / y, i.e. it also includes the rows
# that were held out as the test split above — confirm that is intended.
cv_scores = cross_val_score(
    estimator=basic_gbdt,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)

# Per-fold scores first, then their mean, one per line.
print(
    f'CV Accuracy Scores: {cv_scores}',
    f'Mean CV Accuracy: {cv_scores.mean()}',
    sep='\n',
)

CV Accuracy Scores: [1. 1. 1. 1. 1.]
Mean CV Accuracy: 1.0


In [9]:
# Per-class precision / recall / F1 for the baseline predictions on the test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=basic_pred,
    )
)

              precision    recall  f1-score   support

        High       1.00      1.00      1.00       119
         Low       1.00      1.00      1.00        84
      Medium       1.00      1.00      1.00        97

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



In [10]:
# Feature importances of the baseline model, labelled with the training column
# names and ordered from most to least important.
feature_importance = (
    pd.DataFrame(
        {'importance': basic_gbdt.feature_importances_},
        index=X_train.columns,
    )
    .sort_values(by='importance', ascending=False)
)
print(feature_importance)

                      importance
Coughing of Blood       0.352649
Fatigue                 0.224081
Alcohol use             0.208981
OccuPational Hazards    0.072562
Chest Pain              0.042125
Smoking                 0.039222
Genetic Risk            0.024597
chronic Lung Disease    0.023247
Age                     0.012537


In [11]:
# Search space: 3 values for each of 5 hyperparameters -> 3**5 = 243 candidates.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold grid search on the training split only, scored by accuracy
# and run on all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator and score it on the held-out test split.
tuned_gbdt = grid_search.best_estimator_
tuned_pred = tuned_gbdt.predict(X_test)
tuned_accuracy = accuracy_score(y_true=y_test, y_pred=tuned_pred)

print(f'Tuned Model Accuracy: {tuned_accuracy}')
print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Tuned Model Accuracy: 1.0
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


In [12]:
# Side-by-side feature importances of the baseline and the tuned model.
# Each model gets a one-column frame indexed by the training column names; the
# generator keeps its loop variables out of the notebook namespace.
basic_importance, tuned_importance = (
    pd.DataFrame({column_label: model.feature_importances_}, index=X_train.columns)
    for column_label, model in (
        ('Basic Importance', basic_gbdt),
        ('Tuned Importance', tuned_gbdt),
    )
)

# Both frames share the same index, so a column-wise concat lines them up row by row.
importance_comparison = pd.concat([basic_importance, tuned_importance], axis=1)
print(importance_comparison)

                      Basic Importance  Tuned Importance
Age                           0.012537          0.012182
Alcohol use                   0.208981          0.205380
Smoking                       0.039222          0.037390
chronic Lung Disease          0.023247          0.025132
Fatigue                       0.224081          0.221352
Coughing of Blood             0.352649          0.358052
Chest Pain                    0.042125          0.049644
Genetic Risk                  0.024597          0.028030
OccuPational Hazards          0.072562          0.062838
