In [141]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import mlflow

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

### Conhecendo os dados

In [78]:
df = pd.read_csv('../../data/external/survey_lung_cancer.csv')
print(df.shape)
df.head(3)

(309, 16)


Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

### Tratamentos de  Dados

In [80]:
df.columns = df.columns.str.lower()
df.columns

Index(['gender', 'age', 'smoking', 'yellow_fingers', 'anxiety',
       'peer_pressure', 'chronic disease', 'fatigue ', 'allergy ', 'wheezing',
       'alcohol consuming', 'coughing', 'shortness of breath',
       'swallowing difficulty', 'chest pain', 'lung_cancer'],
      dtype='object')

In [81]:
colunas = {'chronic disease': 'chronic_disease',
           'alcohol consuming': 'alcohol_consuming',
           'shortness of breath': 'shortness_of_breath',
           'swallowing difficulty': 'swallowing_difficulty',
           'chest pain': 'chest_pain'
           }

df.rename(columns=colunas, inplace=True)

In [82]:
df.columns

Index(['gender', 'age', 'smoking', 'yellow_fingers', 'anxiety',
       'peer_pressure', 'chronic_disease', 'fatigue ', 'allergy ', 'wheezing',
       'alcohol_consuming', 'coughing', 'shortness_of_breath',
       'swallowing_difficulty', 'chest_pain', 'lung_cancer'],
      dtype='object')

In [83]:
data = df[['smoking', 'yellow_fingers', 'anxiety',
           'peer_pressure', 'chronic_disease', 'fatigue ',
           'allergy ', 'wheezing', 'alcohol_consuming',
           'coughing', 'shortness_of_breath',
           'swallowing_difficulty', 'chest_pain']]

In [84]:
for label in data.columns:
    data[label] = (data[label]==2).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[label] = (data[label]==2).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[label] = (data[label]==2).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[label] = (data[label]==2).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [86]:
def age_bracket(age: int) -> int:
    if age >= 78:
        return 5
    elif age >= 59:
        return 4
    elif age >= 43:
        return 3
    elif age >= 27:
        return 2
    else:
        return 1

def age_bracket_str(age_bracket: int) -> str:
    if age_bracket == 1:
        return 'generation_Z'
    elif age_bracket == 2:
        return 'millennials'
    elif age_bracket == 3:
        return 'generation_X'
    elif age_bracket == 4:
        return 'baby_boomers'
    else:
        return 'silent_generation'

In [87]:
df['generation'] = df['age'].apply(age_bracket)
df['gen_flag'] = df['generation'].apply(age_bracket_str)

In [88]:
data

Unnamed: 0,smoking,yellow_fingers,anxiety,peer_pressure,chronic_disease,fatigue,allergy,wheezing,alcohol_consuming,coughing,shortness_of_breath,swallowing_difficulty,chest_pain
0,0,1,1,0,0,1,0,1,1,1,1,1,1
1,1,0,0,0,1,1,1,0,0,0,1,1,1
2,0,0,0,1,0,1,0,1,0,1,1,0,1
3,1,1,1,0,0,0,0,0,1,0,0,1,1
4,0,1,0,0,0,0,0,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,0,0,0,1,1,1,0,0,1,1,1,1,0
305,1,0,0,0,0,1,1,1,1,1,1,0,1
306,1,0,0,0,0,0,1,1,1,1,0,0,1
307,1,0,1,0,0,1,1,0,1,1,1,0,1


In [89]:
df['lung_cancer'] = df['lung_cancer'].replace('YES', 1)
df['lung_cancer'] = df['lung_cancer'].replace('NO', 0)

  df['lung_cancer'] = df['lung_cancer'].replace('NO', 0)


In [90]:
df1 = df[['gen_flag', 'gender', 'lung_cancer']]

df_join = df1.join(data, how='left')
df_join

Unnamed: 0,gen_flag,gender,lung_cancer,smoking,yellow_fingers,anxiety,peer_pressure,chronic_disease,fatigue,allergy,wheezing,alcohol_consuming,coughing,shortness_of_breath,swallowing_difficulty,chest_pain
0,baby_boomers,M,1,0,1,1,0,0,1,0,1,1,1,1,1,1
1,baby_boomers,M,1,1,0,0,0,1,1,1,0,0,0,1,1,1
2,baby_boomers,F,0,0,0,0,1,0,1,0,1,0,1,1,0,1
3,baby_boomers,M,0,1,1,1,0,0,0,0,0,1,0,0,1,1
4,baby_boomers,F,0,0,1,0,0,0,0,0,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,generation_X,F,1,0,0,0,1,1,1,0,0,1,1,1,1,0
305,baby_boomers,M,1,1,0,0,0,0,1,1,1,1,1,1,0,1
306,generation_X,M,1,1,0,0,0,0,0,1,1,1,1,0,0,1
307,baby_boomers,M,1,1,0,1,0,0,1,1,0,1,1,1,0,1


In [91]:
df_join.to_csv('../../data/processed/cleaned.csv', index=False)

In [108]:
X = df.drop(columns=['lung_cancer', 'generation', 'age'])
y = df['lung_cancer']

print('Shape X: {}'.format(X.shape))
print('Shape y: {}'.format(y.shape))

Shape X: (309, 15)
Shape y: (309,)


### Pré-processando Colunas Específicas

In [113]:
ohe = OneHotEncoder(handle_unknown='ignore')
preprocessed = ohe.fit_transform(X[['gender', 'gen_flag']])

In [118]:
X_preprocessed = preprocessed.toarray()

In [122]:
df_encoded = pd.DataFrame(X_preprocessed, columns=ohe.get_feature_names_out())
df_encoded.reset_index(drop=True, inplace=True)

In [123]:
df_encoded

Unnamed: 0,gender_F,gender_M,gen_flag_baby_boomers,gen_flag_generation_X,gen_flag_generation_Z,gen_flag_millennials,gen_flag_silent_generation
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
304,1.0,0.0,0.0,1.0,0.0,0.0,0.0
305,0.0,1.0,1.0,0.0,0.0,0.0,0.0
306,0.0,1.0,0.0,1.0,0.0,0.0,0.0
307,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [130]:
train_data = pd.concat([df_join.reset_index(drop=True), df_encoded], axis=1)
train_data

Unnamed: 0,gen_flag,gender,lung_cancer,smoking,yellow_fingers,anxiety,peer_pressure,chronic_disease,fatigue,allergy,...,shortness_of_breath,swallowing_difficulty,chest_pain,gender_F,gender_M,gen_flag_baby_boomers,gen_flag_generation_X,gen_flag_generation_Z,gen_flag_millennials,gen_flag_silent_generation
0,baby_boomers,M,1,0,1,1,0,0,1,0,...,1,1,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,baby_boomers,M,1,1,0,0,0,1,1,1,...,1,1,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,baby_boomers,F,0,0,0,0,1,0,1,0,...,1,0,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,baby_boomers,M,0,1,1,1,0,0,0,0,...,0,1,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,baby_boomers,F,0,0,1,0,0,0,0,0,...,1,0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,generation_X,F,1,0,0,0,1,1,1,0,...,1,1,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
305,baby_boomers,M,1,1,0,0,0,0,1,1,...,1,0,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0
306,generation_X,M,1,1,0,0,0,0,0,1,...,0,0,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0
307,baby_boomers,M,1,1,0,1,0,0,1,1,...,1,0,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [156]:
train_columns = train_data.columns.tolist()
df_train = train_data[train_columns[2:]]
df_train.head()

Unnamed: 0,lung_cancer,smoking,yellow_fingers,anxiety,peer_pressure,chronic_disease,fatigue,allergy,wheezing,alcohol_consuming,...,shortness_of_breath,swallowing_difficulty,chest_pain,gender_F,gender_M,gen_flag_baby_boomers,gen_flag_generation_X,gen_flag_generation_Z,gen_flag_millennials,gen_flag_silent_generation
0,1,0,1,1,0,0,1,0,1,1,...,1,1,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,1,1,0,0,0,1,1,1,0,0,...,1,1,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0,0,0,0,1,0,1,0,1,0,...,1,0,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0,1,1,1,0,0,0,0,0,1,...,0,1,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,0,0,1,0,0,0,0,0,1,0,...,1,0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [135]:
df_train.isna().sum()

lung_cancer                   0
smoking                       0
yellow_fingers                0
anxiety                       0
peer_pressure                 0
chronic_disease               0
fatigue                       0
allergy                       0
wheezing                      0
alcohol_consuming             0
coughing                      0
shortness_of_breath           0
swallowing_difficulty         0
chest_pain                    0
gender_F                      0
gender_M                      0
gen_flag_baby_boomers         0
gen_flag_generation_X         0
gen_flag_generation_Z         0
gen_flag_millennials          0
gen_flag_silent_generation    0
dtype: int64

### Train

In [136]:
X = df_train.drop(columns=['lung_cancer'])
y = df_train[['lung_cancer']]

print('Shape X: {}'.format(X.shape))
print('Shape y: {}'.format(y.shape))

Shape X: (309, 20)
Shape y: (309, 1)


In [140]:
y.value_counts(normalize=True)

lung_cancer
1              0.873786
0              0.126214
Name: proportion, dtype: float64

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

rus = RandomUnderSampler(random_state=42)

X_res, y_res = rus.fit_resample(X_train, y_train)

In [146]:
def rand_search_cv(model, param_grid):
        rand_search = RandomizedSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1, scoring='precision')
        rand_search.fit(X_res, y_res)

        best_model = rand_search.best_estimator_
        predictions = best_model.predict(X_test)
        best_params = rand_search.best_params_
        best_score = rand_search.best_score_

        accuracy = accuracy_score(predictions, y_test)
        recall = recall_score(predictions, y_test)
        precision = precision_score(predictions, y_test)
        f1 = f1_score(predictions, y_test)

        print('Best Model: {}'.format(best_model))
        print('Best Params: {}'.format(best_params))
        print('Best Score: {}'.format(best_score))

        print('Acurácia: {}'.format(accuracy))
        print('Revocação: {}'.format(recall))
        print('Precisão: {}'.format(precision))
        print('F1 Score: {}'.format(f1))

        return

In [154]:
# DecisionTreeClassifier
decision_tree_params = {
    "criterion": ["gini", "entropy", "log_loss"],  # Critério para medir qualidade do split
    "splitter": ["best", "random"],  # Estratégia para dividir o nó
    "max_depth": [None, 10, 20, 30, 50],  # Profundidade máxima da árvore
    "min_samples_split": [2, 5, 10],  # Mínimo de amostras para dividir um nó
    "min_samples_leaf": [1, 2, 5],  # Mínimo de amostras em uma folha
    "max_features": [None, "sqrt", "log2"],  # Número de features a considerar em cada split
    "class_weight": [None, "balanced"]  # Balanceamento entre classes
}

# RandomForestClassifier
random_forest_params = {
    "n_estimators": [100, 200, 500],  # Número de árvores
    "criterion": ["gini", "entropy", "log_loss"],  # Critério de divisão
    "max_depth": [None, 10, 20, 30, 50],  # Profundidade máxima
    "min_samples_split": [2, 5, 10],  # Mínimo de amostras para dividir um nó
    "min_samples_leaf": [1, 2, 5],  # Mínimo de amostras em uma folha
    "max_features": [None, "sqrt", "log2"],  # Número de features para cada split
    "bootstrap": [True, False],  # Uso de amostras de bootstrapping
    "class_weight": [None, "balanced", "balanced_subsample"]  # Balanceamento entre classes
}

# GradientBoostingClassifier
gradient_boosting_params = {
    "n_estimators": [100, 200, 500],  # Número de estágios de boosting
    "learning_rate": [0.01, 0.1, 0.2],  # Taxa de aprendizado
    "max_depth": [3, 5, 10],  # Profundidade máxima das árvores
    "min_samples_split": [2, 5, 10],  # Mínimo de amostras para dividir um nó
    "min_samples_leaf": [1, 2, 5],  # Mínimo de amostras em uma folha
    "subsample": [0.8, 1.0],  # Fração de amostras usadas para treinar cada base learner
    "max_features": [None, "sqrt", "log2"]  # Número de features por split
}

# AdaBoostClassifier
adaboost_params = {
    "n_estimators": [50, 100, 200],  # Número de estimadores
    "learning_rate": [0.1, 0.5, 1.0],  # Taxa de aprendizado
    "algorithm": ["SAMME", "SAMME.R"],  # Algoritmo de boosting
}

In [149]:
rand_search_cv(DecisionTreeClassifier(), decision_tree_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Model: DecisionTreeClassifier(max_depth=20, max_features='sqrt', min_samples_split=10,
                       splitter='random')
Best Params: {'splitter': 'random', 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'criterion': 'gini', 'class_weight': None}
Best Score: 0.7811111111111112
Acurácia: 0.6881720430107527
Revocação: 1.0
Precisão: 0.6547619047619048
F1 Score: 0.7913669064748201


In [150]:
rand_search_cv(RandomForestClassifier(), random_forest_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Best Model: RandomForestClassifier(bootstrap=False, class_weight='balanced_subsample',
                       criterion='entropy', max_depth=50, min_samples_leaf=5,
                       min_samples_split=10, n_estimators=200)
Best Params: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 50, 'criterion': 'entropy', 'class_weight': 'balanced_subsample', 'bootstrap': False}
Best Score: 0.858095238095238
Acurácia: 0.8924731182795699
Revocação: 1.0
Precisão: 0.8809523809523809
F1 Score: 0.9367088607594937


  return fit_method(estimator, *args, **kwargs)


In [151]:
rand_search_cv(GradientBoostingClassifier(), gradient_boosting_params)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Model: GradientBoostingClassifier(max_depth=5, max_features='sqrt', min_samples_leaf=5,
                           min_samples_split=5, n_estimators=500)
Best Params: {'subsample': 1.0, 'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 5, 'learning_rate': 0.1}
Best Score: 0.8295238095238096
Acurácia: 0.8709677419354839
Revocação: 1.0
Precisão: 0.8571428571428571
F1 Score: 0.9230769230769231


In [155]:
rand_search_cv(AdaBoostClassifier(), adaboost_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Model: AdaBoostClassifier(learning_rate=0.5, n_estimators=100)
Best Params: {'n_estimators': 100, 'learning_rate': 0.5, 'algorithm': 'SAMME.R'}
Best Score: 0.7716666666666667
Acurácia: 0.8817204301075269
Revocação: 0.9866666666666667
Precisão: 0.8809523809523809
F1 Score: 0.9308176100628931


  y = column_or_1d(y, warn=True)
