# Preprocessing

## Import Libraries

In [1]:
# !pip install -q -U pandas
# !pip install -q -U matplotlib
# !pip install -q -U numpy
# !pip install -q -U seaborn
# !pip install -q -U scikit-learn
# !pip install -q -U imbalanced-learn
# !pip install -q -U Pillow
# !pip install -q -U xgboost
# !pip install -q -U lightgbm
# !pip install -q -U keras
# !pip install -q -U tensorflow
# !pip install -q -U joblib

In [2]:
import os
from PIL import Image

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# text models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier


# for handling imbalanced data
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from sklearn.utils import class_weight

import joblib
import pickle

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

## Set Up

In [3]:
%matplotlib inline
plt.rcParams['figure.figsize'] = [6,4]
cmap = mpl.colormaps['viridis']
sns.set_theme(style='whitegrid', palette='muted', font_scale=1.5)

In [4]:
data_path = 'G:\\HSUHK\\COM6003\\project\\archive'

# Function

## Data Processing

**three skin cancers (BCC, MEL, and SCC) and three skin disease (ACK, NEV, and SEK)**

And we remove the "biopsed" feature, because:
- Avoiding bias towards biopsied cases
- Preventing data leakage from biopsy results
- Improving model generalization to cases without biopsy data
- Aligning the model with the intended use case of pre-biopsy diagnosis

In [5]:
class DataProcessing:
    def __init__(self, data: pd.DataFrame, path=None):
        self.path = path
        self.data = data.copy()
        self.data = self.data.drop(columns=['biopsed'])
        self.data['patient_id'] = self.data['patient_id'].str.replace('PAT_','',regex=False).astype('int64')
        self.data = self.data.replace(['UNK','NaN'], np.nan)
        self.missing_percentage = round(self.data.isna().sum()*100/self.data.shape[0], 1)
    
    def keepNanText_6(self):
        """
        Keep the nan text in the data, 
        if the percentage of missing value is less than 10, then impute the feature using IterativeImputer,
        else if the percentage of missing value is higher than 10, then replace the nan text with 'Unknown',
        use it as a new feature.
        """
        imputer = IterativeImputer()
        enc = LabelEncoder()
        for col, percentage in self.missing_percentage.items():
            if self.data[col].dtype == 'object':
                if percentage < 10:
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data[col] = self.data[col].fillna('Unknown')
                    self.data[col] = self.data[col].astype('str')
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = self.data[col].replace(len(enc.classes_), -1).astype('int64')

                    # output the encode method
                    encoding_mapping = dict(zip(enc.classes_, range(len(enc.classes_))))
                    print(f"Feature '{col}':")
                    for category, encoding in encoding_mapping.items():
                        if category == 'Unknown':
                            print(f"{category}: -1")
                        else:
                            print(f"{category}: {encoding}")
                    print('-'*30)
            else:
                if percentage < 10:
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data[col] = self.data[col].fillna(-1)
                    self.data[col] = self.data[col].astype('int64')
        
        self.data.drop(columns='img_id', inplace=True)

        return self.data
    
    def dropNanText_6(self):
        """
        Drop the rows with missing values in the data,
        if the percentage of missing value is less than 10, then impute the feature using IterativeImputer,
        else if the percentage of missing value is higher than 10, then drop the columns.
        """
        imputer = IterativeImputer()
        enc = LabelEncoder()
        for col, percentage in self.missing_percentage.items():
            if self.data[col].dtype == 'object':
                if percentage < 10:
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data.drop(columns=[col], inplace=True)
            else:
                if percentage < 10:
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data.drop(columns=[col], inplace=True)

        self.data.drop(columns='img_id', inplace=True)

        return self.data
    
    def keepNanText_2(self):
        """
        Keep the nan text in the data, and change the y label to binary.
        """
        imputer = IterativeImputer()
        enc = LabelEncoder()
        for col, percentage in self.missing_percentage.items():
            if col == 'diagnostic':
                self.data['is_cancer'] = np.where(self.data['diagnostic'].isin(['BCC','MEL','SCC']),1,0)
                self.data.drop(columns=['diagnostic'], inplace=True)
            elif self.data[col].dtype == 'object':
                if percentage < 10:
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data[col] = self.data[col].fillna('Unknown')
                    self.data[col] = self.data[col].astype('str')
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = self.data[col].replace(len(enc.classes_), -1).astype('int64')

                    # output the encode method
                    encoding_mapping = dict(zip(enc.classes_, range(len(enc.classes_))))
                    print(f"Feature '{col}':")
                    for category, encoding in encoding_mapping.items():
                        if category == 'Unknown':
                            print(f"{category}: -1")
                        else:
                            print(f"{category}: {encoding}")
                    print('-'*30)
            else:
                if percentage < 10:
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data[col] = self.data[col].fillna(-1)
                    self.data[col] = self.data[col].astype('int64')

        self.data.drop(columns='img_id', inplace=True)

        return self.data
    
    def dropNanText_2(self):
        """
        Drop the nan columns in the data, and change the y label to binary.
        """
        imputer = IterativeImputer()
        enc = LabelEncoder()
        for col, percentage in self.missing_percentage.items():
            if col == 'diagnostic':
                self.data['is_cancer'] = np.where(self.data['diagnostic'].isin(['BCC','MEL','SCC']),1,0)
                self.data.drop(columns=['diagnostic'], inplace=True)
            elif self.data[col].dtype == 'object':
                if percentage < 10:
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data.drop(columns=[col], inplace=True)
            else:
                if percentage < 10:
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data.drop(columns=[col], inplace=True)

        self.data.drop(columns='img_id', inplace=True)

        return self.data
    
    def imageIndex2(self):
        """
        return a index of two labels of images.
        """
        for i in self.data.columns:
            if i == 'img_id':
                pass
            elif i == 'diagnostic':
                self.data['is_cancer'] = np.where(self.data['diagnostic'].isin(['BCC','MEL','SCC']),1,0)
                self.data.drop(columns='diagnostic', inplace=True)
            else:
                self.data.drop(columns=[i], inplace=True)
        
        self.data = self.data.reset_index()
        
        return self.data
    
    def imageIndex6(self):
        """
        return a index of six labels of images.
        """
        enc = LabelEncoder()
        for i in self.data.columns:
            if i == 'img_id':
                pass
            elif i == 'diagnostic':
                self.data['diagnostic'] = enc.fit_transform(self.data['diagnostic'])
                # output the encode method
                encoding_mapping = dict(zip(enc.classes_, range(len(enc.classes_))))
                print(f"Feature 'diagnostic':")
                for category, encoding in encoding_mapping.items():
                    if category == 'Unknown':
                        print(f"{category}: -1")
                    else:
                        print(f"{category}: {encoding}")
            else:
                self.data.drop(columns=[i], inplace=True)
        
        self.data = self.data.reset_index()
        
        return self.data


## Imbalanced Data

In [6]:
def balance_data(X, y):
    # calculate the number of samples in each class
    class_counts = Counter(y)
    min_class_count = min(class_counts.values())

    # set the sampling strategy for both over and under sampling
    over_sample_strategy = {label: 2 * min_class_count for label in class_counts.keys() if class_counts[label] <= (2 * min_class_count)}
    under_sample_strategy = {label: 2 * min_class_count for label in class_counts.keys() if class_counts[label] > (2 * min_class_count)}

    # create a pipeline for resampling
    pipe = make_pipeline(
        SMOTE(sampling_strategy=over_sample_strategy),
        NearMiss(sampling_strategy=under_sample_strategy)
    )

    # resample the data
    X_resampled, y_resampled = pipe.fit_resample(X, y)

    # calculate the class weights
    class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=np.unique(y),
                                                      y=y)
    class_weights = dict(enumerate(class_weights))

    return X_resampled, y_resampled, class_weights

## Text Model

In [7]:
def textModel(x_train, y_train, x_test, y_test, class_weights=None):
    svm = SVC(kernel='linear', random_state=42)
    svm.fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    print(f'Model 1: {classification_report(y_test, y_pred)}')
    
    rf = RandomForestClassifier(n_estimators=100, class_weight=class_weights, random_state=42)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    print(f'Model 2: {classification_report(y_test, y_pred)}')

    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    print(f'Model 3: {classification_report(y_test, y_pred)}')

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    print(f'Model 4: {classification_report(y_test, y_pred)}')

    num_class = len(np.unique(y_train))
    if num_class == 2:
        num_class = 1

    lgbm = LGBMClassifier(num_class=num_class, class_weight=class_weights, random_state=42, verbose=-1)
    lgbm.fit(x_train, y_train)
    y_pred = lgbm.predict(x_test)
    print(f'Model 5: {classification_report(y_test, y_pred)}')

    return svm, rf, xgb, knn, lgbm

## Load Dataset

Explain of the features:
- background_father: The history of any diseases or health conditions related to the patient's father, including any history of skin cancer or other diseases that may be related to skin cancer
- background_mother: The history of any diseases or health conditions related to the patient's mother, including any history of skin cancer or other diseases that may be related to skin cancer
- has_piped_water: Indicates whether the location or area of the patient's residence has access to piped water or not
- has_sewage_system: Indicates whether the location or area of the patient's residence has a proper sewage system or not
- fitspatrick: Skin tolerance to sunlight
- itch: Whether the lesion or wound has itched or not
- elevation: Description of the of the lesion or wound relative to the skin surface of the patient
- biopsed: Whether the lesion or wound has been biopsied or not

In [8]:
metadata = pd.read_csv(os.path.join(data_path, 'metadata.csv'))

## Data Information

In [9]:
metadata.shape

(2298, 26)

In [10]:
metadata.head()

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,...,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed
0,PAT_1516,1765,,,,,8,,,,...,,NEV,False,False,False,False,False,False,PAT_1516_1765_530.png,False
1,PAT_46,881,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,...,5.0,BCC,True,True,False,True,True,True,PAT_46_881_939.png,True
2,PAT_1545,1867,,,,,77,,,,...,,ACK,True,False,False,False,False,False,PAT_1545_1867_547.png,False
3,PAT_1989,4061,,,,,75,,,,...,,ACK,True,False,False,False,False,False,PAT_1989_4061_934.png,False
4,PAT_684,1302,False,True,POMERANIA,POMERANIA,79,False,MALE,True,...,5.0,BCC,True,True,False,False,True,True,PAT_684_1302_588.png,True


In [11]:
def count_is_null(data:pd.DataFrame):
    countNaN = data.isna().sum()
    return f'{countNaN}({countNaN*100/data.shape[0]:.1f}%)'
def count_is_null_unique(data:pd.DataFrame):
    return data.count()-data.nunique()
def data_info(data:pd.DataFrame):
    return data.agg(['count', 'nunique', count_is_null_unique, count_is_null, 'dtype']).T

In [12]:
data_info(metadata)

Unnamed: 0,count,nunique,count_is_null_unique,count_is_null,dtype
patient_id,2298,1373,925,0(0.0%),object
lesion_id,2298,1641,657,0(0.0%),int64
smoke,1494,2,1492,804(35.0%),object
drink,1494,2,1492,804(35.0%),object
background_father,1480,13,1467,818(35.6%),object
background_mother,1476,11,1465,822(35.8%),object
age,2298,84,2214,0(0.0%),int64
pesticide,1494,2,1492,804(35.0%),object
gender,1494,2,1492,804(35.0%),object
skin_cancer_history,1494,2,1492,804(35.0%),object


# Method One

## Data Processing

In [13]:
keep_df6 = DataProcessing(metadata).keepNanText_6()

Feature 'smoke':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'drink':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'background_father':
AUSTRIA: 0
BRASIL: 1
BRAZIL: 2
CZECH: 3
GERMANY: 4
ISRAEL: 5
ITALY: 6
NETHERLANDS: 7
POLAND: 8
POMERANIA: 9
PORTUGAL: 10
SPAIN: 11
Unknown: -1
------------------------------
Feature 'background_mother':
BRAZIL: 0
FRANCE: 1
GERMANY: 2
ITALY: 3
NETHERLANDS: 4
NORWAY: 5
POLAND: 6
POMERANIA: 7
PORTUGAL: 8
SPAIN: 9
Unknown: -1
------------------------------
Feature 'pesticide':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'gender':
FEMALE: 0
MALE: 1
Unknown: -1
------------------------------
Feature 'skin_cancer_history':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'cancer_history':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'has_piped_water':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'has_sewage_system':
False: 0


In [14]:
data_info(keep_df6)

Unnamed: 0,count,nunique,count_is_null_unique,count_is_null,dtype
patient_id,2298,1373,925,0(0.0%),float64
lesion_id,2298,1641,657,0(0.0%),float64
smoke,2298,3,2295,0(0.0%),int64
drink,2298,3,2295,0(0.0%),int64
background_father,2298,13,2285,0(0.0%),int64
background_mother,2298,11,2287,0(0.0%),int64
age,2298,84,2214,0(0.0%),float64
pesticide,2298,3,2295,0(0.0%),int64
gender,2298,3,2295,0(0.0%),int64
skin_cancer_history,2298,3,2295,0(0.0%),int64


In [15]:
keep_df6.drop(columns=['patient_id', 'lesion_id'], inplace=True)
keep_df6 = keep_df6.astype('int64')

In [16]:
keep_df6.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
smoke,2298.0,0.787641,0.931332,0.0,0.0,0.0,2.0,2.0
drink,2298.0,0.859878,0.905861,0.0,0.0,1.0,2.0,2.0
background_father,2298.0,8.544386,3.396583,0.0,6.0,9.0,12.0,12.0
background_mother,2298.0,6.445605,3.463831,0.0,3.0,7.0,10.0,10.0
age,2298.0,60.464752,15.894866,6.0,52.0,62.0,72.0,94.0
pesticide,2298.0,0.95953,0.859589,0.0,0.0,1.0,2.0,2.0
gender,2298.0,1.022193,0.823011,0.0,0.0,1.0,2.0,2.0
skin_cancer_history,2298.0,0.996084,0.839015,0.0,0.0,1.0,2.0,2.0
cancer_history,2298.0,1.038729,0.812279,0.0,0.0,1.0,2.0,2.0
has_piped_water,2298.0,1.101393,0.767021,0.0,1.0,1.0,2.0,2.0


## Standardize and Data Split

In [17]:
x = keep_df6.drop(columns=['diagnostic'])
columns = x.columns
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
x = pd.DataFrame(x, columns=columns)
y = keep_df6['diagnostic']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## resample

In [18]:
x_train, y_train, class_weights = balance_data(x_train, y_train)

## Model

In [19]:
svm, rf, xgb, knn, lgbm = textModel(x_train, y_train, x_test, y_test, class_weights)

Model 1:               precision    recall  f1-score   support

           0       0.55      0.29      0.38       133
           1       0.68      0.17      0.27       177
           2       0.31      0.62      0.41        13
           3       0.63      0.54      0.58        50
           4       0.18      0.57      0.27        44
           5       0.25      0.79      0.38        43

    accuracy                           0.35       460
   macro avg       0.43      0.50      0.38       460
weighted avg       0.54      0.35      0.35       460

Model 2:               precision    recall  f1-score   support

           0       0.58      0.29      0.38       133
           1       0.78      0.31      0.44       177
           2       0.48      0.77      0.59        13
           3       0.70      0.66      0.68        50
           4       0.21      0.64      0.32        44
           5       0.27      0.79      0.41        43

    accuracy                           0.43       460
   ma

## Stacking

In [20]:
# stack model
estimators1 = [('svc', svm), ('rf', rf), ('xgb', xgb), ('knn', knn), ('lgbm', lgbm)]
stack_model1 = StackingClassifier(estimators=estimators1, final_estimator=XGBClassifier())

In [21]:
# fit the model on the training data
stack_model1.fit(x_train, y_train)
# make predictions
y_pred = stack_model1.predict(x_test)
# calculate the classification report
stack_model1_result = classification_report(y_test, y_pred)
print(f'Stack Model: {stack_model1_result}')

Stack Model:               precision    recall  f1-score   support

           0       0.57      0.26      0.35       133
           1       0.76      0.22      0.34       177
           2       0.59      0.77      0.67        13
           3       0.54      0.56      0.55        50
           4       0.21      0.66      0.31        44
           5       0.22      0.70      0.33        43

    accuracy                           0.37       460
   macro avg       0.48      0.53      0.43       460
weighted avg       0.57      0.37      0.37       460



# Method Two

## Data Processing

In [22]:
drop_df6 = DataProcessing(metadata).dropNanText_6()

In [23]:
drop_df6.drop(columns=['patient_id', 'lesion_id'], inplace=True)
drop_df6 = drop_df6.astype('int64')

In [24]:
drop_df6.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,2298.0,60.464752,15.894866,6.0,52.0,62.0,72.0,94.0
region,2298.0,5.464752,3.202201,0.0,3.0,5.0,7.0,13.0
diagnostic,2298.0,1.577023,1.679717,0.0,0.0,1.0,3.0,5.0
itch,2298.0,0.638381,0.485979,0.0,0.0,1.0,1.0,2.0
hurt,2298.0,0.181462,0.396617,0.0,0.0,0.0,0.0,2.0
bleed,2298.0,0.272411,0.451125,0.0,0.0,0.0,1.0,2.0
elevation,2298.0,0.625326,0.485939,0.0,0.0,1.0,1.0,2.0


## Standardize and Data Split

In [25]:
x = drop_df6.drop(columns=['diagnostic'])
columns = x.columns
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
x = pd.DataFrame(x, columns=columns)
y = drop_df6['diagnostic']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## resample

In [26]:
x_train, y_train, class_weights = balance_data(x_train, y_train)

## Model

In [27]:
svm, rf, xgb, knn, lgbm = textModel(x_train, y_train, x_test, y_test, class_weights)

Model 1:               precision    recall  f1-score   support

           0       0.74      0.37      0.49       133
           1       0.44      0.17      0.24       177
           2       0.15      0.62      0.24        13
           3       0.81      0.60      0.69        50
           4       0.13      0.64      0.21        44
           5       0.44      0.19      0.26        43

    accuracy                           0.33       460
   macro avg       0.45      0.43      0.36       460
weighted avg       0.53      0.33      0.36       460

Model 2:               precision    recall  f1-score   support

           0       0.78      0.14      0.23       133
           1       0.40      0.10      0.16       177
           2       0.18      0.38      0.24        13
           3       0.60      0.72      0.65        50
           4       0.14      0.84      0.24        44
           5       0.32      0.30      0.31        43

    accuracy                           0.28       460
   ma

Model 3:               precision    recall  f1-score   support

           0       0.77      0.17      0.28       133
           1       0.42      0.14      0.21       177
           2       0.22      0.38      0.28        13
           3       0.62      0.72      0.67        50
           4       0.14      0.80      0.23        44
           5       0.25      0.21      0.23        43

    accuracy                           0.29       460
   macro avg       0.40      0.40      0.32       460
weighted avg       0.49      0.29      0.29       460

Model 4:               precision    recall  f1-score   support

           0       0.60      0.18      0.28       133
           1       0.31      0.11      0.16       177
           2       0.19      0.23      0.21        13
           3       0.62      0.66      0.64        50
           4       0.12      0.73      0.21        44
           5       0.22      0.14      0.17        43

    accuracy                           0.25       460
   ma

## Stacking

In [28]:
# stack model
estimators2 = [('svc', svm), ('rf', rf), ('xgb', xgb), ('knn', knn), ('lgbm', lgbm)]
stack_model2 = StackingClassifier(estimators=estimators2, final_estimator=XGBClassifier())

In [29]:
# fit the model on the training data
stack_model2.fit(x_train, y_train)
# make predictions
y_pred = stack_model2.predict(x_test)
# calculate the classification report
stack_model2_result = classification_report(y_test, y_pred)
print(f'Stack Model: {stack_model2_result}')

Stack Model:               precision    recall  f1-score   support

           0       0.54      0.10      0.17       133
           1       0.34      0.07      0.12       177
           2       0.13      0.46      0.21        13
           3       0.64      0.72      0.68        50
           4       0.13      0.77      0.22        44
           5       0.26      0.21      0.23        43

    accuracy                           0.24       460
   macro avg       0.34      0.39      0.27       460
weighted avg       0.40      0.24      0.22       460



# Method Three

## Data Processing

In [30]:
keep_df2 = DataProcessing(metadata).keepNanText_2()

Feature 'smoke':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'drink':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'background_father':
AUSTRIA: 0
BRASIL: 1
BRAZIL: 2
CZECH: 3
GERMANY: 4
ISRAEL: 5
ITALY: 6
NETHERLANDS: 7
POLAND: 8
POMERANIA: 9
PORTUGAL: 10
SPAIN: 11
Unknown: -1
------------------------------
Feature 'background_mother':
BRAZIL: 0
FRANCE: 1
GERMANY: 2
ITALY: 3
NETHERLANDS: 4
NORWAY: 5
POLAND: 6
POMERANIA: 7
PORTUGAL: 8
SPAIN: 9
Unknown: -1
------------------------------
Feature 'pesticide':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'gender':
FEMALE: 0
MALE: 1
Unknown: -1
------------------------------
Feature 'skin_cancer_history':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'cancer_history':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'has_piped_water':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'has_sewage_system':
False: 0


In [31]:
keep_df2.drop(columns=['patient_id', 'lesion_id'], inplace=True)
keep_df2 = keep_df2.astype('int64')

In [32]:
keep_df2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
smoke,2298.0,0.787641,0.931332,0.0,0.0,0.0,2.0,2.0
drink,2298.0,0.859878,0.905861,0.0,0.0,1.0,2.0,2.0
background_father,2298.0,8.544386,3.396583,0.0,6.0,9.0,12.0,12.0
background_mother,2298.0,6.445605,3.463831,0.0,3.0,7.0,10.0,10.0
age,2298.0,60.464752,15.894866,6.0,52.0,62.0,72.0,94.0
pesticide,2298.0,0.95953,0.859589,0.0,0.0,1.0,2.0,2.0
gender,2298.0,1.022193,0.823011,0.0,0.0,1.0,2.0,2.0
skin_cancer_history,2298.0,0.996084,0.839015,0.0,0.0,1.0,2.0,2.0
cancer_history,2298.0,1.038729,0.812279,0.0,0.0,1.0,2.0,2.0
has_piped_water,2298.0,1.101393,0.767021,0.0,1.0,1.0,2.0,2.0


## Standardize and Data Split

In [33]:
x = keep_df2.drop(columns=['is_cancer'])
columns = x.columns
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
x = pd.DataFrame(x, columns=columns)
y = keep_df2['is_cancer']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## Model

In [34]:
svm, rf, xgb, knn, lgbm = textModel(x_train, y_train, x_test, y_test)

Model 1:               precision    recall  f1-score   support

           0       0.90      0.83      0.86       226
           1       0.85      0.91      0.88       234

    accuracy                           0.87       460
   macro avg       0.87      0.87      0.87       460
weighted avg       0.87      0.87      0.87       460

Model 2:               precision    recall  f1-score   support

           0       0.96      0.88      0.92       226
           1       0.89      0.97      0.93       234

    accuracy                           0.92       460
   macro avg       0.93      0.92      0.92       460
weighted avg       0.93      0.92      0.92       460

Model 3:               precision    recall  f1-score   support

           0       0.94      0.90      0.92       226
           1       0.91      0.94      0.92       234

    accuracy                           0.92       460
   macro avg       0.92      0.92      0.92       460
weighted avg       0.92      0.92      0.92    

## Stacking

In [35]:
# stack model
estimators3 = [('svc', svm), ('rf', rf), ('xgb', xgb), ('knn', knn), ('lgbm', lgbm)]
stack_model3 = StackingClassifier(estimators=estimators3, final_estimator=LGBMClassifier(verbose=-1))

In [36]:
# fit the model on the training data
stack_model3.fit(x_train, y_train)
# make predictions
y_pred = stack_model3.predict(x_test)
# calculate the classification report
stack_model3_result = classification_report(y_test, y_pred)
print(f'Stack Model: {stack_model3_result}')

Stack Model:               precision    recall  f1-score   support

           0       0.93      0.89      0.91       226
           1       0.90      0.93      0.92       234

    accuracy                           0.91       460
   macro avg       0.91      0.91      0.91       460
weighted avg       0.91      0.91      0.91       460



# Method Four

## Data Processing

In [37]:
drop_df2 = DataProcessing(metadata).dropNanText_2()

In [38]:
drop_df2.drop(columns=['patient_id', 'lesion_id'], inplace=True)
drop_df2 = drop_df2.astype('int64')

In [39]:
drop_df2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,2298.0,60.464752,15.894866,6.0,52.0,62.0,72.0,94.0
region,2298.0,5.464752,3.202201,0.0,3.0,5.0,7.0,13.0
itch,2298.0,0.638381,0.485979,0.0,0.0,1.0,1.0,2.0
hurt,2298.0,0.181462,0.396617,0.0,0.0,0.0,0.0,2.0
bleed,2298.0,0.272411,0.451125,0.0,0.0,0.0,1.0,2.0
elevation,2298.0,0.625326,0.485939,0.0,0.0,1.0,1.0,2.0
is_cancer,2298.0,0.47389,0.499426,0.0,0.0,0.0,1.0,1.0


## Standardize and Data Split

In [40]:
x = drop_df2.drop(columns=['is_cancer'])
columns = x.columns
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
x = pd.DataFrame(x, columns=columns)
y = drop_df2['is_cancer']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## Model

In [41]:
svm, rf, xgb, knn, lgbm = textModel(x_train, y_train, x_test, y_test)

Model 1:               precision    recall  f1-score   support

           0       0.63      0.86      0.73       226
           1       0.79      0.51      0.62       234

    accuracy                           0.68       460
   macro avg       0.71      0.69      0.68       460
weighted avg       0.71      0.68      0.68       460

Model 2:               precision    recall  f1-score   support

           0       0.81      0.77      0.79       226
           1       0.79      0.83      0.81       234

    accuracy                           0.80       460
   macro avg       0.80      0.80      0.80       460
weighted avg       0.80      0.80      0.80       460

Model 3:               precision    recall  f1-score   support

           0       0.78      0.79      0.79       226
           1       0.79      0.79      0.79       234

    accuracy                           0.79       460
   macro avg       0.79      0.79      0.79       460
weighted avg       0.79      0.79      0.79    

## Stacking

In [42]:
# stack model
estimators4 = [('svc', svm), ('rf', rf), ('xgb', xgb), ('knn', knn), ('lgbm', lgbm)]
stack_model4 = StackingClassifier(estimators=estimators4, final_estimator=RandomForestClassifier())

In [43]:
# fit the model on the training data
stack_model4.fit(x_train, y_train)
# make predictions
y_pred = stack_model4.predict(x_test)
# calculate the classification report
stack_model4_result = classification_report(y_test, y_pred)
print(f'Stack Model: {stack_model4_result}')

Stack Model:               precision    recall  f1-score   support

           0       0.78      0.79      0.79       226
           1       0.79      0.79      0.79       234

    accuracy                           0.79       460
   macro avg       0.79      0.79      0.79       460
weighted avg       0.79      0.79      0.79       460



# Conclusion

In [44]:
print(f"Stack Model 1: {stack_model1_result}")
print('-'*30)
print(f"Stack Model 2: {stack_model2_result}")
print('-'*30)
print(f"Stack Model 3: {stack_model3_result}")
print('-'*30)
print(f"Stack Model 4: {stack_model4_result}")

Stack Model 1:               precision    recall  f1-score   support

           0       0.57      0.26      0.35       133
           1       0.76      0.22      0.34       177
           2       0.59      0.77      0.67        13
           3       0.54      0.56      0.55        50
           4       0.21      0.66      0.31        44
           5       0.22      0.70      0.33        43

    accuracy                           0.37       460
   macro avg       0.48      0.53      0.43       460
weighted avg       0.57      0.37      0.37       460

------------------------------
Stack Model 2:               precision    recall  f1-score   support

           0       0.54      0.10      0.17       133
           1       0.34      0.07      0.12       177
           2       0.13      0.46      0.21        13
           3       0.64      0.72      0.68        50
           4       0.13      0.77      0.22        44
           5       0.26      0.21      0.23        43

    accuracy    

# Save the Model

In [45]:
# save the model
joblib.dump(stack_model3, 'stack_model3.joblib')
with open('stack_model3.pkl', 'wb') as f:
    pickle.dump(stack_model3, f)

In [46]:
image_df2 = DataProcessing(metadata).imageIndex2()
image_df2.to_csv(os.path.join(data_path, 'imageIndex2.csv'), index=False)

In [47]:
image_df6 = DataProcessing(metadata).imageIndex6()
image_df6.to_csv(os.path.join(data_path, 'imageIndex6.csv'), index=False)

Feature 'diagnostic':
ACK: 0
BCC: 1
MEL: 2
NEV: 3
SCC: 4
SEK: 5


# Next Step:
1. evaluate model
2. feature augmentation
3. feture selection
4. multicollinear feature