In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report

import joblib

In [2]:
data_path = 'G:\\HSUHK\\COM6003\\project\\archive'
meta_df = pd.read_csv(data_path + '\\metadata.csv')  # original dataset
img_train_2 = pd.read_csv(data_path+'\\meta_input_2.csv')
img_train_6 = pd.read_csv(data_path+'\\meta_input_6.csv')
img_val_2 = pd.read_csv(data_path+'\\meta_val_2.csv')
img_val_6 = pd.read_csv(data_path+'\\meta_val_6.csv')
img_test_2 = pd.read_csv(data_path+'\\meta_test_2.csv')
img_test_6 = pd.read_csv(data_path+'\\meta_test_6.csv')
img_index_2 = pd.read_csv(data_path+'\\imageIndex2.csv')
img_index_6 = pd.read_csv(data_path+'\\imageIndex6.csv')

# Metadata Processing

In [3]:
meta_df.drop(columns=['biopsed','patient_id','lesion_id'],inplace=True)
meta_df.replace(['UNK','NaN',np.nan], 'UK',inplace=True)

In [4]:
meta_df.replace('UK', np.nan,inplace=True)

In [5]:
img_index_6.drop(columns='index',inplace=True)
img_index_2.drop(columns='index',inplace=True)

In [6]:
x_train_2 = pd.concat([img_train_2,img_val_2],axis=0,ignore_index=True)
x_train_6 = pd.concat([img_train_6,img_val_6],axis=0,ignore_index=True)
x_test_2 = img_test_2
x_test_6 = img_test_6

In [7]:
x_train_2.drop(columns=['Unnamed: 0'],inplace=True)
x_train_6.drop(columns=['Unnamed: 0'],inplace=True)
x_test_2.drop(columns=['Unnamed: 0'],inplace=True)
x_test_6.drop(columns=['Unnamed: 0'],inplace=True)

In [8]:
x_train_2['img_id'] = x_train_2['0'].str.extract(r'([^\\]+\.png$)',expand=False)
x_train_2.drop(columns=['0'],inplace=True)

In [9]:
x_train_6['img_id'] = x_train_6['0'].str.extract(r'([^\\]+\.png$)',expand=False)
x_train_6.drop(columns=['0'],inplace=True)

In [10]:
x_test_2['img_id'] = x_test_2['0'].str.extract(r'([^\\]+\.png$)',expand=False)
x_test_2.drop(columns=['0'],inplace=True)

In [11]:
x_test_6['img_id'] = x_test_6['0'].str.extract(r'([^\\]+\.png$)',expand=False)
x_test_6.drop(columns=['0'],inplace=True)

In [12]:
y_train_2 = img_index_2[img_index_2['img_id'].isin(x_train_2['img_id'])]
y_train_6 = img_index_6[img_index_6['img_id'].isin(x_train_6['img_id'])]
y_test_2 = img_index_2[img_index_2['img_id'].isin(x_test_2['img_id'])]
y_test_6 = img_index_6[img_index_6['img_id'].isin(x_test_6['img_id'])]

In [13]:
df_train_2 = pd.merge(x_train_2,meta_df,on='img_id',how='inner')
df_train_6 = pd.merge(x_train_6,meta_df,on='img_id',how='inner')
df_test_2 = pd.merge(x_test_2,meta_df,on='img_id',how='inner')
df_test_6 = pd.merge(x_test_6,meta_df,on='img_id',how='inner')

In [14]:
df_train_2.drop(columns='diagnostic',inplace=True)
df_train_6.drop(columns='diagnostic',inplace=True)
df_test_2.drop(columns='diagnostic',inplace=True)
df_test_6.drop(columns='diagnostic',inplace=True)

In [15]:
df_train_2.replace('UK',np.nan,inplace=True)
df_train_6.replace('UK',np.nan,inplace=True)
df_test_2.replace('UK',np.nan,inplace=True)
df_test_6.replace('UK',np.nan,inplace=True)

In [16]:
def preprocess_data(df):
    imputer = IterativeImputer()
    enc = LabelEncoder()
    missing_percentage = round(df.isna().sum()*100/df.shape[0],1)
    for col, percentage in missing_percentage.items():
        if df[col].dtype == 'object':
            if percentage < 10:
                df[col] = enc.fit_transform(df[col])
                df[col] = imputer.fit_transform(df[[col]])
            else:
                df[col] = df[col].fillna('Unknown')
                df[col] = df[col].astype('str')
                df[col] = enc.fit_transform(df[col])
                df[col] = df[col].replace(len(enc.classes_), -1).astype('int64')

                # output the encode method
                encoding_mapping = dict(zip(enc.classes_, range(len(enc.classes_))))
                print(f"Feature '{col}':")
                for category, encoding in encoding_mapping.items():
                    if category == 'Unknown':
                        print(f"{category}: -1")
                    else:
                        print(f"{category}: {encoding}")
                print('-' * 30)
        else:
            if percentage < 10:
                df[col] = imputer.fit_transform(df[[col]])
            else:
                df[col] = df[col].fillna(-1)
                df[col] = df[col].astype('int64')

    return df

In [17]:
df_train_2 = preprocess_data(df_train_2)
df_train_6 = preprocess_data(df_train_6)
df_test_2 = preprocess_data(df_test_2)
df_test_6 = preprocess_data(df_test_6)

Feature 'smoke':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'drink':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'background_father':
BRAZIL: 0
CZECH: 1
GERMANY: 2
ITALY: 3
NETHERLANDS: 4
POLAND: 5
POMERANIA: 6
PORTUGAL: 7
Unknown: -1
------------------------------
Feature 'background_mother':
BRAZIL: 0
FRANCE: 1
GERMANY: 2
ITALY: 3
NETHERLANDS: 4
POLAND: 5
POMERANIA: 6
PORTUGAL: 7
SPAIN: 8
Unknown: -1
------------------------------
Feature 'pesticide':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'gender':
FEMALE: 0
MALE: 1
Unknown: -1
------------------------------
Feature 'skin_cancer_history':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'cancer_history':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'has_piped_water':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'has_sewage_system':
False: 0
True: 1
Unknown: -1
------------------------------
F

In [18]:
scaler = MinMaxScaler()
df_train_2 = scaler.fit_transform(df_train_2)
df_train_6 = scaler.fit_transform(df_train_6)
df_test_2 = scaler.fit_transform(df_test_2)
df_test_6 = scaler.fit_transform(df_test_6)

In [19]:
y_train_2.drop(columns='img_id',inplace=True)
y_train_2.values.ravel()
y_train_6.drop(columns='img_id',inplace=True)
y_train_6.values.ravel()
y_test_2.drop(columns='img_id',inplace=True)
y_test_2.values.ravel()
y_test_6.drop(columns='img_id',inplace=True)
y_test_6.values.ravel()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train_2.drop(columns='img_id',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train_6.drop(columns='img_id',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test_2.drop(columns='img_id',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test_6.drop(columns='img_id',

array([5, 1, 1, 1, 1, 1, 0, 1, 5, 1, 4, 1, 0, 1, 0, 0, 5, 1, 2, 1, 0, 3,
       1, 0, 3, 1, 0, 1, 1, 1, 1, 1, 1, 4, 0, 1, 0, 0, 4, 3, 1, 3, 1, 1,
       3, 1, 1, 1, 1, 0, 1, 1, 1, 0, 4, 1, 0, 1, 4, 1, 3, 0, 1, 0, 5, 0,
       0, 1, 3, 3, 3, 5, 3, 3, 1, 1, 1, 1, 1, 3, 4, 5, 5, 0, 0, 0, 1, 1,
       4, 0, 1, 1, 0, 0, 5, 0, 3, 0, 3, 1, 1, 3, 4, 1, 0, 0, 4, 1, 4, 3,
       4, 0, 0, 3, 1, 1, 1, 5, 1, 0, 1, 4, 5, 0, 1, 0, 4, 0, 0, 0, 0, 1,
       3, 5, 1, 3, 0, 1, 0, 1, 1, 1, 0, 0, 1, 3, 0, 0, 0, 5, 0, 0, 1, 0,
       1, 1, 1, 0, 3, 0, 1, 1, 5, 0, 0, 5, 5, 5, 4, 1, 0, 1, 3, 0, 1, 4,
       3, 5, 0, 1, 0, 4, 1, 5, 0, 1, 0, 1, 4, 0, 5, 0, 4, 3, 3, 3, 1, 1,
       1, 0, 0, 0, 3, 3, 0, 1, 1, 1, 2, 5, 5, 1, 0, 5, 3, 3, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 3], dtype=int64)

In [20]:
def textModel(x_train, y_train, x_test, y_test, class_weights=None):
    svm = SVC(kernel='linear', random_state=42)
    svm.fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    print(f'Model 1: {classification_report(y_test, y_pred)}')
    
    rf = RandomForestClassifier(n_estimators=100, class_weight=class_weights, random_state=42)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    print(f'Model 2: {classification_report(y_test, y_pred)}')

    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    print(f'Model 3: {classification_report(y_test, y_pred)}')

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    print(f'Model 4: {classification_report(y_test, y_pred)}')

    num_class = len(np.unique(y_train))
    if num_class == 2:
        num_class = 1

    lgbm = LGBMClassifier(num_class=num_class, class_weight=class_weights, random_state=42, verbose=-1)
    lgbm.fit(x_train, y_train)
    y_pred = lgbm.predict(x_test)
    print(f'Model 5: {classification_report(y_test, y_pred)}')

    return svm, rf, xgb, knn, lgbm

In [21]:
svm1, rf1, xgb1, knn1, lgbm1 = textModel(df_train_2,y_train_2,df_test_2,y_test_2)

Model 1:               precision    recall  f1-score   support

           0       0.55      0.85      0.67       122
           1       0.56      0.21      0.31       108

    accuracy                           0.55       230
   macro avg       0.56      0.53      0.49       230
weighted avg       0.56      0.55      0.50       230



  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


Model 2:               precision    recall  f1-score   support

           0       0.50      0.60      0.55       122
           1       0.42      0.33      0.37       108

    accuracy                           0.47       230
   macro avg       0.46      0.47      0.46       230
weighted avg       0.47      0.47      0.47       230

Model 3:               precision    recall  f1-score   support

           0       0.60      0.50      0.55       122
           1       0.53      0.63      0.57       108

    accuracy                           0.56       230
   macro avg       0.57      0.56      0.56       230
weighted avg       0.57      0.56      0.56       230



  return self._fit(X, y)


Model 4:               precision    recall  f1-score   support

           0       0.55      0.47      0.51       122
           1       0.49      0.57      0.53       108

    accuracy                           0.52       230
   macro avg       0.52      0.52      0.52       230
weighted avg       0.52      0.52      0.52       230

Model 5:               precision    recall  f1-score   support

           0       0.54      0.40      0.46       122
           1       0.47      0.61      0.53       108

    accuracy                           0.50       230
   macro avg       0.51      0.51      0.50       230
weighted avg       0.51      0.50      0.49       230



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [22]:
svm2, rf2, xgb2, knn2, lgbm2 = textModel(df_train_6,y_train_6,df_test_6,y_test_6)

Model 1:               precision    recall  f1-score   support

           0       0.29      0.51      0.37        69
           1       0.31      0.23      0.26        88
           2       0.00      0.00      0.00         2
           3       0.10      0.10      0.10        31
           4       0.00      0.00      0.00        18
           5       0.00      0.00      0.00        22

    accuracy                           0.25       230
   macro avg       0.12      0.14      0.12       230
weighted avg       0.22      0.25      0.23       230



  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


Model 2:               precision    recall  f1-score   support

           0       0.29      0.42      0.34        69
           1       0.35      0.26      0.30        88
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00        31
           4       0.00      0.00      0.00        18
           5       0.14      0.14      0.14        22

    accuracy                           0.24       230
   macro avg       0.13      0.14      0.13       230
weighted avg       0.24      0.24      0.23       230

Model 3:               precision    recall  f1-score   support

           0       0.31      0.32      0.32        69
           1       0.38      0.19      0.26        88
           2       0.00      0.00      0.00         2
           3       0.12      0.16      0.14        31
           4       0.19      0.28      0.22        18
           5       0.13      0.18      0.15        22

    accuracy                           0.23       230
   ma

  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Model 5:               precision    recall  f1-score   support

           0       0.27      0.29      0.28        69
           1       0.41      0.27      0.33        88
           2       0.00      0.00      0.00         2
           3       0.08      0.06      0.07        31
           4       0.14      0.22      0.17        18
           5       0.17      0.18      0.17        22

    accuracy                           0.23       230
   macro avg       0.18      0.17      0.17       230
weighted avg       0.27      0.23      0.25       230



In [23]:
estimators1 = [('svc', svm1), ('rf', rf1), ('xgb', xgb1), ('knn', knn1), ('lgbm', lgbm1)]
estimators2 = [('svc', svm2), ('rf', rf2), ('xgb', xgb2), ('knn', knn2), ('lgbm', lgbm2)]
stack_model1 = StackingClassifier(estimators=estimators1, final_estimator=XGBClassifier())
stack_model2 = StackingClassifier(estimators=estimators2, final_estimator=XGBClassifier())

In [24]:
# fit the model on the training data
stack_model1.fit(df_train_2, y_train_2)
# make predictions
y_test_2_pred = stack_model1.predict(df_test_2)
# calculate the classification report
stack_model1_result = classification_report(y_test_2, y_test_2_pred)
print(f'Stacking Method for 2 Class Classification Result:\n{stack_model1_result}')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Stacking Method for 2 Class Classification Result:
              precision    recall  f1-score   support

           0       0.57      0.61      0.59       122
           1       0.52      0.48      0.50       108

    accuracy                           0.55       230
   macro avg       0.54      0.54      0.54       230
weighted avg       0.55      0.55      0.55       230



In [25]:
stack_model2.fit(df_train_6, y_train_6)
y_test_6_pred = stack_model2.predict(df_test_6)
stack_model2_result = classification_report(y_test_6, y_test_6_pred)
print(f'Stacking Method for 6 Class Classification Result:\n{stack_model2_result}')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Stacking Method for 6 Class Classification Result:
              precision    recall  f1-score   support

           0       0.28      0.32      0.30        69
           1       0.33      0.25      0.29        88
           2       0.00      0.00      0.00         2
           3       0.10      0.10      0.10        31
           4       0.00      0.00      0.00        18
           5       0.09      0.09      0.09        22

    accuracy                           0.21       230
   macro avg       0.13      0.13      0.13       230
weighted avg       0.23      0.21      0.22       230



In [26]:
joblib.dump(stack_model1_result, 'G:\\HSUHK\\COM6003\\project\\archive\\model_resultfinal_stack_2.joblib')
joblib.dump(stack_model2_result, 'G:\\HSUHK\\COM6003\\project\\archive\\model_resultfinal_stack_6.joblib')

['G:\\HSUHK\\COM6003\\project\\archive\\model_resultfinal_stack_6.joblib']