## Import dependencies

In [1]:
import pandas as pd
import numpy as np
import sqlite3
from PIL import Image
from io import BytesIO
from sklearn.preprocessing import LabelEncoder

## Load data from DB

In [2]:
# Location of the SQLite database file (relative path, resolved against the
# working directory of the kernel)
db_path = "../db_images.sqlite3"
# Table that load_data() selects from
table_name = "images"
# Maximum number of rows to load; None (or 0) means no row cap
row_limit = None  # Replace None with a specific value if needed
# Number of leading rows to skip; 0 means start at the first row
start_row = 0  # default is 0


def load_data(path=None, table=None, limit=None, offset=None):
    """Load rows of an SQLite table into a pandas DataFrame.

    Every argument is optional. An argument left as ``None`` falls back to
    the module-level setting with the same role (``db_path``, ``table_name``,
    ``row_limit``, ``start_row``), so a bare ``load_data()`` keeps working.

    Args:
        path: Path of the SQLite database file.
        table: Name of the table to read. It is interpolated into the SQL
            text (identifiers cannot be bound as parameters), so it must come
            from trusted configuration, never from user input.
        limit: Maximum number of rows to return; a falsy value (``0``) means
            "no row cap".
        offset: Number of leading rows to skip; a falsy value means "start at
            the first row".

    Returns:
        pandas.DataFrame holding every column of the selected rows.
    """
    path = db_path if path is None else path
    table = table_name if table is None else table
    limit = row_limit if limit is None else limit
    offset = start_row if offset is None else offset

    query = f"SELECT * FROM {table}"
    params = None
    if limit or offset:
        # SQLite only accepts OFFSET as part of a LIMIT clause; a negative
        # LIMIT means "no upper bound", which covers offset-without-limit.
        query += " LIMIT ? OFFSET ?"
        params = [int(limit) if limit else -1, int(offset) if offset else 0]
    query += ";"

    connection = sqlite3.connect(path)
    try:
        # Load Data into Pandas DataFrame
        return pd.read_sql_query(query, connection, params=params)
    finally:
        # Close the connection even when the query fails
        connection.close()
    
# Read the configured table into memory as the raw DataFrame
data = load_data()
# Printing a long frame shows an abbreviated view (first and last rows only)
print(data)

           image_id                                              image   age  \
0      ISIC_0027419  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  80.0   
1      ISIC_0025030  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  80.0   
2      ISIC_0026769  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  80.0   
3      ISIC_0025661  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  80.0   
4      ISIC_0031633  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  75.0   
...             ...                                                ...   ...   
10010  ISIC_0033084  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  40.0   
10011  ISIC_0033550  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  40.0   
10012  ISIC_0033536  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  40.0   
10013  ISIC_0032854  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  80.0   
10014  ISIC_0032258  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  70.0   

          sex localization lesion_type 

### cleaning data
The `clean_data()` function cleans the dataset by dropping rows with missing values, normalizing and validating the `sex`, `localization`, and `age` columns (rows with invalid values are removed), and eliminating duplicated images.

In [3]:
def clean_data(df):
    """Return a cleaned copy of the image metadata frame.

    The input frame is never modified. A row is removed when it

    * has a missing value in any column,
    * has a ``sex`` other than "female"/"male" (compared case-insensitively),
    * has a ``localization`` outside the accepted body sites (compared
      case-insensitively),
    * repeats the ``image`` + ``image_id`` pair of a later row (the last
      occurrence is kept), or
    * has an ``age`` that is not numeric or lies outside ``(0, 120]``.

    Args:
        df: DataFrame containing at least the columns ``image_id``,
            ``image``, ``age``, ``sex`` and ``localization``.

    Returns:
        A new DataFrame with the surviving rows; ``sex`` and ``localization``
        are lower-cased, ``age`` is numeric, and the original index labels
        are retained.
    """
    valid_sex = ["female", "male"]
    valid_localization = [
        "scalp",
        "ear",
        "face",
        "back",
        "trunk",
        "chest",
        "upper extremity",
        "abdomen",
        "lower extremity",
        "genital",
        "neck",
        "hand",
        "foot",
        "acral",
    ]

    # dropna() and assign() both return new frames, so nothing below can
    # leak back into the caller's DataFrame.
    cleaned = df.dropna().assign(
        sex=lambda frame: frame["sex"].str.lower(),
        localization=lambda frame: frame["localization"].str.lower(),
    )
    cleaned = cleaned[
        cleaned["sex"].isin(valid_sex)
        & cleaned["localization"].isin(valid_localization)
    ]

    # Drop duplicated images, keeping the last occurrence
    cleaned = cleaned.drop_duplicates(subset=["image", "image_id"], keep="last")

    # Non-numeric ages are coerced to NaN; NaN fails both comparisons below
    # and is therefore removed together with out-of-range values.
    cleaned = cleaned.assign(age=pd.to_numeric(cleaned["age"], errors="coerce"))
    age_in_range = (cleaned["age"] > 0) & (cleaned["age"] <= 120)

    # Return an independent copy so later column assignments on the result
    # do not raise chained-assignment warnings.
    return cleaned[age_in_range].copy()

# Apply the cleaning rules to the frame loaded from the database
cleaned_data = clean_data(data)
# Printed in abbreviated form, like the raw frame
print(cleaned_data)

           image_id                                              image   age  \
0      ISIC_0027419  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  80.0   
1      ISIC_0025030  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  80.0   
2      ISIC_0026769  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  80.0   
3      ISIC_0025661  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  80.0   
4      ISIC_0031633  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  75.0   
...             ...                                                ...   ...   
10010  ISIC_0033084  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  40.0   
10011  ISIC_0033550  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  40.0   
10012  ISIC_0033536  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  40.0   
10013  ISIC_0032854  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  80.0   
10014  ISIC_0032258  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  70.0   

          sex localization lesion_type 

## Feature extraction and preprocessing function
 * The `preprocess_images()` function converts a binary image into a modifiable format (an RGB array), resizes it to 224×224 for compatibility with CNNs, and normalizes the pixel values to the range [0, 1]. Random brightness adjustments and mirroring, intended to replicate the various angles and lighting conditions under which users might upload images, are not applied by the current implementation.

 * The `feature_preprocessing()` function removes the features that are not useful from the dataset, executes the `preprocess_images()` function, and applies encoding to the features with string values, making them ready to be used by the CNN model.



In [4]:
# function for preprocessing images
def preprocess_images(binary_data, target_size=(224, 224)):
    """Decode an encoded image and return it as a normalized RGB array.

    Args:
        binary_data: Raw bytes of an encoded image in a format Pillow can
            open.
        target_size: ``(width, height)`` in pixels to resize the image to.
            Defaults to ``(224, 224)``.

    Returns:
        numpy.ndarray with three colour channels in the last axis and values
        scaled to ``[0, 1]``.
    """
    # Image.open() is lazy; convert() forces the decode, and the context
    # manager releases the source image as soon as the RGB copy exists.
    with Image.open(BytesIO(binary_data)) as source:
        image = source.convert("RGB")

    # Resize with LANCZOS resampling
    image = image.resize(target_size, Image.Resampling.LANCZOS)

    # Normalize to [0, 1]
    return np.array(image) / 255.0


def feature_preprocessing(df):
    """Build the model-ready feature frame from the cleaned dataset.

    The input frame is left untouched, so the function can be called
    repeatedly on the same data and other cells can keep using it.

    Steps:
        1. Drop the ``lesion_type`` and ``image_id`` columns.
        2. Replace each entry of ``image`` with the array returned by
           ``preprocess_images``.
        3. Label-encode ``localization`` as integers.
        4. One-hot encode ``sex`` (first level dropped).

    Args:
        df: DataFrame with the columns ``image_id``, ``image``, ``sex``,
            ``localization`` and ``lesion_type`` (other columns pass through).

    Returns:
        A new DataFrame with the encoded features.
    """
    # drop() without inplace returns a new frame; all later assignments go
    # to that new frame instead of the caller's DataFrame.
    features = df.drop(columns=["lesion_type", "image_id"])

    # Image preprocessing
    features["image"] = features["image"].apply(preprocess_images)

    # Label encoding the localization feature.
    # NOTE: the fitted encoder is local to this call, so the integer-to-label
    # mapping is not available to the caller afterwards.
    encoder = LabelEncoder()
    features["localization"] = encoder.fit_transform(features["localization"])

    # One-hot encoding sex feature
    return pd.get_dummies(features, columns=["sex"], dtype=int, drop_first=True)

## Image preprocessing visualization example
Here you can see what the `preprocess_images()` function does to the images. This cell exists for trying out different preprocessing methods and checking their output.

In [5]:
def show_image_with_pillow(image_array):
    """Open a normalized image array in Pillow's default image viewer.

    Args:
        image_array: Array of floats scaled to ``[0, 1]``, such as the output
            of ``preprocess_images``.
    """
    # Scale back to [0, 255]. Round before casting, because a bare uint8 cast
    # truncates (254.99999 would become 254), and clip so that values
    # marginally outside [0, 1] cannot wrap around.
    scaled = np.rint(np.asarray(image_array) * 255)
    pixels = np.clip(scaled, 0, 255).astype("uint8")
    Image.fromarray(pixels).show()

# Draw a reproducible random sample of 100 rows and attach the preprocessed
# image arrays as an additional column of the sampled frame
subset_df = cleaned_data.sample(n=100, random_state=42).assign(
    processed_images=lambda sampled: sampled["image"].apply(preprocess_images)
)
print(subset_df.head())
# Example: open the first processed image of the sample for a visual check
show_image_with_pillow(subset_df["processed_images"].iat[0])


          image_id                                              image   age  \
1170  ISIC_0033891  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  35.0   
8634  ISIC_0026099  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  65.0   
3950  ISIC_0032056  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  65.0   
1643  ISIC_0032638  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  35.0   
9281  ISIC_0026657  b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x0...  60.0   

         sex     localization lesion_type  \
1170  female  lower extremity          df   
8634  female             back          nv   
3950  female  lower extremity          nv   
1643    male             back         mel   
9281  female          abdomen          nv   

                                       processed_images  
1170  [[[0.10196078431372549, 0.047058823529411764, ...  
8634  [[[0.6078431372549019, 0.5254901960784314, 0.4...  
3950  [[[0.9215686274509803, 0.6313725490196078, 0.6...  
1643  [[[0.07450980392

## Preprocess data 
Here you can see the output of the final step, the `feature_preprocessing()` function (**Note**: this function is memory-intensive, so it is recommended to run it on a small random subset of the data).

In [6]:
# Run the full feature pipeline on the cleaned dataset; every image is decoded
# and resized here, which is what makes this step memory-intensive
preprocessed_data = feature_preprocessing(cleaned_data)
print(preprocessed_data )

                                                   image   age  localization  \
0      [[[0.7333333333333333, 0.5882352941176471, 0.7...  80.0            11   
1      [[[0.09411764705882353, 0.050980392156862744, ...  80.0            11   
2      [[[0.7294117647058823, 0.4980392156862745, 0.5...  80.0            11   
3      [[[0.09019607843137255, 0.043137254901960784, ...  80.0            11   
4      [[[0.48627450980392156, 0.3254901960784314, 0....  75.0             4   
...                                                  ...   ...           ...   
10010  [[[0.44313725490196076, 0.3568627450980392, 0....  40.0             0   
10011  [[[0.01568627450980392, 0.023529411764705882, ...  40.0             0   
10012  [[[0.4117647058823529, 0.3568627450980392, 0.3...  40.0             0   
10013  [[[0.6078431372549019, 0.45098039215686275, 0....  80.0             5   
10014  [[[0.6941176470588235, 0.5647058823529412, 0.4...  70.0             2   

       sex_male  
0             1  
1  