In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
from skimage.feature import greycomatrix, greycoprops, local_binary_pattern
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Image folders of the HAM10000 dataset, located one level above the notebook.
# Built with os.path.join so the separator matches the host OS; the previous
# literals hard-coded a backslash (and relied on "\H" not being an escape).
PART1 = os.path.join(os.pardir, "HAM10000_images_part_1")
PART2 = os.path.join(os.pardir, "HAM10000_images_part_2")

In [3]:
# Collect the path of every .jpg/.png file found directly inside PART1 and PART2.
# os.listdir returns entries in arbitrary order, so each listing is sorted to
# keep the row order of the extracted feature tables reproducible across runs.
file_paths = []
for folder in [PART1, PART2]:
    file_paths += [
        os.path.join(folder, f)
        for f in sorted(os.listdir(folder))
        if f.endswith(('.jpg', '.png'))
    ]

In [9]:
def extract_glcm_features(image, distances=[1], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4]):
    """Summarise an image's texture with five grey-level co-occurrence statistics.

    Parameters
    ----------
    image : numpy array; a 3-dimensional input is converted to grayscale with
        cv2.COLOR_BGR2GRAY first.
    distances, angles : pixel-pair offsets and directions forwarded to
        greycomatrix.

    Returns
    -------
    list
        [contrast, dissimilarity, homogeneity, energy, correlation], each being
        the mean of the greycoprops result for that property.
    """
    # NOTE(review): newer scikit-image releases spell these functions
    # graycomatrix / graycoprops -- confirm against the installed version.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if image.ndim == 3 else image

    # Min-max stretch to the 0..255 range and store as 8-bit values, matching
    # the 256 grey levels requested from the co-occurrence matrix below.
    gray = cv2.normalize(gray, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    cooccurrence = greycomatrix(
        gray,
        distances=distances,
        angles=angles,
        levels=256,
        symmetric=True,
        normed=True,
    )

    # One scalar per property, in the order the feature columns expect.
    property_names = ('contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation')
    return [greycoprops(cooccurrence, name).mean() for name in property_names]

In [10]:
# Build one row per image: [file path, parent folder name, *GLCM features].
glcm = []

for file in tqdm(file_paths, desc="Extracting GLCM features"):
    try:
        img = cv2.imread(file)
        if img is None:
            # cv2.imread returns None instead of raising when a file cannot be
            # decoded; report it so dropped rows are visible in the output.
            print(f"Skipping unreadable image: {file}")
            continue
        features = extract_glcm_features(img)
        # Include folder name to track the source
        folder = os.path.basename(os.path.dirname(file))
        glcm.append([file, folder] + features)
    except Exception as e:
        # Best effort: a failure on one image must not abort the whole run.
        print(f"Error processing {file}: {e}")

Extracting GLCM features: 100%|██████████| 10015/10015 [14:28<00:00, 11.54it/s]


In [11]:
# Column layout: two provenance columns followed by the five GLCM statistics,
# in the same order as the rows appended to `glcm`.
columns = ['file_name', 'folder'] + [
    'contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation',
]
df = pd.DataFrame(data=glcm, columns=columns)

# Persist the table twice: CSV for portability, pickle for faster reloading.
df.to_csv('glcm_features.csv', index=False)
df.to_pickle('glcm_features.pkl')

In [5]:
def extract_lbp_features(image, P=8, R=1, method='uniform'):
    """Return the normalised histogram of local-binary-pattern codes of an image.

    Parameters
    ----------
    image : numpy array; a 3-dimensional input is converted to grayscale with
        cv2.COLOR_BGR2GRAY first.
    P : number of circularly symmetric neighbour points (passed to
        local_binary_pattern).
    R : radius of the neighbourhood circle (passed to local_binary_pattern).
    method : LBP variant passed to local_binary_pattern.

    Returns
    -------
    list of float
        Histogram with one unit-width bin per LBP code, scaled to sum to 1.
        The number of bins follows the code range of ``method``:
        2**P for 'default'/'ror', P*(P-1)+3 for 'nri_uniform', and P+2
        otherwise (this is the 'uniform' layout; it is also kept as the
        fallback for any other method, e.g. 'var', whose output is not a
        small set of integer codes -- NOTE(review): a histogram of 'var'
        output is probably not meaningful, confirm before relying on it).
        If no value falls inside the bins the all-zero histogram is returned
        as is instead of dividing by zero.
    """
    # Convert to grayscale if necessary
    if len(image.shape) == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Normalize image to 8-bit (if not already)
    image = cv2.normalize(image, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    # Compute LBP
    lbp = local_binary_pattern(image, P=P, R=R, method=method)

    # Pick the number of distinct codes the chosen method can emit, so codes
    # above P + 1 are no longer silently dropped for non-'uniform' methods.
    if method in ('default', 'ror'):
        n_bins = 2 ** P
    elif method == 'nri_uniform':
        n_bins = P * (P - 1) + 3
    else:
        n_bins = P + 2

    # Explicit integer edges 0, 1, ..., n_bins give one bin per code. (A
    # `range=` argument is ignored by np.histogram when edges are supplied.)
    hist, _ = np.histogram(lbp.ravel(), bins=np.arange(n_bins + 1))

    # Normalize the histogram, guarding against an empty count.
    hist = hist.astype("float")
    total = hist.sum()
    if total > 0:
        hist /= total

    return hist.tolist()

In [6]:
# Build one row per image: [file path, parent folder name, *LBP histogram bins].
lbp = []

for file in tqdm(file_paths, desc="Extracting LBP features"):
    try:
        img = cv2.imread(file)
        if img is None:
            # cv2.imread returns None instead of raising when a file cannot be
            # decoded; report it so dropped rows are visible in the output.
            print(f"Skipping unreadable image: {file}")
            continue
        features = extract_lbp_features(img)
        # Include folder name to track the source
        folder = os.path.basename(os.path.dirname(file))
        lbp.append([file, folder] + features)
    except Exception as e:
        # Best effort: a failure on one image must not abort the whole run.
        print(f"Error processing {file}: {e}")

Extracting LBP features: 100%|██████████| 10015/10015 [26:53<00:00,  6.21it/s] 


In [7]:
# Every row is [file path, folder, bin_0, bin_1, ...]; name one column per
# histogram bin by looking at the values that follow the two leading fields
# of the first row.
columns = ['file_name', 'folder'] + [f'lbp_{i}' for i, _ in enumerate(lbp[0][2:])]
df = pd.DataFrame(data=lbp, columns=columns)

# Write the LBP feature table to disk as CSV (without the index column).
df.to_csv('combined_lbp_features.csv', index=False)

In [8]:
def extract_color_variance(image):
    """Compute per-channel mean and variance plus the overall pixel variance.

    Parameters
    ----------
    image : colour image array in OpenCV's BGR channel order.

    Returns
    -------
    list
        [mean_r, mean_g, mean_b, var_r, var_g, var_b, overall_var], where
        overall_var is the variance over all values of the RGB image.
    """
    # Reorder channels BGR -> RGB so the split below yields (R, G, B).
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    red, green, blue = cv2.split(rgb)
    planes = (red, green, blue)

    channel_means = [np.mean(plane) for plane in planes]
    channel_vars = [np.var(plane) for plane in planes]

    # Variance across every value of the image, all channels pooled together.
    pooled_var = np.var(rgb)

    return channel_means + channel_vars + [pooled_var]

In [None]:
# Build one row per image: [file path, parent folder name, *colour statistics].
variance = []

for file in tqdm(file_paths, desc="Extracting color variance features"):
    try:
        img = cv2.imread(file)
        if img is None:
            # cv2.imread returns None instead of raising when a file cannot be
            # decoded; report it so dropped rows are visible in the output.
            print(f"Skipping unreadable image: {file}")
            continue
        features = extract_color_variance(img)
        # Include folder name to track the source
        folder = os.path.basename(os.path.dirname(file))
        variance.append([file, folder] + features)
    except Exception as e:
        # Best effort: a failure on one image must not abort the whole run.
        print(f"Error processing {file}: {e}")