# Feature Extraction Pipeline

## Imports and function definitions

In [None]:
# Install gdown if it is not already available; the download cells in this
# notebook use it to fetch the model and dataset archives from Google Drive.
pip install gdown

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import gdown
import zipfile
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
def dataset_generator(data_dir, size, batch_size=1):
    """Load every image under ``data_dir`` into in-memory NumPy arrays.

    Images are read through a Keras ``ImageDataGenerator`` with no
    augmentation or rescaling configured, converted to grayscale, resized to
    ``size`` x ``size`` and returned in the generator's unshuffled order.

    Args:
        data_dir: Directory handed to ``flow_from_directory``.
        size: Height and width, in pixels, every image is resized to.
        batch_size: Number of images read per generator step. It only
            affects how the data is loaded, not the returned arrays.

    Returns:
        A tuple ``(X, y)``. ``X`` has shape ``(n_images, size, size, 1)`` and
        ``y`` holds one categorical label row per image, in the same order.

    Raises:
        ValueError: If no images are found under ``data_dir``.
    """
    image_generator = tf.keras.preprocessing.image.ImageDataGenerator()

    datagen = image_generator.flow_from_directory(
            data_dir,
            target_size=(size, size),
            batch_size=batch_size,
            color_mode='grayscale',
            class_mode='categorical',
            shuffle=False)

    n_samples = datagen.samples
    if n_samples == 0:
        raise ValueError(f"No images found under {data_dir!r}")

    X = None
    y = None
    start = 0
    # Index the batches explicitly: iterating the Keras generator never stops
    # on its own, while len(datagen) is the number of batches in one pass.
    for batch_index in tqdm(range(len(datagen))):
        image_batch, label_batch = datagen[batch_index]

        if X is None:
            # Allocate the outputs once, using the first batch for the
            # per-sample shape and dtype, so the images are not held twice
            # (list of arrays plus the final stacked copy).
            X = np.empty((n_samples,) + image_batch.shape[1:],
                         dtype=image_batch.dtype)
            y = np.empty((n_samples,) + label_batch.shape[1:],
                         dtype=label_batch.dtype)

        stop = start + len(image_batch)
        X[start:stop] = image_batch
        y[start:stop] = label_batch
        start = stop

    return X, y

## Loading the feature extractor model

In [None]:
# Google Drive link to the saved feature-extractor model
url_model = "https://drive.google.com/uc?id=1yd3HDakuh_ckFGmzUa_SFAmkKgsa0gkl"

# Local path the downloaded model file is written to
output_model = '/feature_extractor.h5'

# Download the model file, showing progress
gdown.download(url=url_model, output=output_model, quiet=False)

In [None]:
# create a function to load the model
def load_model(model_path='/feature_extractor.h5'):
    """Load a saved Keras model from disk.

    Args:
        model_path: Path of the saved model file. Defaults to the location
            the feature extractor is downloaded to in this notebook.

    Returns:
        The model object returned by ``tf.keras.models.load_model``.
    """
    return tf.keras.models.load_model(model_path)


# load the model inside the first GPU's device scope
with tf.device('/device:GPU:0'):
    model = load_model()

## Creation of datasets

### Base dataset

In [None]:
# Google Drive link to the base dataset archive
url_cohort = "https://drive.google.com/uc?id=1u1bYdXnffUfAM4UeAJIRoH_6Q7C6n0xQ"

# Local path the archive is written to
output_cohort = '/content/cohort.zip'

# Download the archive, showing progress
gdown.download(url=url_cohort, output=output_cohort, quiet=False)

In [None]:
# Unpack the downloaded base dataset archive into /content/data
with zipfile.ZipFile('/content/cohort.zip', mode='r') as archive:
    archive.extractall(path='/content/data')

  2%|▏         | 185M/8.79G [08:23<6:31:18, 366kB/s]


In [None]:
# Load the base images as 224x224 arrays, with their labels, in the form
# the feature extractor reads
X_core, y_core = dataset_generator(
    data_dir='/content/data/content/data/aug/',
    size=224,
)

Found 21400 images belonging to 30 classes.


100%|█████████▉| 21399/21400 [05:02<00:00, 70.71it/s]


#### PCA feature extraction

In [None]:
# Split the image array's shape into image count and spatial dimensions;
# the trailing channel axis is not needed
num_images, image_height, image_width, _ = X_core.shape

# Collapse each image into a single row of pixels
X_core_flat = np.reshape(X_core, (num_images, image_height * image_width))

In [None]:
# Standardize each pixel column: fit the scaler on the flattened images,
# then transform those same images with the fitted statistics
scaler = StandardScaler()
X_core_standardized = scaler.fit(X_core_flat).transform(X_core_flat)

In [None]:
# Project the standardized pixels onto 200 principal components.
# A fixed random_state keeps the saved features reproducible between runs
# whenever PCA selects its randomized SVD solver for this input.
pca = PCA(n_components=200, random_state=0)
X_pca_core = pca.fit_transform(X_core_standardized)

#### CNN feature extraction

In [None]:
# Run the loaded model over the base images; its outputs are kept as the
# CNN features
X_CNN_features_core = model.predict(x=X_core)



#### Save all extracted data

In [None]:
# Persist the CNN features, PCA features and labels of the base dataset
np.save(file='/X_CNN_features_core.npy', arr=X_CNN_features_core)
np.save(file='/X_pca_core.npy', arr=X_pca_core)
np.save(file='/y_core.npy', arr=y_core)

### Cross-validation dataset

In [None]:
# Google Drive link to the cross-validation dataset archive
url_cohort = "https://drive.google.com/uc?id=1FL6wP5e-BP9MrqLVCT680T9BW1sk3sZ9"

# Local path the archive is written to
output_cohort = '/content/crossval.zip'

# Download the archive, showing progress
gdown.download(url=url_cohort, output=output_cohort, quiet=False)

In [None]:
# Unpack the downloaded cross-validation archive into /content/data
with zipfile.ZipFile('/content/crossval.zip', mode='r') as archive:
    archive.extractall(path='/content/data')

In [None]:
# Load the cross-validation images as 224x224 arrays, with their labels,
# in the form the feature extractor reads
X_cross, y_cross = dataset_generator(
    data_dir='/content/data/cross_val_content/cross_val_data/aug/',
    size=224,
)

#### CNN feature extraction

In [None]:
# Run the loaded model over the cross-validation images; its outputs are
# kept as the CNN features
X_CNN_features_cross = model.predict(x=X_cross)

#### Save all extracted data

In [None]:
# Persist the CNN features and labels of the cross-validation dataset
np.save(file='/X_CNN_features_cross.npy', arr=X_CNN_features_cross)
np.save(file='/y_cross.npy', arr=y_cross)