In [None]:
!pip install pycuda



In [None]:
# Mount Google Drive into the Colab filesystem; the dataset CSV and image
# folders read later in this notebook live underneath this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# CUDA C source for the `normalize_image` kernel, kept as a Python string so
# it can be compiled at runtime with pycuda's SourceModule.
#
# Each thread derives an (x, y) coordinate from its block/thread indices and,
# when x < width and y < height, rewrites img[y * width + x] in place as
# (value - min_val) / (max_val - min_val).
#
# Things to keep in mind when launching it:
#   * It addresses exactly width * height floats (one per (x, y) pair).
#     Elements stored beyond index width * height - 1 are never touched, so
#     the launch parameters must describe the whole buffer.
#   * There is no guard for max_val == min_val; that case divides by zero.
cuda_code = """
__global__ void normalize_image(float *img, int width, int height, float min_val, float max_val) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        int idx = y * width + x;
        img[idx] = (img[idx] - min_val) / (max_val - min_val);
    }
}
"""

In [None]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

# Compile whatever kernel source is currently bound to `cuda_code`.
# `cuda_code` is assigned in more than one cell of this notebook, so this cell
# must run right after the cell that defines the `normalize_image` source.
mod = SourceModule(cuda_code)

# Look up the compiled kernel by its CUDA function name; the resulting handle
# is what normalize_image_gpu() launches.
normalize_image = mod.get_function("normalize_image")
def normalize_image_gpu(images, min_val, max_val, batch_size=8):
    """Min-max normalize a stack of images on the GPU.

    Every element ``v`` of ``images`` is mapped to
    ``(v - min_val) / (max_val - min_val)`` by the ``normalize_image`` CUDA
    kernel. The stack is processed ``batch_size`` images at a time so that
    only one batch has to be resident on the device.

    Args:
        images: Array-like of shape ``(num_images, height, width, channels)``.
            Any numeric dtype is accepted; values are converted to float32
            before being copied to the device. The input is not modified.
        min_val: Value that should map to 0.0.
        max_val: Value that should map to 1.0. Must differ from ``min_val``.
        batch_size: Number of images transferred and normalized per launch.
            Must be at least 1.

    Returns:
        A new float32 ``numpy.ndarray`` with the same shape as ``images``.

    Raises:
        ValueError: If ``max_val`` equals ``min_val`` (in float32), if
            ``batch_size`` is smaller than 1, or if ``images`` is not 4-D.
    """
    lo = np.float32(min_val)
    hi = np.float32(max_val)
    if hi == lo:
        raise ValueError("max_val must differ from min_val")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    images = np.asarray(images)
    num_images, height, width, channels = images.shape
    normalized_images = np.empty(images.shape, dtype=np.float32)
    if images.size == 0:
        # Nothing to normalize; also avoids a zero-byte device allocation.
        return normalized_images

    threads_per_block = 256
    block_size = (threads_per_block, 1, 1)

    for i in range(0, num_images, batch_size):
        # astype() always copies, so the caller's array is never overwritten
        # by the device-to-host copy below.
        batch_images = images[i:i + batch_size].astype(np.float32).reshape(-1)
        n_elements = batch_images.size
        grid_size = ((n_elements + threads_per_block - 1) // threads_per_block, 1, 1)

        d_img = cuda.mem_alloc(batch_images.nbytes)
        try:
            cuda.memcpy_htod(d_img, batch_images)

            # The kernel only touches indices y * width + x with x < width and
            # y < height. Describing the flattened batch as a single row
            # (width = n_elements, height = 1) makes the launch cover every
            # pixel of every channel of every image in the batch, instead of
            # just the first height * width floats.
            normalize_image(
                d_img,
                np.int32(n_elements),
                np.int32(1),
                lo,
                hi,
                block=block_size,
                grid=grid_size,
            )

            cuda.memcpy_dtoh(batch_images, d_img)
        finally:
            # Release device memory even if the launch or a copy failed.
            d_img.free()

        normalized_images[i:i + batch_size] = batch_images.reshape(-1, height, width, channels)

    return normalized_images

# Demo: push a synthetic batch of ten 128x128 three-channel images (uniform
# values in [0, 1)) through the GPU normalizer. With the range [0.0, 1.0] the
# mapping (v - 0) / (1 - 0) leaves values unchanged, so this mostly exercises
# the host <-> device round trip.
demo_shape = (10, 128, 128, 3)
images = np.random.rand(*demo_shape).astype(np.float32)
min_val = 0.0
max_val = 1.0
normalized_images = normalize_image_gpu(images, min_val=min_val, max_val=max_val)


In [None]:
# CUDA C source for the `relu_forward` kernel, kept as a Python string so it
# can be compiled at runtime with pycuda's SourceModule (nothing is written to
# a file here).
#
# One thread handles one element: for idx < size it stores
# max(0, input[idx]) into output[idx]. Threads with idx >= size do nothing.
#
# NOTE: this rebinds `cuda_code`, the same name that holds the
# `normalize_image` kernel source earlier in the notebook. Re-running the
# earlier compile cell after this one would compile this source instead.
cuda_code = """
__global__ void relu_forward(float* input, float* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < size) {
        output[idx] = max(0.0f, input[idx]);
    }
}
"""

In [None]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

# Compile the kernel source currently bound to `cuda_code` (the ReLU source
# when the notebook is run top to bottom). This also rebinds `mod`, which an
# earlier cell used for the normalization module.
mod = SourceModule(cuda_code)

# Look up the compiled kernel by its CUDA function name; custom_relu()
# launches it through this handle.
relu_layer = mod.get_function("relu_forward")

def custom_relu(input_data):
    """Apply ReLU (``max(0, x)``) element-wise on the GPU.

    Args:
        input_data: Array-like of numbers with any shape. It is converted to a
            C-contiguous float32 array before being copied to the device,
            because the ``relu_forward`` kernel reads its input as packed
            32-bit floats. The argument itself is not modified.

    Returns:
        A new float32 ``numpy.ndarray`` with the same shape as the input,
        where every negative value has been replaced by 0.
    """
    # The kernel interprets the buffer as packed float32; handing it float64
    # or strided data would make it read garbage, so normalize the layout.
    host_input = np.ascontiguousarray(input_data, dtype=np.float32)
    output_data = np.empty(host_input.shape, dtype=np.float32)

    size = host_input.size
    if size == 0:
        # Avoid a zero-byte allocation and a zero-block launch.
        return output_data

    block_size = 256
    num_blocks = (size + block_size - 1) // block_size

    d_input = cuda.mem_alloc(host_input.nbytes)
    try:
        d_output = cuda.mem_alloc(output_data.nbytes)
        try:
            cuda.memcpy_htod(d_input, host_input)

            relu_layer(d_input, d_output, np.int32(size), block=(block_size, 1, 1), grid=(num_blocks, 1))

            cuda.memcpy_dtoh(output_data, d_output)
        finally:
            d_output.free()
    finally:
        # Both buffers are released even if the launch or a copy raised.
        d_input.free()

    return output_data

# Demo: run the GPU ReLU over 1000 uniform samples from [0, 1) and show the
# result. Note that every sample is non-negative, so ReLU returns the values
# unchanged here.
num_samples = 1000
input_data = np.random.rand(num_samples).astype(np.float32)
output_data = custom_relu(input_data)
print(output_data)


[1.18572665e-02 1.75320789e-01 9.96459007e-01 9.22944546e-01
 8.03771198e-01 2.19261020e-01 9.89260972e-01 1.42660052e-01
 2.81161755e-01 9.12258685e-01 2.87072301e-01 9.31347430e-01
 4.05042320e-01 5.56055188e-01 6.60269678e-01 1.99712068e-02
 2.97770590e-01 6.75668791e-02 9.53897178e-01 9.41719532e-01
 2.53090858e-01 7.06065893e-01 3.46261442e-01 8.85691404e-01
 1.58996522e-01 5.02579808e-01 3.56662780e-01 2.72213131e-01
 5.19051850e-02 9.43120599e-01 3.92505467e-01 3.59832048e-01
 1.38048932e-01 9.47691739e-01 6.01950526e-01 8.94782722e-01
 6.10540986e-01 2.04158902e-01 7.44522393e-01 6.21400416e-01
 8.58959854e-01 1.73515752e-01 1.82932645e-01 5.18565416e-01
 3.16558450e-01 3.60562384e-01 5.69569111e-01 9.97245871e-03
 7.32277870e-01 2.16897547e-01 2.72097737e-01 4.30449158e-01
 5.26340663e-01 6.83251739e-01 1.69200718e-01 4.61802155e-01
 2.83002645e-01 8.69428456e-01 3.99509119e-03 3.58003318e-01
 4.10294712e-01 5.46375692e-01 5.55820279e-02 7.59413019e-02
 2.83136815e-01 1.434875

In [None]:
import concurrent.futures
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input, Layer
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm  # to show progress bar

class CustomReLULayer(Layer):
    """Keras layer that applies ReLU through the PyCUDA ``custom_relu`` helper.

    The GPU helper works on NumPy arrays, so it cannot be called directly on
    the symbolic tensors Keras passes while a functional model is being built
    or while ``fit`` runs in graph mode (those have no ``.numpy()``). The call
    is therefore wrapped in ``tf.numpy_function``, which hands the helper a
    concrete array at execution time.

    ``tf.numpy_function`` is opaque to automatic differentiation, so the ReLU
    gradient (upstream gradient where the input is positive, zero elsewhere)
    is supplied explicitly via ``tf.custom_gradient``.
    """

    def __init__(self, **kwargs):
        # Accept the standard Layer keyword arguments (name, dtype, ...).
        super().__init__(**kwargs)

    @staticmethod
    def _relu_on_host(values):
        """Run ``custom_relu`` on a NumPy array with the PyCUDA context active.

        NOTE(review): tf.numpy_function may invoke this on a TensorFlow worker
        thread, while the context created by ``pycuda.autoinit`` is only
        current on the thread that imported it. Pushing it here (and popping
        afterwards) makes the PyCUDA calls valid on whichever thread runs us.
        """
        context = pycuda.autoinit.context
        context.push()
        try:
            return custom_relu(values)
        finally:
            context.pop()

    def call(self, inputs):
        """Return ``max(0, inputs)`` as a float32 tensor of the same shape."""
        inputs = tf.cast(inputs, tf.float32)

        @tf.custom_gradient
        def _relu(x):
            y = tf.numpy_function(self._relu_on_host, [x], tf.float32)
            # numpy_function drops static shape information; restore it so
            # the following Dense layer can be built.
            y.set_shape(x.shape)

            def grad(upstream):
                return upstream * tf.cast(x > 0, upstream.dtype)

            return y, grad

        return _relu(inputs)

    def compute_output_shape(self, input_shape):
        """ReLU is element-wise, so the output shape equals the input shape."""
        return input_shape


# Locations of the label CSV and of the per-class training image folders on
# the mounted Drive.
labels_csv = '/content/drive/My Drive/Acads/MEngAI/CS 239/Datasets/Distracted Driving/driver_imgs_list.csv'
train_dir = '/content/drive/My Drive/Acads/MEngAI/CS 239/Datasets/Distracted Driving/imgs/train'

# Read the image list and turn each bare file name into a full path of the
# form <train_dir>/<classname>/<file name>.
df = pd.read_csv(labels_csv)
df['img'] = [
    os.path.join(train_dir, class_name, file_name)
    for class_name, file_name in zip(df['classname'], df['img'])
]

# Paths to load, and one-hot labels with one column per distinct classname.
image_paths = df['img'].values
labels = pd.get_dummies(df['classname']).values

# Read every image into memory as one array (a progress bar tracks loading).
# NOTE(review): nothing here resizes the images, while the model defined
# further down is built for 128x128x3 inputs - presumably the files on disk
# already have that size; TODO confirm. Holding the full dataset in memory
# should eventually be replaced by a streaming data loader.
progress = tqdm(image_paths, desc="Processing images")
images = np.array([plt.imread(path) for path in progress])

# Rescale pixel values from the 0..255 range to 0..1 on the GPU.
images = normalize_image_gpu(images, 0.0, 255.0)

# Hold out 20% of the samples for validation; the fixed seed keeps the split
# reproducible.
split = train_test_split(images, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split

# Convolutional part of VGG16 with ImageNet weights and no classifier head,
# frozen so that only the layers added below are trained.
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
base_model.trainable = False

# One output unit per distinct class name in the label CSV.
num_classes = len(np.unique(df['classname']))

# Classification head: flatten -> CUDA-backed ReLU -> dense(256) -> dropout
# -> softmax over the classes.
x = Flatten()(base_model.output)
x = CustomReLULayer()(x)  # custom CUDA-based ReLU layer
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(num_classes, activation='softmax')(x)

# Assemble the end-to-end model from the VGG16 input to the softmax output.
model = Model(inputs=base_model.input, outputs=predictions)

# Adam with default settings; categorical cross-entropy matches the one-hot
# labels.
optimizer = Adam()
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Fit the classification head for 10 epochs in mini-batches of 64, reporting
# loss/accuracy on the held-out validation split after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_val, y_val),
)


Processing images: 100%|██████████| 22424/22424 [01:50<00:00, 202.77it/s]
