## 1. Setup and GPU Information

In [None]:
# Check GPU availability and specifications.
# The first call prints the full nvidia-smi status table; the second prints
# only the GPU name, compute capability and total memory in CSV form.
!nvidia-smi
print("\n" + "="*60)
!nvidia-smi --query-gpu=name,compute_cap,memory.total --format=csv

In [None]:
# Check CUDA compiler version (nvcc is the compiler used to build the .cu file below)
!nvcc --version

In [None]:
# Import required libraries
import os
import re
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import files
from IPython.display import Image, display

# Notebook-wide plotting defaults: seaborn grid theme, wide figures, larger text
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

## 2. Upload Dataset Files

In [None]:
# Upload iris_train.csv and iris_test.csv
print("Please upload iris_train.csv and iris_test.csv")
uploaded = files.upload()

# Verify files
print("\nUploaded files:")
!ls -lh *.csv

## 3. Create Configurable CUDA Source Code

In [None]:
%%writefile knn_cuda_configurable.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <cuda_runtime.h>

/*
 * CUDA_CHECK(call): evaluate a CUDA runtime call and abort on failure.
 * When the call does not return cudaSuccess, the source file, line number and
 * the cudaGetErrorString() text are written to stderr and the process exits
 * with EXIT_FAILURE. The do { ... } while(0) wrapper makes the macro behave
 * as a single statement.
 */
#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(error)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

/* One sample read from a CSV row. */
typedef struct {
    double *features;   /* heap array of num_features values, allocated in load_csv */
    int label;          /* class label parsed with atoi from the column after the features */
    int id;             /* index of the point within its dataset */
} DataPoint;

/* Distance to one training sample paired with that sample's label;
   arrays of these are sorted with compare_distance. */
typedef struct {
    double distance;
    int label;
} DistanceLabel;

/* A loaded CSV file: the array of points plus its dimensions. */
typedef struct {
    DataPoint *points;  /* array holding num_points valid entries */
    int num_points;     /* number of rows accepted by load_csv */
    int num_features;   /* feature columns per row (comma-separated fields minus one) */
} Dataset;

/* Thread-block dimensions used for the distance kernel launch. main()
   overwrites both from argv[4] and argv[5] when at least six arguments
   are present. */
int BLOCK_SIZE_X = 16;
int BLOCK_SIZE_Y = 16;

/* Forward declarations for helpers defined later in this file. */
Dataset* load_csv(const char *filename);
void free_dataset(Dataset *dataset);
int find_max_label(Dataset *dataset);
int compare_distance(const void *a, const void *b);

/*
 * Compute the Euclidean distance between every (test, train) pair.
 *
 * Each thread handles one pair: the x grid dimension selects the training
 * sample and the y grid dimension selects the test sample. Feature arrays are
 * row-major with num_features values per sample; the result is written to
 * distances[test * num_train + train]. Threads outside the
 * num_test x num_train range do nothing.
 */
__global__ void calculate_distances_batch_kernel(
    double *train_features,
    double *test_features,
    double *distances,
    int num_train,
    int num_test,
    int num_features
) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;   /* training sample */
    const int row = blockIdx.y * blockDim.y + threadIdx.y;   /* test sample */

    /* Grid is rounded up to whole blocks, so edge threads may be out of range. */
    if (row >= num_test || col >= num_train) {
        return;
    }

    const double *train_row = train_features + col * num_features;
    const double *test_row = test_features + row * num_features;

    double squared = 0.0;
    int f = 0;
    while (f < num_features) {
        const double delta = train_row[f] - test_row[f];
        squared += delta * delta;
        ++f;
    }

    distances[row * num_train + col] = sqrt(squared);
}

/*
 * Classify every test sample with k-nearest-neighbours.
 *
 * The full num_test x num_train distance matrix is computed on the GPU, copied
 * back, and each row is sorted on the host; the k closest training labels vote
 * and the class with the most votes (lowest index on ties) is stored.
 *
 * d_train_features  device array, num_train * num_features, row-major
 * d_train_labels    device array of num_train labels
 * h_test_features   host array, num_test * num_features, row-major
 * h_predictions     host output array of num_test predicted labels
 * k                 neighbours to consult; values above num_train are clamped
 * num_classes       labels outside [0, num_classes) are ignored when voting
 *
 * With no training samples or no classes every prediction is set to -1.
 * Exits the process on CUDA errors, invalid block sizes or host allocation
 * failure (same policy as CUDA_CHECK).
 */
void knn_predict_batch_gpu(
    double *d_train_features,
    int *d_train_labels,
    double *h_test_features,
    int *h_predictions,
    int num_train,
    int num_test,
    int num_features,
    int k,
    int num_classes
) {
    /* Nothing to classify. */
    if (num_test <= 0) {
        return;
    }

    /* No neighbours or no classes to vote for: report "no prediction". */
    if (num_train <= 0 || num_classes <= 0) {
        for (int t = 0; t < num_test; t++) {
            h_predictions[t] = -1;
        }
        return;
    }

    /* A non-positive block size would divide by zero in the grid computation. */
    if (BLOCK_SIZE_X <= 0 || BLOCK_SIZE_Y <= 0) {
        fprintf(stderr, "Error: invalid block size (%d, %d)\n",
                BLOCK_SIZE_X, BLOCK_SIZE_Y);
        exit(EXIT_FAILURE);
    }

    /* Only num_train sorted neighbours exist; reading beyond them is out of bounds. */
    int neighbours = k;
    if (neighbours > num_train) {
        fprintf(stderr, "Warning: k=%d exceeds the %d training samples; using k=%d\n",
                k, num_train, num_train);
        neighbours = num_train;
    }

    /* Byte counts are computed in size_t so large datasets do not overflow int. */
    const size_t test_bytes = (size_t)num_test * (size_t)num_features * sizeof(double);
    const size_t dist_count = (size_t)num_test * (size_t)num_train;
    const size_t dist_bytes = dist_count * sizeof(double);

    double *d_test_features, *d_distances;
    CUDA_CHECK(cudaMalloc(&d_test_features, test_bytes));
    CUDA_CHECK(cudaMalloc(&d_distances, dist_bytes));

    CUDA_CHECK(cudaMemcpy(d_test_features, h_test_features,
                         test_bytes, cudaMemcpyHostToDevice));

    dim3 threads_per_block(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 num_blocks((num_train + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X,
                    (num_test + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y);

    printf("CUDA Configuration: Block size (%d, %d), Grid size (%d, %d)\n",
           BLOCK_SIZE_X, BLOCK_SIZE_Y, num_blocks.x, num_blocks.y);

    calculate_distances_batch_kernel<<<num_blocks, threads_per_block>>>(
        d_train_features, d_test_features, d_distances,
        num_train, num_test, num_features
    );
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    /* Host buffers; the two scratch buffers are reused for every test sample. */
    double *h_distances = (double*)malloc(dist_bytes);
    int *h_labels = (int*)malloc((size_t)num_train * sizeof(int));
    DistanceLabel *dist_labels =
        (DistanceLabel*)malloc((size_t)num_train * sizeof(DistanceLabel));
    int *votes = (int*)malloc((size_t)num_classes * sizeof(int));
    if (!h_distances || !h_labels || !dist_labels || !votes) {
        fprintf(stderr, "Error: host memory allocation failed in knn_predict_batch_gpu\n");
        exit(EXIT_FAILURE);
    }

    CUDA_CHECK(cudaMemcpy(h_distances, d_distances,
                         dist_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_labels, d_train_labels,
                         (size_t)num_train * sizeof(int), cudaMemcpyDeviceToHost));

    for (int t = 0; t < num_test; t++) {
        const double *row = h_distances + (size_t)t * (size_t)num_train;
        for (int i = 0; i < num_train; i++) {
            dist_labels[i].distance = row[i];
            dist_labels[i].label = h_labels[i];
        }

        qsort(dist_labels, num_train, sizeof(DistanceLabel), compare_distance);

        memset(votes, 0, (size_t)num_classes * sizeof(int));
        for (int i = 0; i < neighbours; i++) {
            int label = dist_labels[i].label;
            if (label >= 0 && label < num_classes) {
                votes[label]++;
            }
        }

        /* Strict '>' keeps the lowest class index when votes are tied. */
        int max_votes = -1;
        int predicted_label = -1;
        for (int i = 0; i < num_classes; i++) {
            if (votes[i] > max_votes) {
                max_votes = votes[i];
                predicted_label = i;
            }
        }

        h_predictions[t] = predicted_label;
    }

    free(votes);
    free(dist_labels);
    free(h_distances);
    free(h_labels);
    CUDA_CHECK(cudaFree(d_test_features));
    CUDA_CHECK(cudaFree(d_distances));
}

/*
 * qsort comparator for DistanceLabel entries: orders by ascending distance.
 * Returns -1, 0 or 1; the label field is not consulted.
 */
int compare_distance(const void *a, const void *b) {
    const double lhs = ((const DistanceLabel*)a)->distance;
    const double rhs = ((const DistanceLabel*)b)->distance;
    return (lhs > rhs) - (lhs < rhs);
}

/*
 * Entry point.
 *
 * Usage: knn_cuda [train.csv test.csv k [block_x block_y]]
 *
 * Loads both CSV files, uploads the training set to the GPU, classifies the
 * test set with knn_predict_batch_gpu and prints a single
 * "RESULTS: Accuracy=..%, Time=.. ms, k=.." line. The elapsed time is measured
 * with CUDA events around the whole prediction call, so it includes the
 * host-side sorting and voting. Returns 0 on success and 1 on any input or
 * configuration error.
 */
int main(int argc, char *argv[]) {
    /* const: string literals must not be bound to a mutable char* in C++. */
    const char *train_file = "iris_train.csv";
    const char *test_file = "iris_test.csv";
    int k = 3;

    if (argc >= 4) {
        train_file = argv[1];
        test_file = argv[2];
        k = atoi(argv[3]);
    }

    if (argc >= 6) {
        BLOCK_SIZE_X = atoi(argv[4]);
        BLOCK_SIZE_Y = atoi(argv[5]);
    }

    /* atoi yields 0 for non-numeric text, so this also rejects garbage input. */
    if (k < 1) {
        fprintf(stderr, "Error: k must be a positive integer (got %d)\n", k);
        return 1;
    }
    if (BLOCK_SIZE_X < 1 || BLOCK_SIZE_Y < 1) {
        fprintf(stderr, "Error: block sizes must be positive integers (got %d, %d)\n",
                BLOCK_SIZE_X, BLOCK_SIZE_Y);
        return 1;
    }

    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    if (device_count == 0) {
        fprintf(stderr, "No CUDA devices found!\n");
        return 1;
    }

    cudaDeviceProp device_prop;
    CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0));
    printf("GPU: %s (Compute %d.%d)\n", device_prop.name, device_prop.major, device_prop.minor);

    /* Reject launch shapes the device cannot run; the division avoids
       overflowing the X*Y product. */
    if (BLOCK_SIZE_X > device_prop.maxThreadsDim[0] ||
        BLOCK_SIZE_Y > device_prop.maxThreadsDim[1] ||
        BLOCK_SIZE_X > device_prop.maxThreadsPerBlock / BLOCK_SIZE_Y) {
        fprintf(stderr,
                "Error: block size (%d, %d) exceeds device limits (max %d threads per block)\n",
                BLOCK_SIZE_X, BLOCK_SIZE_Y, device_prop.maxThreadsPerBlock);
        return 1;
    }

    Dataset *train_data = load_csv(train_file);
    if (!train_data) {
        fprintf(stderr, "Error: could not load training data from '%s'\n", train_file);
        return 1;
    }

    Dataset *test_data = load_csv(test_file);
    if (!test_data) {
        fprintf(stderr, "Error: could not load test data from '%s'\n", test_file);
        free_dataset(train_data);
        return 1;
    }

    if (train_data->num_points == 0 || test_data->num_points == 0) {
        fprintf(stderr, "Error: no usable rows (%d train, %d test)\n",
                train_data->num_points, test_data->num_points);
        free_dataset(train_data);
        free_dataset(test_data);
        return 1;
    }

    if (train_data->num_features != test_data->num_features) {
        fprintf(stderr, "Error: Feature dimension mismatch!\n");
        free_dataset(train_data);
        free_dataset(test_data);
        return 1;
    }

    int max_train_label = find_max_label(train_data);
    int max_test_label = find_max_label(test_data);
    int num_classes = (max_train_label > max_test_label ? max_train_label : max_test_label) + 1;

    int num_train = train_data->num_points;
    int num_test = test_data->num_points;
    int num_features = train_data->num_features;

    /* More neighbours than training samples cannot be consulted. */
    if (k > num_train) {
        fprintf(stderr, "Warning: k=%d exceeds the %d training samples; using k=%d\n",
                k, num_train, num_train);
        k = num_train;
    }

    printf("Dataset: %d train, %d test, %d features, %d classes\n",
           num_train, num_test, num_features, num_classes);

    const size_t train_feature_bytes =
        (size_t)num_train * (size_t)num_features * sizeof(double);
    const size_t test_feature_bytes =
        (size_t)num_test * (size_t)num_features * sizeof(double);

    double *h_train_features = (double*)malloc(train_feature_bytes);
    int *h_train_labels = (int*)malloc((size_t)num_train * sizeof(int));
    double *h_test_features = (double*)malloc(test_feature_bytes);
    int *h_test_labels = (int*)malloc((size_t)num_test * sizeof(int));
    int *h_predictions = (int*)malloc((size_t)num_test * sizeof(int));

    if (!h_train_features || !h_train_labels || !h_test_features ||
        !h_test_labels || !h_predictions) {
        fprintf(stderr, "Error: host memory allocation failed\n");
        free(h_train_features);
        free(h_train_labels);
        free(h_test_features);
        free(h_test_labels);
        free(h_predictions);
        free_dataset(train_data);
        free_dataset(test_data);
        return 1;
    }

    /* Flatten the per-point feature arrays into contiguous row-major buffers. */
    for (int i = 0; i < num_train; i++) {
        for (int j = 0; j < num_features; j++) {
            h_train_features[(size_t)i * num_features + j] = train_data->points[i].features[j];
        }
        h_train_labels[i] = train_data->points[i].label;
    }

    for (int i = 0; i < num_test; i++) {
        for (int j = 0; j < num_features; j++) {
            h_test_features[(size_t)i * num_features + j] = test_data->points[i].features[j];
        }
        h_test_labels[i] = test_data->points[i].label;
    }

    double *d_train_features;
    int *d_train_labels;
    CUDA_CHECK(cudaMalloc(&d_train_features, train_feature_bytes));
    CUDA_CHECK(cudaMalloc(&d_train_labels, (size_t)num_train * sizeof(int)));

    CUDA_CHECK(cudaMemcpy(d_train_features, h_train_features,
                         train_feature_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_train_labels, h_train_labels,
                         (size_t)num_train * sizeof(int),
                         cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));

    knn_predict_batch_gpu(d_train_features, d_train_labels, h_test_features,
                         h_predictions, num_train, num_test, num_features,
                         k, num_classes);

    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float execution_time = 0;
    CUDA_CHECK(cudaEventElapsedTime(&execution_time, start, stop));

    int correct_predictions = 0;
    for (int i = 0; i < num_test; i++) {
        if (h_predictions[i] == h_test_labels[i]) {
            correct_predictions++;
        }
    }

    /* num_test > 0 is guaranteed by the empty-dataset check above. */
    double accuracy = (double)correct_predictions / num_test * 100.0;

    /* The notebook parses this line with regular expressions; keep its format. */
    printf("\nRESULTS: Accuracy=%.2f%%, Time=%.4f ms, k=%d\n",
           accuracy, execution_time, k);

    free(h_train_features);
    free(h_train_labels);
    free(h_test_features);
    free(h_test_labels);
    free(h_predictions);
    CUDA_CHECK(cudaFree(d_train_features));
    CUDA_CHECK(cudaFree(d_train_labels));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    free_dataset(train_data);
    free_dataset(test_data);

    return 0;
}

/*
 * Load a comma-separated file of numeric rows into a newly allocated Dataset.
 *
 * Every row is "f1,f2,...,fN,label". The number of feature columns is taken
 * from the first row that contains at least one comma; rows with a different
 * field count, or with empty fields (which strtok collapses, leaving too few
 * tokens), are skipped. Values are converted with atof/atoi.
 *
 * NOTE(review): a header row, if present, is parsed like data (atof/atoi
 * yield 0 for non-numeric text) -- confirm the inputs are header-less.
 *
 * Returns NULL if the file cannot be opened, is empty, or memory runs out;
 * otherwise a Dataset the caller releases with free_dataset(). The returned
 * dataset may contain zero points if no row was usable.
 */
Dataset* load_csv(const char *filename) {
    FILE *file = fopen(filename, "r");
    if (!file) {
        perror(filename);
        return NULL;
    }

    /* First pass: count lines to size the points array (an upper bound). */
    int num_lines = 0;
    char buffer[1024];
    while (fgets(buffer, sizeof(buffer), file)) num_lines++;
    rewind(file);

    if (num_lines == 0) {
        fclose(file);
        return NULL;
    }

    Dataset *dataset = (Dataset*)malloc(sizeof(Dataset));
    if (!dataset) {
        fclose(file);
        return NULL;
    }

    dataset->points = (DataPoint*)malloc((size_t)num_lines * sizeof(DataPoint));
    if (!dataset->points) {
        free(dataset);
        fclose(file);
        return NULL;
    }
    dataset->num_points = 0;
    dataset->num_features = 0;

    /* Second pass: parse each row. */
    while (fgets(buffer, sizeof(buffer), file)) {
        /* Strip LF and CRLF line endings. */
        buffer[strcspn(buffer, "\r\n")] = 0;
        if (buffer[0] == '\0') continue;

        int fields_in_line = 1;
        for (const char *p = buffer; *p != '\0'; p++) {
            if (*p == ',') fields_in_line++;
        }

        /* A row needs at least one feature column plus the label. */
        if (fields_in_line < 2) continue;

        if (dataset->num_features == 0) {
            dataset->num_features = fields_in_line - 1;
        }

        if (fields_in_line != dataset->num_features + 1) continue;

        double *features =
            (double*)malloc((size_t)dataset->num_features * sizeof(double));
        if (!features) {
            fclose(file);
            free_dataset(dataset);
            return NULL;
        }

        char *token = strtok(buffer, ",");
        int feature_idx = 0;

        while (token != NULL && feature_idx < dataset->num_features) {
            features[feature_idx] = atof(token);
            token = strtok(NULL, ",");
            feature_idx++;
        }

        /* Too few tokens means the row had empty fields: storing it would
           leave features or the label uninitialised, so drop the row. */
        if (feature_idx < dataset->num_features || token == NULL) {
            free(features);
            continue;
        }

        DataPoint *point = &dataset->points[dataset->num_points];
        point->features = features;
        point->label = atoi(token);
        point->id = dataset->num_points;
        dataset->num_points++;
    }

    fclose(file);
    return dataset;
}

/*
 * Return the largest label stored in the dataset, or -1 when the dataset
 * pointer is NULL or it holds no points.
 */
int find_max_label(Dataset *dataset) {
    if (dataset == NULL || dataset->num_points == 0) {
        return -1;
    }

    const DataPoint *pts = dataset->points;
    int best = pts[0].label;
    for (int idx = 1; idx < dataset->num_points; ++idx) {
        const int candidate = pts[idx].label;
        best = (candidate > best) ? candidate : best;
    }
    return best;
}

/*
 * Release a Dataset: each point's feature array, the points array, then the
 * Dataset itself. A NULL dataset is ignored.
 */
void free_dataset(Dataset *dataset) {
    if (dataset == NULL) {
        return;
    }

    DataPoint *pts = dataset->points;
    if (pts != NULL) {
        const int count = dataset->num_points;
        for (int idx = 0; idx < count; ++idx) {
            /* free(NULL) is a no-op, so no per-point NULL test is needed. */
            free(pts[idx].features);
        }
        free(pts);
    }

    free(dataset);
}

## 4. Compile CUDA Code

In [None]:
# Compile with appropriate architecture for Colab GPUs
!nvcc -O3 -arch=sm_75 -o knn_cuda knn_cuda_configurable.cu -lm

import os
if os.path.exists('knn_cuda'):
    print("\nâœ“ Compilation successful!")
else:
    print("\nâœ— Compilation failed!")
    print("\nTrying alternative architecture...")
    !nvcc -O3 -arch=sm_37 -o knn_cuda knn_cuda_configurable.cu -lm

## 5. Performance Analysis - Varying Block Sizes and Thread Configurations

In [None]:
# Define different block size configurations to test
block_configurations = [
    (4, 4),
    (8, 8),
    (16, 16),
    (32, 32),
    (8, 16),
    (16, 8),
    (32, 16),
    (16, 32),
    (64, 4),
    (4, 64),
    (128, 2),
    (2, 128)
]


def run_knn_benchmark(block_x, block_y, k=3):
    """Run ./knn_cuda once and parse its RESULTS line.

    Args:
        block_x: Thread-block width passed to the program.
        block_y: Thread-block height passed to the program.
        k: Number of neighbours passed to the program.

    Returns:
        (exec_time_ms, accuracy_percent, error). On success ``error`` is None;
        on failure the two numbers are None and ``error`` describes why.
    """
    # Argument list instead of a shell string: no quoting or injection issues.
    cmd = ["./knn_cuda", "iris_train.csv", "iris_test.csv",
           str(k), str(block_x), str(block_y)]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        return None, None, f"could not launch ./knn_cuda: {exc}"

    time_match = re.search(r'Time=([0-9.]+)\s*ms', result.stdout)
    acc_match = re.search(r'Accuracy=([0-9.]+)%', result.stdout)
    if result.returncode != 0 or not (time_match and acc_match):
        detail = result.stderr.strip() or "no RESULTS line in program output"
        return None, None, f"exit code {result.returncode}: {detail}"

    return float(time_match.group(1)), float(acc_match.group(1)), None


# Store results
results = []
failed_configs = []
baseline_time = None

print("Running performance tests with different CUDA configurations...")
print("="*70)

for block_x, block_y in block_configurations:
    threads_per_block = block_x * block_y

    exec_time, accuracy, error = run_knn_benchmark(block_x, block_y)

    if error is not None:
        # Report the failure instead of silently dropping the configuration.
        failed_configs.append(f"{block_x}x{block_y}")
        print(f"Config: {block_x:3d}x{block_y:<3d} ({threads_per_block:4d} threads) | FAILED: {error}")
        continue

    # Calculate speedup (using first successful configuration as baseline)
    if baseline_time is None:
        baseline_time = exec_time
    speedup = baseline_time / exec_time if exec_time > 0 else float('nan')

    results.append({
        'Block_X': block_x,
        'Block_Y': block_y,
        'Threads_Per_Block': threads_per_block,
        'Config': f"{block_x}x{block_y}",
        'Execution_Time_ms': exec_time,
        'Accuracy': accuracy,
        'Speedup': speedup
    })

    print(f"Config: {block_x:3d}x{block_y:<3d} ({threads_per_block:4d} threads) | "
          f"Time: {exec_time:7.4f} ms | Accuracy: {accuracy:6.2f}% | Speedup: {speedup:.3f}x")

print("="*70)
print(f"\nCompleted {len(results)} configurations")
if failed_configs:
    print(f"Failed configurations: {', '.join(failed_configs)}")
if not results:
    print("WARNING: no configuration produced results; the cells below need at least one.")

# Create DataFrame
df_results = pd.DataFrame(results)
display(df_results)

## 6. Generate Performance Graphs

In [None]:
# Graph 1: Configuration Parameters vs Execution Time
fig, ax = plt.subplots(figsize=(14, 7))

x_labels = df_results['Config']
x_pos = np.arange(len(x_labels))
exec_times = df_results['Execution_Time_ms']

bars = ax.bar(x_pos, exec_times, color=sns.color_palette("viridis", len(x_labels)))

# Add value labels on bars (time plus the thread count of that configuration)
for bar, time, threads in zip(bars, exec_times, df_results['Threads_Per_Block']):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{time:.3f}ms\n({threads}t)',
            ha='center', va='bottom', fontsize=9)

ax.set_xlabel('Block Configuration (X x Y)', fontsize=13, fontweight='bold')
ax.set_ylabel('Execution Time (ms)', fontsize=13, fontweight='bold')
ax.set_title('CUDA Performance: Block Configuration vs Execution Time', 
             fontsize=15, fontweight='bold', pad=20)
ax.set_xticks(x_pos)
ax.set_xticklabels(x_labels, rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3, linestyle='--')

plt.tight_layout()
plt.savefig('cuda_config_vs_time.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Graph saved as 'cuda_config_vs_time.png'")

In [None]:
# Graph 2: Configuration Parameters vs Speedup
fig, ax = plt.subplots(figsize=(14, 7))

# Derive the axis data here so this cell does not depend on variables
# left behind by the previous plotting cell.
x_labels = df_results['Config']
x_pos = np.arange(len(x_labels))
speedups = df_results['Speedup']
bars = ax.bar(x_pos, speedups, color=sns.color_palette("rocket", len(x_labels)))

# Add value labels on bars (speedup plus the thread count of that configuration)
for bar, speedup, threads in zip(bars, speedups, df_results['Threads_Per_Block']):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{speedup:.3f}x\n({threads}t)',
            ha='center', va='bottom', fontsize=9)

# Add baseline reference line
ax.axhline(y=1.0, color='red', linestyle='--', linewidth=2, label='Baseline (1.0x)')

ax.set_xlabel('Block Configuration (X x Y)', fontsize=13, fontweight='bold')
ax.set_ylabel('Speedup (relative to first config)', fontsize=13, fontweight='bold')
ax.set_title('CUDA Performance: Block Configuration vs Speedup', 
             fontsize=15, fontweight='bold', pad=20)
ax.set_xticks(x_pos)
ax.set_xticklabels(x_labels, rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3, linestyle='--')
ax.legend(fontsize=11)

plt.tight_layout()
plt.savefig('cuda_config_vs_speedup.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Graph saved as 'cuda_config_vs_speedup.png'")

In [None]:
# Graph 3: Threads Per Block vs Execution Time
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Sort by threads per block
df_sorted = df_results.sort_values('Threads_Per_Block')

# Several block shapes share the same thread count (e.g. 16x16 and 64x4), so
# a line through the raw rows would jump vertically at those x values. Draw
# every configuration as a point and run the trend line through the mean.
threads_mean = df_sorted.groupby('Threads_Per_Block')[['Execution_Time_ms', 'Speedup']].mean()

# Plot 1: Execution Time
ax1.scatter(df_sorted['Threads_Per_Block'], df_sorted['Execution_Time_ms'],
            s=40, alpha=0.5, color='#2E86AB', label='Individual configurations')
ax1.plot(threads_mean.index, threads_mean['Execution_Time_ms'],
         marker='o', linewidth=2, markersize=8, color='#2E86AB',
         label='Mean per thread count')
ax1.set_xlabel('Threads Per Block', fontsize=12, fontweight='bold')
ax1.set_ylabel('Execution Time (ms)', fontsize=12, fontweight='bold')
ax1.set_title('Threads Per Block vs Execution Time', fontsize=13, fontweight='bold')
ax1.grid(True, alpha=0.3)
ax1.set_xscale('log', base=2)
ax1.legend(fontsize=10)

# Plot 2: Speedup
ax2.scatter(df_sorted['Threads_Per_Block'], df_sorted['Speedup'],
            s=40, alpha=0.5, color='#A23B72', label='Individual configurations')
ax2.plot(threads_mean.index, threads_mean['Speedup'],
         marker='s', linewidth=2, markersize=8, color='#A23B72',
         label='Mean per thread count')
ax2.axhline(y=1.0, color='red', linestyle='--', linewidth=1, alpha=0.5)
ax2.set_xlabel('Threads Per Block', fontsize=12, fontweight='bold')
ax2.set_ylabel('Speedup', fontsize=12, fontweight='bold')
ax2.set_title('Threads Per Block vs Speedup', fontsize=13, fontweight='bold')
ax2.grid(True, alpha=0.3)
ax2.set_xscale('log', base=2)
ax2.legend(fontsize=10)

plt.tight_layout()
plt.savefig('cuda_threads_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Graph saved as 'cuda_threads_analysis.png'")

In [None]:
# Graph 4: Heatmap of Execution Time by Block Dimensions
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Create pivot tables (rows: Block_Y, columns: Block_X). Only the tested
# (X, Y) combinations have values; the remaining cells are NaN.
pivot_time = df_results.pivot_table(values='Execution_Time_ms', 
                                     index='Block_Y', columns='Block_X', aggfunc='mean')
pivot_speedup = df_results.pivot_table(values='Speedup', 
                                        index='Block_Y', columns='Block_X', aggfunc='mean')

# Heatmap 1: Execution Time
sns.heatmap(pivot_time, annot=True, fmt='.3f', cmap='YlOrRd', ax=ax1, 
            cbar_kws={'label': 'Time (ms)'}, linewidths=0.5)
ax1.set_title('Execution Time Heatmap (ms)', fontsize=13, fontweight='bold')
ax1.set_xlabel('Block X Dimension', fontsize=11, fontweight='bold')
ax1.set_ylabel('Block Y Dimension', fontsize=11, fontweight='bold')

# Heatmap 2: Speedup
sns.heatmap(pivot_speedup, annot=True, fmt='.3f', cmap='RdYlGn', ax=ax2, 
            cbar_kws={'label': 'Speedup (x)'}, linewidths=0.5)
ax2.set_title('Speedup Heatmap', fontsize=13, fontweight='bold')
ax2.set_xlabel('Block X Dimension', fontsize=11, fontweight='bold')
ax2.set_ylabel('Block Y Dimension', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('cuda_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Graph saved as 'cuda_heatmap.png'")

## 7. Capture Execution Screenshots with Different Configurations

In [None]:
# Run and display output for key configurations
key_configs = [(8, 8), (16, 16), (32, 32), (16, 8), (32, 16)]

print("\n" + "="*70)
print("DETAILED EXECUTION SCREENSHOTS FOR KEY CONFIGURATIONS")
print("="*70 + "\n")

for block_x, block_y in key_configs:
    print("\n" + "#"*70)
    print(f"# Configuration: Block Size ({block_x} x {block_y}) - {block_x*block_y} threads per block")
    print("#"*70)
    
    cmd = f"./knn_cuda iris_train.csv iris_test.csv 3 {block_x} {block_y}"
    !{cmd}
    
    print("\n" + "-"*70 + "\n")

## 8. Statistical Analysis and Best Configuration

In [None]:
# Find best and worst configurations
if df_results.empty:
    # idxmin/idxmax on an empty frame fail with an unhelpful message.
    raise RuntimeError("No benchmark results available - re-run the performance test cell first.")

best_time_idx = df_results['Execution_Time_ms'].idxmin()
worst_time_idx = df_results['Execution_Time_ms'].idxmax()
best_speedup_idx = df_results['Speedup'].idxmax()

print("\n" + "="*70)
print("PERFORMANCE ANALYSIS SUMMARY")
print("="*70)

print("\n✓ BEST Configuration (Fastest Execution):")
print(f"  Block Size: {df_results.loc[best_time_idx, 'Config']}")
print(f"  Threads Per Block: {df_results.loc[best_time_idx, 'Threads_Per_Block']}")
print(f"  Execution Time: {df_results.loc[best_time_idx, 'Execution_Time_ms']:.4f} ms")
print(f"  Speedup: {df_results.loc[best_time_idx, 'Speedup']:.3f}x")
print(f"  Accuracy: {df_results.loc[best_time_idx, 'Accuracy']:.2f}%")

print("\n✗ WORST Configuration (Slowest Execution):")
print(f"  Block Size: {df_results.loc[worst_time_idx, 'Config']}")
print(f"  Threads Per Block: {df_results.loc[worst_time_idx, 'Threads_Per_Block']}")
print(f"  Execution Time: {df_results.loc[worst_time_idx, 'Execution_Time_ms']:.4f} ms")
print(f"  Speedup: {df_results.loc[worst_time_idx, 'Speedup']:.3f}x")
print(f"  Accuracy: {df_results.loc[worst_time_idx, 'Accuracy']:.2f}%")

print("\n📊 STATISTICS:")
print(f"  Average Execution Time: {df_results['Execution_Time_ms'].mean():.4f} ms")
print(f"  Std Deviation (Time): {df_results['Execution_Time_ms'].std():.4f} ms")
print(f"  Min Time: {df_results['Execution_Time_ms'].min():.4f} ms")
print(f"  Max Time: {df_results['Execution_Time_ms'].max():.4f} ms")
print(f"  Time Range: {df_results['Execution_Time_ms'].max() - df_results['Execution_Time_ms'].min():.4f} ms")

# Relative gain of the fastest over the slowest configuration; guarded so a
# zero worst-case time cannot divide by zero.
worst_time = df_results.loc[worst_time_idx, 'Execution_Time_ms']
best_time = df_results.loc[best_time_idx, 'Execution_Time_ms']
improvement = ((worst_time - best_time) / worst_time) * 100 if worst_time > 0 else 0.0

print(f"\n⚡ Performance Improvement: {improvement:.2f}%")
print(f"   (Best config is {improvement:.2f}% faster than worst config)")

print("\n" + "="*70)

## 9. Export Results and Download

In [None]:
# Save results to CSV
df_results.to_csv('cuda_performance_results.csv', index=False)
print("✓ Results saved to 'cuda_performance_results.csv'")

# Query the GPU description before opening the report so a missing
# nvidia-smi cannot leave a half-written file behind.
try:
    gpu_info = subprocess.run(["nvidia-smi", "--query-gpu=name,compute_cap,memory.total", "--format=csv,noheader"], 
                             capture_output=True, text=True)
    gpu_description = gpu_info.stdout
except OSError as exc:
    gpu_description = f"unavailable ({exc})\n"

# Create detailed report (uses best_time_idx and improvement from the summary cell)
with open('cuda_performance_report.txt', 'w', encoding='utf-8') as f:
    f.write("="*70 + "\n")
    f.write("CUDA KNN PERFORMANCE ANALYSIS REPORT\n")
    f.write("="*70 + "\n\n")
    
    f.write("GPU Information:\n")
    f.write(f"  {gpu_description}\n")
    
    f.write(f"\nTotal Configurations Tested: {len(df_results)}\n")
    f.write("\nBest Configuration:\n")
    f.write(f"  Block Size: {df_results.loc[best_time_idx, 'Config']}\n")
    f.write(f"  Threads: {df_results.loc[best_time_idx, 'Threads_Per_Block']}\n")
    f.write(f"  Time: {df_results.loc[best_time_idx, 'Execution_Time_ms']:.4f} ms\n")
    f.write(f"  Speedup: {df_results.loc[best_time_idx, 'Speedup']:.3f}x\n")
    
    f.write(f"\nPerformance Improvement: {improvement:.2f}%\n")
    
    f.write("\n" + "="*70 + "\n")
    f.write("DETAILED RESULTS\n")
    f.write("="*70 + "\n\n")
    f.write(df_results.to_string())

print("✓ Report saved to 'cuda_performance_report.txt'")

# List all generated files
print("\nGenerated Files:")
print("  📊 cuda_config_vs_time.png")
print("  📊 cuda_config_vs_speedup.png")
print("  📊 cuda_threads_analysis.png")
print("  📊 cuda_heatmap.png")
print("  📄 cuda_performance_results.csv")
print("  📄 cuda_performance_report.txt")

In [None]:
# Download all results
print("Downloading all files...\n")

files_to_download = [
    'cuda_config_vs_time.png',
    'cuda_config_vs_speedup.png',
    'cuda_threads_analysis.png',
    'cuda_heatmap.png',
    'cuda_performance_results.csv',
    'cuda_performance_report.txt'
]

downloaded_count = 0
for file in files_to_download:
    if os.path.exists(file):
        files.download(file)
        print(f"✓ Downloaded: {file}")
        downloaded_count += 1
    else:
        # Say which outputs were never produced instead of skipping silently.
        print(f"✗ Missing (not downloaded): {file}")

if downloaded_count == len(files_to_download):
    print("\n✅ All files ready for download!")
else:
    print(f"\n{downloaded_count} of {len(files_to_download)} files were available for download.")