# RAPIDS Data Processing Lab

This notebook demonstrates how to load, clean, sort, and transform a small dataset on the GPU with RAPIDS cuDF, and then times the same steps in pandas for comparison.

In [None]:
import cudf
import numpy as np
import pandas as pd
import time

# Report which cuDF release this kernel is running
cudf_version = cudf.__version__
print(f"cuDF version: {cudf_version}")

In [None]:
# CSV loader backed by cuDF
def load_data(file_path):
    """Read the CSV at ``file_path`` with ``cudf.read_csv`` and return the result.

    Args:
        file_path: Location of the CSV file, passed straight through to
            ``cudf.read_csv``.

    Returns:
        Whatever ``cudf.read_csv`` produces for that file.
    """
    return cudf.read_csv(file_path)

# Small demo dataset, written to disk so the following cells have a CSV to read
# (point the later load call at a real CSV file path to use your own data)
data = dict(
    id=[1, 2, 3, 4, 5],
    name=["Alice", "Bob", "Charlie", "David", "Eva"],
    age=[25, 30, 35, 40, None],  # one missing value, filled in by a later cell
    salary=[50000, 60000, 75000, 100000, 120000],
)
pd.DataFrame.from_dict(data).to_csv("sample_data.csv", index=False)

In [None]:
# Load the dataset using RAPIDS cuDF and record how long the read takes.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring short durations.
# NOTE(review): this is the first cuDF call in the notebook, so the measured
# time may also include one-time GPU start-up cost, not just CSV parsing.
start_time = time.perf_counter()
df = load_data("sample_data.csv")
cuDF_load_time = time.perf_counter() - start_time
print("Loaded Data:")
print(df)
print(f"Data loaded in {cuDF_load_time:.5f} seconds using cuDF.")

In [None]:
# Handling missing values: replace missing ages with the mean of the column.
# The filled column is assigned back to df rather than calling
# fillna(..., inplace=True) on df["age"]: an in-place call on a column
# selection is not guaranteed to update the parent frame (under copy-on-write
# semantics it only changes a temporary), and assign-back matches the form
# used for the pandas comparison later in this notebook.
# time.perf_counter() is a monotonic, high-resolution clock suited to timing
# short operations.
start_time = time.perf_counter()
df["age"] = df["age"].fillna(df["age"].mean())
cuDF_missing_values_time = time.perf_counter() - start_time
print("Data after handling missing values:")
print(df)
print(f"Missing values handled in {cuDF_missing_values_time:.5f} seconds using cuDF.")

In [None]:
# Sorting: order the rows by salary, highest first (no rows are filtered out).
# The result is stored in df_sorted; df itself is left in its original order.
# time.perf_counter() is a monotonic, high-resolution clock suited to timing
# short operations.
start_time = time.perf_counter()
df_sorted = df.sort_values(by="salary", ascending=False)
cuDF_sort_time = time.perf_counter() - start_time
print("Sorted Data by Salary:")
print(df_sorted)
print(f"Sorting completed in {cuDF_sort_time:.5f} seconds using cuDF.")

In [None]:
# Feature Engineering: add an "income_bracket" column by binning salary into
# three labelled ranges with edges 0, 60000, 100000 and +inf.
# NOTE(review): with cut's default right-inclusive bins a salary of exactly
# 60000 lands in "Low" and exactly 100000 in "Medium" - confirm this is the
# intended boundary behavior.
# time.perf_counter() is a monotonic, high-resolution clock suited to timing
# short operations.
start_time = time.perf_counter()
df["income_bracket"] = cudf.cut(df["salary"], bins=[0, 60000, 100000, np.inf], labels=["Low", "Medium", "High"])
cuDF_feature_engineering_time = time.perf_counter() - start_time
print("Data with Income Bracket:")
print(df)
print(f"Feature engineering completed in {cuDF_feature_engineering_time:.5f} seconds using cuDF.")

In [None]:
# Save processed data: write the cuDF frame to processed_data.csv without the
# index column and record how long the write takes.
# time.perf_counter() is a monotonic, high-resolution clock suited to timing
# short operations.
start_time = time.perf_counter()
df.to_csv("processed_data.csv", index=False)
cuDF_save_time = time.perf_counter() - start_time
print("Processed data saved to 'processed_data.csv'")
print(f"Data saving completed in {cuDF_save_time:.5f} seconds using cuDF.")

In [None]:
# Performance Benchmarking: Comparing RAPIDS cuDF with pandas
#
# Repeats the load / fill / sort / bin / save steps with pandas, timing each
# one, then prints a side-by-side table against the cuDF timings recorded by
# the earlier cells (cuDF_load_time, cuDF_missing_values_time, cuDF_sort_time,
# cuDF_feature_engineering_time, cuDF_save_time). Those cells must have been
# run first.
#
# time.perf_counter() is used for every measurement: it is a monotonic,
# high-resolution clock intended for timing short operations.
print("\nPerformance Comparison: cuDF vs Pandas")

# Pandas Data Loading
start_time = time.perf_counter()
pd_df = pd.read_csv("sample_data.csv")
pandas_load_time = time.perf_counter() - start_time
print(f"Pandas: Data loading time: {pandas_load_time:.5f} seconds")

# Pandas Missing Value Handling (assign back via .loc to avoid chained assignment)
start_time = time.perf_counter()
pd_df.loc[:, "age"] = pd_df["age"].fillna(pd_df["age"].mean())
pandas_missing_values_time = time.perf_counter() - start_time
print(f"Pandas: Missing values handling time: {pandas_missing_values_time:.5f} seconds")

# Pandas Sorting
start_time = time.perf_counter()
pd_df_sorted = pd_df.sort_values(by="salary", ascending=False)
pandas_sort_time = time.perf_counter() - start_time
print(f"Pandas: Sorting time: {pandas_sort_time:.5f} seconds")

# Pandas Feature Engineering (assign via .loc to avoid chained assignment)
start_time = time.perf_counter()
pd_df.loc[:, "income_bracket"] = pd.cut(pd_df["salary"], bins=[0, 60000, 100000, np.inf], labels=["Low", "Medium", "High"])
pandas_feature_engineering_time = time.perf_counter() - start_time
print(f"Pandas: Feature engineering time: {pandas_feature_engineering_time:.5f} seconds")

# Pandas Saving Data
start_time = time.perf_counter()
pd_df.to_csv("processed_data_pandas.csv", index=False)
pandas_save_time = time.perf_counter() - start_time
print(f"Pandas: Data saving time: {pandas_save_time:.5f} seconds")

# Summary Table: one (label, cuDF seconds, pandas seconds) entry per operation
benchmark_rows = [
    ("Data Loading", cuDF_load_time, pandas_load_time),
    ("Missing Values Handling", cuDF_missing_values_time, pandas_missing_values_time),
    ("Sorting", cuDF_sort_time, pandas_sort_time),
    ("Feature Engineering", cuDF_feature_engineering_time, pandas_feature_engineering_time),
    ("Data Saving", cuDF_save_time, pandas_save_time),
]

print("\nSummary of Execution Time (Lower is Better)")
print(f"{'Operation':<30}{'cuDF (GPU)':<20}{'Pandas (CPU)':<20}")
print("-" * 70)
for operation, cudf_seconds, pandas_seconds in benchmark_rows:
    # Format each timing first, then pad it to the header's column width so
    # the values line up under their headings.
    cudf_cell = f"{cudf_seconds:.5f} sec"
    pandas_cell = f"{pandas_seconds:.5f} sec"
    print(f"{operation:<30}{cudf_cell:<20}{pandas_cell:<20}")

# Derive the conclusion from the measurements instead of asserting a fixed
# outcome, so the printed summary can never contradict the table above.
cudf_faster = [operation for operation, cudf_seconds, pandas_seconds in benchmark_rows if cudf_seconds < pandas_seconds]
if len(cudf_faster) == len(benchmark_rows):
    print("\nSummary: in this run cuDF was faster than pandas for every measured operation.")
elif cudf_faster:
    print(f"\nSummary: in this run cuDF was faster than pandas for: {', '.join(cudf_faster)}.")
else:
    print("\nSummary: in this run pandas was at least as fast as cuDF for every measured operation.")
print(f"Note: these timings cover {len(pd_df)} rows and a single run per operation; GPU speed-ups typically only show on much larger datasets.")