# In [None]:
import cudf
import numpy as np
import pandas as pd

# Display the installed cuDF version (the only library version reported here)
print(f"cuDF version: {cudf.__version__}")

# Load a CSV file with cuDF
def load_data(file_path):
    """Read the CSV at ``file_path`` into a cuDF DataFrame and return it.

    Args:
        file_path: Location of the CSV file, passed unchanged to
            ``cudf.read_csv``.

    Returns:
        The DataFrame produced by ``cudf.read_csv``.
    """
    return cudf.read_csv(file_path)

# Small in-memory sample written to disk so the loading step below has a
# file to read (swap in the path of a real CSV file to use your own data).
data = dict(
    id=[1, 2, 3, 4, 5],
    name=["Alice", "Bob", "Charlie", "David", "Eva"],
    age=[25, 30, 35, 40, None],  # None is written as an empty CSV field
    salary=[50000, 60000, 75000, 100000, 120000],
)
pd.DataFrame.from_dict(data).to_csv("sample_data.csv", index=False)

# Load the dataset using RAPIDS cuDF
df = load_data("sample_data.csv")
print("Loaded Data:")
print(df)

# Handling missing values: replace missing ages with the mean age.
# The filled column is assigned back explicitly rather than calling
# fillna(inplace=True) on df["age"]; an in-place fill on a selected Series
# only reaches df when that Series happens to share the frame's data.
df["age"] = df["age"].fillna(df["age"].mean())
print("Data after handling missing values:")
print(df)

# Sorting: highest salary first. sort_values returns a new frame, so df
# itself keeps its original row order.
df_sorted = df.sort_values(by="salary", ascending=False)
print("Sorted Data by Salary:")
print(df_sorted)

# Feature Engineering: bucket salary into labelled income brackets.
# With cut's default right-closed intervals the bins are
# (0, 60000] -> Low, (60000, 100000] -> Medium, above 100000 -> High.
df["income_bracket"] = cudf.cut(
    df["salary"],
    bins=[0, 60000, 100000, np.inf],
    labels=["Low", "Medium", "High"],
)
print("Data with Income Bracket:")
print(df)

# Save processed data (df in its original row order, not df_sorted)
df.to_csv("processed_data.csv", index=False)
print("Processed data saved to 'processed_data.csv'")
