# Use RAPIDS on a single GPU

https://saturncloud.io/docs/examples/python/rapids/qs-01-rapids-single-gpu/


In [1]:
import cudf

from cuml.ensemble import RandomForestClassifier
from cuml.metrics import roc_auc_score
from sklearn.metrics import roc_curve

import matplotlib.pyplot as plt

In [2]:
# Select the GPU for this session by CUDA device index (0 or 1 on a two-GPU box).
import cupy

# NOTE(review): index 1 presumes a second GPU is present; a machine with a
# single GPU would only expose index 0 -- confirm for the target environment.
gpu_index = 1

# Make that device the current one so the CuPy work that follows uses it.
cupy.cuda.Device(gpu_index).use()

In [3]:
# Read the 2019-01 yellow-taxi trip CSV directly from its S3 URL with cuDF,
# asking the reader to parse the pickup/dropoff timestamp columns as dates.
trip_data_url = "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-01.csv"
timestamp_columns = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
taxi = cudf.read_csv(trip_data_url, parse_dates=timestamp_columns)

In [4]:
# Size report for the freshly loaded frame: row count and memory footprint.
# The footprint is the per-column usage summed, expressed in decimal GB
# (bytes / 1e9) and rounded to two places.
total_bytes = taxi.memory_usage(deep=True).sum()
num_rows = len(taxi)
memory_usage = round(total_bytes / 1e9, 2)
print(f"Num rows: {num_rows}, Memory Usage: {memory_usage} GB")

Num rows: 7667792, Memory Usage: 1.08 GB


In [5]:
def prep_df(df: cudf.DataFrame) -> cudf.DataFrame:
    """Build the modelling table from raw taxi trip records.

    Rows whose ``fare_amount`` is not strictly positive are discarded, a
    binary ``target`` is derived (tip greater than 20% of the fare), and a
    handful of pickup-time features are extracted from
    ``tpep_pickup_datetime``.

    Args:
        df: Trip records containing at least ``fare_amount``,
            ``tip_amount``, ``tpep_pickup_datetime`` (datetime typed),
            ``passenger_count``, ``PULocationID`` and ``DOLocationID``.

    Returns:
        A frame holding only the feature columns plus ``target``. Every
        feature is ``float32`` with missing values replaced by ``-1``;
        ``target`` is ``int32``.
    """
    output_columns = [
        "pickup_weekday",
        "pickup_hour",
        "pickup_week_hour",
        "pickup_minute",
        "passenger_count",
        "PULocationID",
        "DOLocationID",
        "target",
    ]

    # Only positive fares survive, so the division below never hits zero.
    has_positive_fare = df["fare_amount"] > 0
    trips = df[has_positive_fare]

    trips["tip_fraction"] = trips["tip_amount"] / trips["fare_amount"]
    trips["target"] = trips["tip_fraction"] > 0.2

    # Calendar features taken from the pickup timestamp.
    pickup_time = trips["tpep_pickup_datetime"]
    trips["pickup_weekday"] = pickup_time.dt.weekday
    trips["pickup_hour"] = pickup_time.dt.hour
    # Hour-of-week index: 24 hours per weekday plus the hour within the day.
    trips["pickup_week_hour"] = (trips["pickup_weekday"] * 24) + trips["pickup_hour"]
    trips["pickup_minute"] = pickup_time.dt.minute

    # Cast everything (target included) to float32 and fill gaps with -1,
    # then bring the label back to an integer dtype.
    features = trips[output_columns].astype("float32").fillna(-1)
    features["target"] = features["target"].astype("int32")

    return features

In [6]:
# Rebind `taxi` to the feature table returned by prep_df; the raw trip frame
# is no longer referenced under this name afterwards.
taxi = prep_df(taxi)

In [7]:
# Class balance of the label: normalize=True yields fractions rather than counts.
taxi["target"].value_counts(normalize=True)

1    0.522546
0    0.477454
Name: target, dtype: float64

In [8]:
# Same size report as for the raw load, now for the prepared feature table:
# number of rows and summed column memory in decimal GB, to two decimals.
num_rows = len(taxi)
bytes_used = taxi.memory_usage(deep=True).sum()
memory_usage = round(bytes_used / 1e9, 2)
print(f"Num rows: {num_rows}, Memory Usage: {memory_usage} GB")

Num rows: 7658235, Memory Usage: 0.31 GB


In [9]:
# Separate the label column from the features the model will train on.
y = taxi["target"]
X = taxi.drop(columns=["target"])

In [10]:
# Random forest hyperparameters, kept together so they are easy to tweak.
rf_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "n_streams": 4,
    "verbose": 0,
}

# Build the (still untrained) cuML classifier from those settings.
rfc = RandomForestClassifier(**rf_params)

In [None]:
# Train the forest on the taxi features/labels. The return value is bound to
# `_` so the notebook does not echo it as the cell's output.
_ = rfc.fit(X, y)

In [None]:
import numpy as np
from cuml.ensemble import RandomForestClassifier as cuRFC

# Toy problem: a 10 x 4 float32 matrix of standard-normal draws with
# alternating 0/1 int32 labels.
# NOTE: this rebinds the notebook-level names X and y.
sample_shape = (10, 4)
X = np.random.normal(size=sample_shape).astype(np.float32)
y = np.asarray([0, 1] * 5, dtype=np.int32)

# Fit a small forest and predict on the very same rows it was trained on.
forest_kwargs = dict(max_features=1.0, n_bins=8, n_estimators=40)
cuml_model = cuRFC(**forest_kwargs)
cuml_model.fit(X, y)
cuml_predict = cuml_model.predict(X)

print("Predicted labels : ", cuml_predict)

In [None]:
import cuml
from cuml.datasets.classification import make_classification
from cuml.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier as cuRF
from sklearn.metrics import accuracy_score
from cupy import asnumpy

# synthetic dataset dimensions
n_samples = 1000
n_features = 10
n_classes = 2

# random forest depth and size
n_estimators = 25
max_depth = 10

# generate synthetic data [ binary classification task ]
X, y = make_classification ( n_classes = n_classes,
                             n_features = n_features,
                             n_samples = n_samples)

X_train, X_test, y_train, y_test = train_test_split( X, y, random_state = 0 )

model = cuRF( max_depth = max_depth,
              n_estimators = n_estimators,
              random_state  = 0 )

%time trained_RF = model.fit ( X_train, y_train )

predictions = model.predict ( X_test )

cu_score = cuml.metrics.accuracy_score( y_test, predictions )
sk_score = accuracy_score( asnumpy( y_test ), asnumpy( predictions ) )

In [None]:
from cuml.datasets.classification import make_classification

# Generate a tiny two-class dataset: 10 samples, 4 features, 2 of them informative.
X, y = make_classification(
    n_samples=10,
    n_features=4,
    n_informative=2,
    n_classes=2,
)

# Print the features and then the labels, each under its own heading line.
for heading, values in (("X:", X), ("y:", y)):
    print(heading)
    print(values)