In [1]:
import os

import cupy as cp
import pandas as pd
import torch

import cudf
import dask_cudf

# Seed every RNG this notebook draws from so a Restart & Run All regenerates
# the same data. The coordinate columns are sampled with torch, so seeding
# CuPy alone does not make them reproducible.
SEED = 12
torch.manual_seed(SEED)  # seeds the CPU and CUDA generators
cp.random.seed(SEED)

#### Portions of this were borrowed and adapted from the
#### cuDF cheatsheet, existing cuDF documentation,
#### and 10 Minutes to Pandas.

In [2]:
import torch
import cudf
import timeit
from cuml.dask.neighbors import NearestNeighbors
from cuml.common.device_selection import using_device_type, set_global_device_type, get_global_device_type

In [3]:
import time

from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

# Start a local Dask-CUDA cluster with default settings.
cluster = LocalCUDACluster()

# Connect a client to that cluster; later cells pass `client` to cuML.
client = Client(address=cluster)



In [4]:
# Generate a "coordinate" dataframe: n_points rows with x / y / z columns,
# each sampled uniformly from [0, 1) directly on the GPU.
n_points = 32000
df = cudf.DataFrame()
for axis in ("x", "y", "z"):
    # torch.rand(..., device="cuda") replaces the legacy
    # torch.cuda.FloatTensor(n).uniform_() constructor; it yields the same
    # float32 uniform samples without first allocating an uninitialised tensor.
    df[axis] = torch.rand(n_points, dtype=torch.float32, device="cuda")

In [5]:
# Wrap the cuDF frame in a single-partition Dask-cuDF collection and pin it
# in worker memory.
# NOTE(review): with npartitions=1 all rows sit in one partition; raise it if
# the data should be spread across several GPUs.
ddf = dask_cudf.from_cudf(df, npartitions=1).persist()

# persist() returns immediately. Block until the partition is materialised so
# the timed cells that follow measure only fit/kneighbors, not data movement.
wait(ddf)

# Last expression of the cell, so the preview is actually rendered.
ddf.head()

In [11]:
n_neighbors = 16

# Distributed cuML k-NN estimator bound to the running Dask client,
# left on its default algorithm.
nn = NearestNeighbors(client=client, n_neighbors=n_neighbors)

# Time the fit and the self-query (each point against the full set) together.
start_time = timeit.default_timer()
nn.fit(ddf)
distances, indices = nn.kneighbors(ddf)
end_time = timeit.default_timer()

execution_time = end_time - start_time
print(f"auto Execution time: {execution_time} seconds")

auto Execution time: 0.863862573998631 seconds


In [12]:
# Same benchmark as the default-algorithm cell, requesting algorithm='rbc'.
# The client is passed explicitly so this estimator is bound to the same
# cluster as `nn` instead of relying on the implicit default client.
# NOTE(review): confirm that the distributed (cuml.dask) estimator honours
# `algorithm`; the recorded 'rbc' and 'ivfflat' timings are nearly identical,
# so the kwarg may be ignored -- TODO verify against the cuML docs.
nn2 = NearestNeighbors(n_neighbors=n_neighbors, algorithm='rbc', client=client)

start_time = timeit.default_timer()

# Fit the model with the input data
nn2.fit(ddf)

# Get the nearest neighbors of every point in the fitted set
distances, indices = nn2.kneighbors(ddf)

end_time = timeit.default_timer()
execution_time = end_time - start_time
print(f"rbc Execution time: {execution_time} seconds")

rbc Execution time: 0.6439342650010076 seconds


In [13]:
# Same benchmark as the default-algorithm cell, requesting algorithm='ivfflat'.
# The client is passed explicitly so this estimator is bound to the same
# cluster as `nn` instead of relying on the implicit default client.
# NOTE(review): confirm that the distributed (cuml.dask) estimator honours
# `algorithm`; the recorded 'rbc' and 'ivfflat' timings are nearly identical,
# so the kwarg may be ignored -- TODO verify against the cuML docs.
nn3 = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ivfflat', client=client)

start_time = timeit.default_timer()

# Fit the model with the input data
nn3.fit(ddf)

# Get the nearest neighbors of every point in the fitted set
distances, indices = nn3.kneighbors(ddf)

end_time = timeit.default_timer()
execution_time = end_time - start_time
print(f"ivfflat Execution time: {execution_time} seconds")

ivfflat Execution time: 0.6513749460027611 seconds


In [None]:
# # Create a cuML NearestNeighbors model
# nn4 = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ivfpq')
# # Compute the Dask DataFrame
# ddf_computed = ddf.compute()

# start_time = timeit.default_timer()



# # Fit the model with the input data
# nn4.fit(ddf_computed)

# # Get the nearest neighbors
# distances, indices = nn4.kneighbors(ddf_computed)

# end_time = timeit.default_timer()
# execution_time = end_time - start_time
# print(f"ivfpq Execution time: {execution_time} seconds")

In [None]:
client.close()

In [None]:
ddf = ddf.persist()
ddf