Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add skip_noisy_assignment to dataset.cluster #1194

Merged
merged 3 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 29 additions & 10 deletions lilac/data/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
UMAP_DIM = 5
UMAP_SEED = 42
HDBSCAN_SELECTION_EPS = 0.05
BATCH_SOFT_CLUSTER_NOISE = 1024
BATCH_SOFT_CLUSTER_NOISE = 512


def cluster_impl(
Expand All @@ -68,6 +68,7 @@ def cluster_impl(
task_id: Optional[TaskId] = None,
recompute_titles: bool = False,
batch_size_titling: Optional[int] = None,
skip_noisy_assignment: bool = False,
dsmilkov marked this conversation as resolved.
Show resolved Hide resolved
) -> None:
"""Compute clusters for a field of the dataset."""
topic_fn = topic_fn or generate_title_openai
Expand Down Expand Up @@ -108,6 +109,13 @@ def cluster_impl(
else:
raise ValueError('input must be provided.')

if use_garden and skip_noisy_assignment:
raise ValueError(
'`use_garden` and `skip_noisy_assignment` cannot both be True. '
'The garden implementation is heavily optimized and will always '
'assign noisy points to the nearest cluster.'
)

# Extract the text from the input path into a temporary column.
TEXT_COLUMN = 'text'
temp_text_path = (*cluster_output_path, TEXT_COLUMN)
Expand Down Expand Up @@ -154,7 +162,12 @@ def cluster_documents(items: Iterator[Item]) -> Iterator[Item]:
cluster_items = sparse_to_dense_compute(
docs,
lambda x: _hdbscan_cluster(
x, min_cluster_size, use_garden, num_docs=total_len, task_info=task_info
x,
min_cluster_size,
use_garden,
num_docs=total_len,
task_info=task_info,
skip_noisy_assignment=skip_noisy_assignment,
),
)
for item, cluster_item in zip(items2, cluster_items):
Expand Down Expand Up @@ -208,7 +221,13 @@ def cluster_titles(items: Iterator[Item]) -> Iterator[Item]:
items, items2 = itertools.tee(items)
docs = (item.get(CLUSTER_TITLE) for item in items)
cluster_items = sparse_to_dense_compute(
docs, lambda x: _hdbscan_cluster(x, MIN_CLUSTER_SIZE_CATEGORY, use_garden)
docs,
lambda x: _hdbscan_cluster(
x,
MIN_CLUSTER_SIZE_CATEGORY,
use_garden=use_garden,
skip_noisy_assignment=skip_noisy_assignment,
),
)
for item, cluster_item in zip(items2, cluster_items):
item[CATEGORY_ID] = (cluster_item or {}).get(CLUSTER_ID, -1)
Expand Down Expand Up @@ -298,6 +317,7 @@ def _hdbscan_cluster(
use_garden: bool = False,
num_docs: Optional[int] = None,
task_info: Optional[TaskInfo] = None,
skip_noisy_assignment: bool = False,
) -> Iterator[Item]:
"""Cluster docs with HDBSCAN."""
if use_garden:
Expand Down Expand Up @@ -338,9 +358,9 @@ def _hdbscan_cluster(
from umap import UMAP

dim = all_vectors[0].size
with DebugTimer(f'UMAP: Reducing dim from {dim} to {UMAP_DIM} of {len(all_vectors)} vectors'):
n_neighbors = min(30, len(all_vectors) - 1)
if UMAP_DIM < dim and UMAP_DIM < len(all_vectors):
n_neighbors = min(30, len(all_vectors) - 1)
if UMAP_DIM < dim and UMAP_DIM < len(all_vectors):
with DebugTimer(f'UMAP: Reducing dim from {dim} to {UMAP_DIM} of {len(all_vectors)} vectors'):
reducer = UMAP(
n_components=UMAP_DIM,
n_neighbors=n_neighbors,
Expand Down Expand Up @@ -375,14 +395,13 @@ def _hdbscan_cluster(
if cluster_id == -1:
noisy_vectors.append(all_vectors[i])
num_noisy = len(noisy_vectors)
perc_noisy = 100 * num_noisy / len(clusterer.labels_)
log(f'{num_noisy} noise points ({perc_noisy:.1f}%) will be assigned to nearest cluster.')

noisy_labels: list[np.ndarray] = []
noisy_probs: list[np.ndarray] = []
labels = clusterer.labels_
memberships = clusterer.probabilities_
if num_noisy > 0 and num_noisy < len(clusterer.labels_):
if not skip_noisy_assignment and num_noisy > 0 and num_noisy < len(clusterer.labels_):
perc_noisy = 100 * num_noisy / len(clusterer.labels_)
log(f'{num_noisy} noise points ({perc_noisy:.1f}%) will be assigned to nearest cluster.')
with DebugTimer('HDBSCAN: Computing membership for the noise points'):
for batch_noisy_vectors in chunks(noisy_vectors, BATCH_SOFT_CLUSTER_NOISE):
batch_noisy_vectors = np.array(batch_noisy_vectors, dtype=np.float32)
Expand Down
94 changes: 92 additions & 2 deletions lilac/data/clustering_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ def compute(docs: list[str]) -> list[Item]:
if 'summar' in doc or 'hello' in doc or 'greeting' in doc:
result.append([chunk_embedding(0, len(doc), np.array([1, 1, 1]))])
elif 'simpl' in doc or 'whats' in doc or 'time' in doc:
result.append([chunk_embedding(0, len(doc), np.array([0, 0, 0]))])
result.append([chunk_embedding(0, len(doc), np.array([-1, -1, -1]))])
else:
result.append([chunk_embedding(0, len(doc), np.array([0.5, 0.5, 0.5]))])
result.append([chunk_embedding(0, len(doc), np.array([100, 0, -100]))])
return result

mocker.patch.object(JinaV2Small, 'compute', side_effect=compute)
Expand Down Expand Up @@ -718,3 +718,93 @@ def topic_fn(docs: list[tuple[str, float]]) -> str:
},
},
]


def test_clusters_skip_noisy_assignment(
  make_test_data: TestDataMaker, mocker: MockerFixture
) -> None:
  """With skip_noisy_assignment=True, noise points keep cluster_id -1 and get no title."""
  texts: list[str] = [
    'Can you summarize this article',
    'Can you rewrite this in a simpler way',
    'Can you provide a short summary of the following text',
    'Can you simplify this text',
    'Hello world',
  ]
  dataset = make_test_data([{'text': t} for t in texts])

  def topic_fn(docs: list[tuple[str, float]]) -> str:
    # Title each cluster from its first (highest-membership) doc.
    first_doc = docs[0][0]
    if 'summar' in first_doc:
      return 'summarization'
    if 'simpl' in first_doc:
      return 'simplification'
    return 'other'

  mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2)
  _mock_jina(mocker)

  dataset.cluster(
    'text',
    min_cluster_size=2,
    topic_fn=topic_fn,
    category_fn=lambda _: 'MockCategory',
    skip_noisy_assignment=True,
  )

  def _expected_row(text, cluster_id, title):
    # Build the expected row; cluster_id == -1 marks an unassigned noise point.
    is_noise = cluster_id == -1
    return {
      'text': text,
      'text__cluster': {
        'cluster_id': cluster_id,
        'cluster_membership_prob': 0.0 if is_noise else 1.0,
        'cluster_title': title,
        'category_id': cluster_id,
        'category_membership_prob': 0.0 if is_noise else 1.0,
        'category_title': None if is_noise else 'MockCategory',
      },
    }

  rows = list(dataset.select_rows(['text', 'text__cluster'], combine_columns=True))
  assert rows == [
    _expected_row('Can you summarize this article', 0, 'summarization'),
    _expected_row('Can you rewrite this in a simpler way', 1, 'simplification'),
    _expected_row('Can you provide a short summary of the following text', 0, 'summarization'),
    _expected_row('Can you simplify this text', 1, 'simplification'),
    # 'Hello world' is noise and stays unassigned because skip_noisy_assignment=True.
    _expected_row('Hello world', -1, None),
  ]
4 changes: 4 additions & 0 deletions lilac/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,7 @@ def cluster(
task_id: Optional[TaskId] = None,
# TODO(0.4.0): colocate with topic_fn.
category_fn: Optional[TopicFn] = None,
skip_noisy_assignment: bool = False,
dsmilkov marked this conversation as resolved.
Show resolved Hide resolved
) -> None:
"""Compute clusters for a field of the dataset.

Expand All @@ -524,6 +525,9 @@ def cluster(
of the task.
category_fn: A function that returns a category for a set of related titles. It takes a list
of (doc, membership_score) tuples and returns a single category name.
skip_noisy_assignment: If true, noisy points will not be assigned to the nearest cluster.
This only has an effect when the clustering is done locally (use_garden=False) and will
speed up clustering.

"""
pass
Expand Down
3 changes: 3 additions & 0 deletions lilac/data/dataset_duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3334,6 +3334,7 @@ def cluster(
use_garden: bool = False,
task_id: Optional[TaskId] = None,
category_fn: Optional[TopicFn] = cluster_titling.generate_category_openai,
skip_noisy_assignment: bool = False,
) -> None:
topic_fn = topic_fn or cluster_titling.generate_title_openai
category_fn = category_fn or cluster_titling.generate_category_openai
Expand All @@ -3347,6 +3348,7 @@ def cluster(
overwrite=overwrite,
use_garden=use_garden,
task_id=task_id,
skip_noisy_assignment=skip_noisy_assignment,
)

@override
Expand Down Expand Up @@ -3950,6 +3952,7 @@ def _auto_bins(stats: StatsResult) -> list[Bin]:
return [('0', const_val, None)]

is_integer = stats.value_samples and all(isinstance(val, int) for val in stats.value_samples)

def _round(value: float) -> float:
# Select a round ndigits as a function of the value range. We offset it by 2 to allow for some
# decimal places as a function of the range.
Expand Down
6 changes: 6 additions & 0 deletions lilac/router_dataset_signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ class ClusterOptions(BaseModel):
use_garden: bool = PydanticField(
default=False, description='Accelerate computation by running remotely on Lilac Garden.'
)
skip_noisy_assignment: bool = PydanticField(
default=False,
description='Skip assignment of noisy points to the nearest cluster to speed up clustering.',
)

overwrite: bool = False


Expand Down Expand Up @@ -145,6 +150,7 @@ def run() -> None:
use_garden=options.use_garden,
overwrite=options.overwrite,
task_id=task_id,
skip_noisy_assignment=options.skip_noisy_assignment,
)

launch_task(task_id, run)
Expand Down
18 changes: 17 additions & 1 deletion web/blueprint/src/lib/components/ComputeClusterModal.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
input: Path;
output_path?: Path;
use_garden?: boolean;
skip_noisy_assignment?: boolean;
overwrite?: boolean;
};

Expand Down Expand Up @@ -101,7 +102,8 @@
use_garden: options.use_garden,
output_path: outputColumn,
input_selector: selectedFormatSelector,
overwrite: options.overwrite
overwrite: options.overwrite,
skip_noisy_assignment: options.skip_noisy_assignment
}
]);
close();
Expand Down Expand Up @@ -173,6 +175,20 @@
</div>
{/if}
</div>

<div>
<div class="label mb-2 font-medium text-gray-700">Skip noisy assignment</div>
<div class="label text-sm text-gray-700">
Skip assignment of noisy points to the nearest cluster to speed up clustering.
</div>
<Toggle
labelA={'False'}
labelB={'True'}
bind:toggled={options.skip_noisy_assignment}
hideLabel
/>
</div>

<div>
<div class="label text-s mb-2 font-medium text-gray-700">Overwrite</div>
<Toggle labelA={'False'} labelB={'True'} bind:toggled={options.overwrite} hideLabel />
Expand Down
4 changes: 4 additions & 0 deletions web/lib/fastapi_client/models/ClusterOptions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ export type ClusterOptions = {
* Accelerate computation by running remotely on Lilac Garden.
*/
use_garden?: boolean;
/**
* Skip assignment of noisy points to the nearest cluster to speed up clustering.
*/
skip_noisy_assignment?: boolean;
overwrite?: boolean;
};

Loading