# Clean: Quick Start Guide

This notebook demonstrates the basic usage of Clean for data quality analysis.

In [None]:
# Install clean if needed (use %pip so the package lands in this kernel's environment)
# %pip install clean-data-quality

In [None]:
import numpy as np
import pandas as pd
from clean import DatasetCleaner

## Create Sample Data

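Let's create a synthetic dataset with three intentional quality issues: flipped labels (5% of rows), two exact copies of the first row, and two extreme feature values.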

In [None]:
# Build a small synthetic classification dataset and inject three kinds of
# quality problems into it: flipped labels, duplicated rows, and outliers.

# Tunable knobs for the synthetic data.
SEED = 42
LABEL_ERROR_RATE = 0.05  # fraction of labels flipped on purpose
n_samples = 500

# Seed the global NumPy RNG so every "Restart & Run All" yields the same data.
np.random.seed(SEED)

# Create features
df = pd.DataFrame({
    'feature_1': np.random.randn(n_samples),
    'feature_2': np.random.randn(n_samples),
    'feature_3': np.random.randn(n_samples),
    'category': np.random.choice(['A', 'B', 'C'], n_samples),
})

# Create labels with some errors: the clean label is 1 when
# feature_1 + feature_2 is positive, then a fixed fraction is flipped.
true_labels = (df['feature_1'] + df['feature_2'] > 0).astype(int)
labels = true_labels.copy()
# Derive the count from the rate so it keeps matching LABEL_ERROR_RATE when
# n_samples is changed (previously a hard-coded 25).
n_label_errors = round(LABEL_ERROR_RATE * n_samples)
error_idx = np.random.choice(n_samples, size=n_label_errors, replace=False)
labels.iloc[error_idx] = 1 - labels.iloc[error_idx]
df['label'] = labels

# Add some duplicates: rows 100 and 200 become exact copies of row 0.
# Each row is overwritten entirely (label included), so a flipped label that
# happened to land on one of these positions is replaced by row 0's label.
for duplicate_pos in (100, 200):
    df.iloc[duplicate_pos] = df.iloc[0]

# Add outliers: one extreme value in each of the first two feature columns.
# The index is the default 0..n_samples-1 range, so label 50 is position 50.
df.loc[50, 'feature_1'] = 10
df.loc[51, 'feature_2'] = -10

print(f"Dataset shape: {df.shape}")
df.head()

## Initialize DatasetCleaner

In [None]:
# Wrap the sample frame in a DatasetCleaner, naming the label column and
# declaring the task as classification.
cleaner_options = {
    'data': df,
    'label_column': 'label',
    'task': 'classification',
}
cleaner = DatasetCleaner(**cleaner_options)

# Show the cleaner's printable representation.
print(cleaner)

## Run Analysis

In [None]:
# Run the analysis; the returned report object is queried by the cells below.
report = cleaner.analyze()

## View Summary

In [None]:
# Fetch the report's summary and print it.
summary_text = report.summary()
print(summary_text)

## Explore Specific Issues

### Label Errors

In [None]:
# Pull the flagged label errors out of the report and count them.
label_errors = report.label_errors()
n_label_errors_found = len(label_errors)
print(f"Found {n_label_errors_found} label errors")

# Preview the first ten entries as the cell output.
label_errors.head(10)

### Duplicates

In [None]:
# Pull the detected duplicate pairs out of the report and count them.
duplicates = report.duplicates()
n_duplicate_pairs = len(duplicates)
print(f"Found {n_duplicate_pairs} duplicate pairs")

# Preview the first few entries as the cell output.
duplicates.head()

### Outliers

In [None]:
# Pull the detected outliers out of the report and count them.
outliers = report.outliers()
n_outliers = len(outliers)
print(f"Found {n_outliers} outliers")

# Preview the first few entries as the cell output.
outliers.head()

## Get Clean Dataset

In [None]:
# Ask the cleaner for a cleaned copy of the data, with duplicate removal
# enabled and the 'conservative' outlier-removal setting.
clean_df = cleaner.get_clean_data(remove_duplicates=True, remove_outliers='conservative')

# Compare row counts before and after cleaning.
n_original = len(df)
n_clean = len(clean_df)
print(f"Original: {n_original} samples")
print(f"Clean: {n_clean} samples")
print(f"Removed: {n_original - n_clean} samples")

## Get Review Queue

In [None]:
# Request a review queue capped at 20 items and show it as the cell output.
max_review_items = 20
review_queue = cleaner.get_review_queue(max_items=max_review_items)
review_queue

## Export Report

In [None]:
# Optional file exports; uncomment the one you need.
# Export to JSON
# report.save_json('report.json')

# Export to HTML
# report.save_html('report.html')

# Or get as dict and list its top-level keys
report_dict = report.to_dict()
report_keys = [*report_dict.keys()]
print(f"Report keys: {report_keys}")