# Default notebook

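This notebook walks through duplicated-table detection (lineage extraction, schema embedding, similarity search, and consolidation recommendations). It is executed by the Databricks Workflows job defined in `resources/lineage_ops.job.yml`.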

In [None]:
%pip install umap-learn

In [2]:
%load_ext autoreload
%autoreload 2

In [0]:
# --- Analysis scope ---
# Catalog to analyze; use None to analyze all catalogs (e.g., "prod_catalog").
CATALOG_FILTER = "hurcy"

# Lineage query period, in days.
DAYS_BACK = 30

# --- Similarity settings ---
# Similarity threshold (0.8 = only pairs with 80%+ similarity are kept).
SIMILARITY_THRESHOLD = 0.8

# Embedding provider: "databricks", "openai", or "simple".
EMBEDDING_PROVIDER = "databricks"

# --- Output location ---
# Catalog and schema where results are stored.
RESULT_CATALOG, RESULT_SCHEMA = "hurcy", "analysis"


In [None]:
# import sys
# sys.path.append("/Workspace/Users/cinyoung.hur@databricks.com/.bundle/lineage_ops/dev/files/src")


In [None]:
from lineage_ops.main import DuplicatedTableDetector, LineageOpsConfig

# Bundle the notebook-level parameters into a single configuration object.
config = LineageOpsConfig(
    catalog_filter=CATALOG_FILTER,
    similarity_threshold=SIMILARITY_THRESHOLD,
    embedding_provider=EMBEDDING_PROVIDER,
    days_back=DAYS_BACK,
)



In [None]:
# Initialize the detector from the notebook's `spark` object and the config built above
detector = DuplicatedTableDetector(spark, config)

# Extract lineage information, print the row count, and preview the first 10 rows
lineage_df = detector.extract_lineage()
print(f"Lineage events: {lineage_df.count():,}")
display(lineage_df.limit(10))

In [None]:
# Find common ancestors.
# NOTE: this cell rebinds `lineage_df` (previously the result of
# detector.extract_lineage()) to the output of LineageExtractor.get_table_lineage().
from lineage_ops.data_extractor import LineageExtractor

lineage_extractor = LineageExtractor(spark)
# Use the notebook-level DAYS_BACK parameter rather than a hard-coded 30 so that
# changing the lineage query period in the parameters cell also applies here.
lineage_df = lineage_extractor.get_table_lineage(days_back=DAYS_BACK)
common_ancestors = lineage_extractor.find_common_ancestors(lineage_df)
display(common_ancestors)

In [None]:
# Extract schema texts, print how many rows (tables) were returned, and preview 5 of them
schema_texts_df = detector.extract_schema_texts()
print(f"Tables to analyze: {schema_texts_df.count():,}")
display(schema_texts_df.limit(5))

### 3.2 Schema Embedding

In [None]:
# Embed the schema texts using the Databricks Foundation Model provider
from lineage_ops.schema_embedder import DatabricksFoundationModelProvider, SchemaEmbedder

# The provider is pointed at the "databricks-bge-large-en" endpoint
provider = DatabricksFoundationModelProvider(endpoint_name="databricks-bge-large-en")
embedder = SchemaEmbedder(spark, provider)
# Produces embeddings from schema_texts_df (extracted in an earlier cell)
embeddings_df = embedder.embed_schema_texts(schema_texts_df)
display(embeddings_df)

#### Similarity Search and Clustering

In [None]:
from lineage_ops.similarity_analyzer import SimilarityAnalyzer
analyzer = SimilarityAnalyzer(spark)
# Alternative (disabled): compute cosine similarity directly on the embeddings.
# Uncomment to use it instead of the LSH-based search.
# similar_pairs = analyzer.compute_cosine_similarity(embeddings_df, similarity_threshold=0.8)
# display(similar_pairs)

In [None]:
# Or use LSH for efficient large-scale search.
# NOTE(review): no threshold is passed here, so SIMILARITY_THRESHOLD is not applied
# by this call — confirm the default used by compute_similarity_with_lsh.
similar_pairs = analyzer.compute_similarity_with_lsh(embeddings_df)
display(similar_pairs)

### 3.3 Duplicate Candidate Detection

In [None]:
# Find duplicate candidates via the detector, print the number of candidate pairs,
# and show the full result
candidates_df = detector.find_duplicate_candidates()
print(f"Duplicate candidate pairs: {candidates_df.count():,}")
display(candidates_df)

### 3.4 Generate Consolidation Recommendations

In [None]:
# Generate consolidation recommendations via the detector and show them;
# recommendations_df is reused by the summary cell below
recommendations_df = detector.generate_recommendations()
display(recommendations_df)

## 4. Results Summary

In [None]:
from lineage_ops.recommendation_generator import RecommendationGenerator


def _print_summary_report(report):
    """Print the headline figures and confidence distribution of a summary report.

    Reads the 'summary' and 'confidence_distribution' entries of ``report``
    and writes one formatted line per figure to stdout.
    """
    totals = report['summary']
    banner = "=" * 60

    print(banner)
    print("Analysis Results Summary")
    print(banner)
    print(f"Duplicate table pairs found: {totals['total_duplicate_pairs_found']:,}")
    print(f"Estimated monthly DBU savings: ${totals['estimated_monthly_dbu_savings_usd']:,.2f}")
    print(f"Estimated storage savings: {totals['estimated_storage_savings_gb']:,.2f} GB")
    print(f"Pipelines to remove: {totals['total_pipelines_to_remove']:,}")
    print(f"Average similarity score: {totals['average_similarity_score']:.2%}")
    print()
    print("Confidence distribution:")
    for level, count in report['confidence_distribution'].items():
        print(f"  {level}: {count:,}")


# Build the summary report from the recommendations generated earlier, then print it.
generator = RecommendationGenerator(spark)
summary = generator.generate_summary_report(recommendations_df)
_print_summary_report(summary)

In [None]:
# Run the full analysis through the detector with visualization enabled
results = detector.run_full_analysis(include_visualization=True)

# Check the results: display the visualization stored under results["results"]
results["results"]["visualization"].display()

## 5. Save Results (Optional)

In [None]:

# Save results to Delta tables in the catalog/schema chosen in the parameters cell
# (RESULT_CATALOG and RESULT_SCHEMA)
detector.save_results(RESULT_CATALOG, RESULT_SCHEMA)

## 6. Detailed Analysis Example

In [None]:

# Detailed analysis of the most similar table pair.
from lineage_ops.data_extractor import SchemaExtractor

# Sort explicitly by similarity: first() on its own returns whichever row happens
# to come first, which is the most similar pair only if candidates_df is already
# ordered by cosine_similarity.
top_candidate = candidates_df.orderBy("cosine_similarity", ascending=False).first()

if top_candidate is None:
    # first() returns None when the DataFrame has no rows.
    print("No duplicate candidates found.")
else:
    print(f"Table A: {top_candidate['table_a']}")
    print(f"Table B: {top_candidate['table_b']}")
    print(f"Similarity score: {top_candidate['cosine_similarity']:.2%}")
    print(f"Confidence: {top_candidate['confidence_level']}")

    # Detailed column comparison: fetch and show column metadata for both tables.
    extractor = SchemaExtractor(spark)

    cols_a = extractor.get_column_metadata(top_candidate['table_a'])
    cols_b = extractor.get_column_metadata(top_candidate['table_b'])

    print(f"\n{top_candidate['table_a']} columns:")
    display(cols_a)

    print(f"\n{top_candidate['table_b']} columns:")
    display(cols_b)

