# 🧠 AI-Powered Data Pipeline Auditor (Colab Edition)
This notebook lets you analyze Azure Data Factory (ADF) pipeline JSON files using an open-source LLM (TinyLlama or a similar model).

👉 Upload a pipeline JSON and get intelligent architecture feedback using an LLM.


In [None]:
# Step 1: Install required libraries
!pip install transformers accelerate

In [None]:
# Step 2: Load a lightweight LLM from HuggingFace
from transformers import pipeline

# Build a text-generation pipeline around the TinyLlama chat checkpoint.
# `llm` is called with the analysis prompt in Step 4.
llm = pipeline(
    task="text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)

In [None]:
# Step 3: Upload your ADF pipeline JSON
from google.colab import files
import json

# files.upload() returns a mapping whose keys are the uploaded file names.
uploaded = files.upload()

# If nothing was uploaded (e.g. the dialog was dismissed), fail with a clear
# message instead of an IndexError from indexing an empty list.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and select a pipeline JSON file."
    )

# Only the first uploaded file is analyzed.
file_path = next(iter(uploaded))

# "utf-8-sig" reads plain UTF-8 and also strips a leading byte-order mark,
# which json.load() would otherwise reject.
with open(file_path, encoding="utf-8-sig") as f:
    data = json.load(f)

def parse_adf_pipeline(data):
    """Extract a compact summary from an Azure Data Factory pipeline definition.

    Args:
        data: Parsed pipeline JSON (a dict). May contain a top-level ``name``
            and a ``properties`` object holding ``activities``, ``triggers``
            and ``linkedServices``.

    Returns:
        dict with the keys ``name``, ``activities`` (list of activity names),
        ``triggers`` and ``linked_services``. A missing name falls back to
        ``'Unknown Pipeline'``; missing sections fall back to empty lists.
    """
    # Tolerate a missing or null 'properties' section instead of raising.
    properties = data.get('properties') or {}
    # An activity without a 'name' gets a placeholder rather than aborting
    # the whole parse with a KeyError.
    activities = [
        activity.get('name', 'Unnamed Activity')
        for activity in properties.get('activities') or []
    ]
    return {
        'name': data.get('name', 'Unknown Pipeline'),
        'activities': activities,
        'triggers': properties.get('triggers', []),
        'linked_services': properties.get('linkedServices', []),
    }

# Summarize the uploaded pipeline. The bare name on the last line makes the
# notebook display the resulting dict as the cell output.
parsed_data = parse_adf_pipeline(data=data)
parsed_data

In [None]:
# Step 4: Prompt the LLM for analysis
# The parsed pipeline summary is embedded in the prompt as pretty-printed JSON.
prompt = f'''
You are a cloud data engineer. Analyze this Azure Data Factory pipeline:

{json.dumps(parsed_data, indent=2)}

1. Describe what the pipeline is doing.
2. Identify performance bottlenecks or risks.
3. Recommend optimization (e.g., concurrency, data partitioning).
4. Rate overall design on a scale of 1 to 10.

Respond clearly with bullet points.
'''

# return_full_text=False makes the pipeline return only the newly generated
# text; by default "generated_text" begins with a copy of the whole prompt,
# so the printed response would repeat the instructions and the JSON.
# do_sample=True means the output varies from run to run.
response = llm(
    prompt,
    max_new_tokens=300,
    do_sample=True,
    return_full_text=False,
)[0]["generated_text"]
print(response)