## Build a Data Quality Dashboard

**Description**: Create a simple dashboard that displays data quality metrics using a library like `dash` or `streamlit`.

**Steps:**
1. Install Streamlit: `pip install streamlit`
2. Create a Python script named `dashboard.py` containing the code below.
3. Run the dashboard: `streamlit run dashboard.py`

In [3]:
import streamlit as st
import pandas as pd
import numpy as np
import re

# Function to validate email format
def is_valid_email(email: object) -> bool:
    """Return True if *email* looks like ``local@domain.tld``.

    This is a loose shape check (non-whitespace, an ``@``, and a dot in the
    part after it), not a full RFC 5322 validation.

    Args:
        email: The value to check. Anything that is not a ``str`` (for
            example ``None`` or ``NaN`` from a DataFrame column) is
            treated as invalid instead of raising ``TypeError``.

    Returns:
        True when the whole string matches the pattern, False otherwise.
    """
    if not isinstance(email, str):
        return False
    pattern = r'^\S+@\S+\.\S+$'
    # fullmatch, not match: with match, `$` also matches just before a
    # trailing newline, so "a@example.com\n" would be accepted.
    return re.fullmatch(pattern, email) is not None

# Page heading
st.title("📊 Data Quality Dashboard")

# Hard-coded demo records; swap for an uploaded file later.
# The sample deliberately contains a missing name and a missing age.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [25, np.nan, 30, 22, 25],
    'Email': ['a@example.com', 'b@example.com', 'c@example.com', 'd@example.com', 'a@example.com']
}

# Everything that touches the DataFrame runs under one guard so that any
# failure is reported in the page through st.error.
try:
    df = pd.DataFrame(data)

    if not df.empty:
        # --- Raw table ---
        st.subheader("📄 Raw Data")
        st.dataframe(df)

        # --- Share of null cells per column, as a percentage ---
        st.subheader("🧪 Missing Values (%)")
        null_share = df.isnull().mean()
        pct_missing = null_share * 100
        st.write(pct_missing.round(2))

        # --- Number of rows that repeat an earlier row exactly ---
        st.subheader("📎 Duplicate Row Count")
        n_duplicates = df.duplicated().sum()
        st.write(f"Duplicate rows: **{n_duplicates}**")

        # --- Email shape check; null cells count as invalid ---
        st.subheader("✅ Email Format Validity")
        if 'Email' not in df.columns:
            st.warning("Column 'Email' not found.")
        else:
            email_ok = df['Email'].apply(
                lambda cell: False if not pd.notnull(cell) else is_valid_email(cell)
            )
            bad_rows = ~email_ok
            st.write(f"Invalid email entries: **{bad_rows.sum()}**")
            st.write("Invalid rows:")
            st.dataframe(df[bad_rows])
    else:
        # Nothing to analyse
        st.warning("⚠️ The dataset is empty.")

except Exception as e:
    st.error(f"An unexpected error occurred: {e!s}")


