# In [None]:
import pandas as pd
import re
from dateutil.parser import parse

# 1. Missing Data

def count_missing_emails(df):
    """Return how many rows of *df* have a missing value in ``email``."""
    email_is_missing = df['email'].isna()
    return email_is_missing.sum()

def missing_transaction_date_percentage(df):
    """Return the percentage of rows whose ``transaction_date`` is missing.

    Returns ``0`` when *df* has no rows, so the division is never attempted
    on an empty frame.
    """
    row_count = len(df)
    null_count = df['transaction_date'].isna().sum()
    if not row_count:
        return 0
    return (null_count / row_count) * 100

def missing_departments(df):
    """Return the rows of *df* whose ``department`` value is missing."""
    department_is_missing = df['department'].isna()
    return df.loc[department_is_missing]


# 2. Duplicate Data

def count_duplicate_customers(df):
    """Return the number of rows that repeat an earlier row in every column.

    The first occurrence of each row is not counted; only its later copies are.
    """
    is_repeat = df.duplicated()
    return is_repeat.sum()

def duplicated_supplier_names(df):
    """Return every row whose ``supplier_name`` appears more than once.

    All occurrences of a repeated name are returned, including the first.
    """
    shares_name = df.duplicated(subset='supplier_name', keep=False)
    return df.loc[shares_name]

def duplicate_product_ids(df):
    """Return every row whose ``product_id`` appears more than once.

    All occurrences of a repeated id are returned, including the first.
    """
    shares_id = df.duplicated(subset='product_id', keep=False)
    return df.loc[shares_id]


# 3. Inconsistent Formatting

def check_inconsistent_dates(date_series):
    """Return the values of *date_series* that dateutil cannot parse as dates.

    Each value is passed to ``dateutil.parser.parse`` with ``dayfirst=False``.
    Values that fail to parse are collected in iteration order, duplicates
    included.

    Args:
        date_series: Iterable of candidate date values (typically strings).

    Returns:
        list: The values for which parsing failed. Values that are neither
        strings nor character streams (e.g. ``None`` or ``NaN``) are included
        because the parser rejects them with ``TypeError``.
    """
    inconsistent = []
    for value in date_series:
        try:
            parse(value, dayfirst=False)
        except (TypeError, ValueError, OverflowError):
            # Only the failures dateutil documents are treated as "bad date":
            # ValueError (incl. ParserError) for unknown formats, TypeError
            # for non-string input, OverflowError for out-of-range dates.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            inconsistent.append(value)
    return inconsistent

def inconsistent_phone_numbers(series):
    """Return the entries of *series* that do not match the standard pattern.

    Every value is converted to ``str`` before matching. The pattern accepts
    an optional leading ``+``, an optional ``1``, then 10 to 15 digits.
    """
    # Example standard pattern
    phone_regex = re.compile(r'^\+?1?\d{10,15}$')
    as_text = series.astype(str)
    looks_standard = as_text.str.match(phone_regex)
    return series.loc[~looks_standard]

def inconsistent_states(series, valid_states=None):
    """Return the distinct values of *series* that are not valid states.

    Args:
        series: pandas Series of state codes.
        valid_states: Optional set or list-like of accepted values. When
            omitted, ``{'CA', 'NY', 'TX', 'FL'}`` is used, so existing
            single-argument callers get the same result as before.

    Returns:
        The unique values of *series* not found in *valid_states*, in order
        of first appearance (as returned by ``Series.unique``).
    """
    if valid_states is None:
        # Default kept for backward compatibility; pass valid_states to
        # check against a different list.
        valid_states = {'CA', 'NY', 'TX', 'FL'}
    is_valid = series.isin(valid_states)
    return series[~is_valid].unique()


# 4. Data Drift

def revenue_drift(df):
    """Return the mean ``revenue`` for each distinct ``month`` value."""
    revenue_by_month = df.groupby('month')['revenue']
    return revenue_by_month.mean()

def engagement_by_quarter(df):
    """Return the mean engagement score for each quarter.

    The quarter is derived from the ``date`` column and used only as a
    grouping key, so the caller's DataFrame is left unmodified (no
    ``quarter`` column is added to it).

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can
            convert and an ``engagement_score`` column.

    Returns:
        pandas.Series: Mean ``engagement_score`` indexed by quarterly period;
        the index is named ``quarter``.
    """
    # Build the grouping key as a standalone Series that shares df's index,
    # instead of writing it back into df.
    quarter = pd.to_datetime(df['date']).dt.to_period('Q').rename('quarter')
    return df['engagement_score'].groupby(quarter).mean()

def detect_stock_anomalies(df):
    """Return the rows of *df* whose ``price`` is an IQR outlier.

    A price is an outlier when it lies more than 1.5 times the interquartile
    range below the first quartile or above the third quartile.
    """
    price = df['price']
    lower_quartile = price.quantile(0.25)
    upper_quartile = price.quantile(0.75)
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - 1.5 * spread
    upper_fence = upper_quartile + 1.5 * spread
    is_outlier = price.lt(lower_fence) | price.gt(upper_fence)
    return df.loc[is_outlier]
