In [1]:
import os
from pathlib import Path

import scorer
from alert_engine import get_contract_alerts, validate_alerts
from loader import load_contract, load_dataset
from reporter import generate_markdown_report, generate_csv_report, generate_html_report
from rule_engine import get_contract_rules, validate_rules
from schema_builder import build_schema, validate_schema
from scorer import score_violation

# Loading contract and dataset

In [2]:
# Project root is configurable so the notebook is not tied to a single machine:
# set the DATA_PROFILER_ROOT environment variable to override it. The default
# is the location the notebook was originally written against.
PROJECT_ROOT = Path(os.environ.get(
    'DATA_PROFILER_ROOT',
    'C:\\Users\\Kevin\\OneDrive\\Personal\\GitHub\\data-contracts-data-profiler\\Data_Profiler_Data_Contracts',
))
DATASET_PATH = PROJECT_ROOT / 'data' / 'mock_data.csv'
CONTRACT_PATH = PROJECT_ROOT / 'contract' / 'data_contract.yaml'

# Paths are converted to str because the loaders were originally called with
# plain strings. The second argument lists 'created_at' and 'updated_at'
# (presumably the columns to parse as dates -- confirm in loader.load_dataset).
df = load_dataset(str(DATASET_PATH), ['created_at', 'updated_at'])
contract = load_contract(str(CONTRACT_PATH))


In [3]:
# Quick sanity check of the largest age in the dataset. Series.max() is
# vectorised and skips missing values, unlike the builtin max(), which
# iterates element by element in Python.
df['age'].max()

200

In [4]:
# Display the loaded contract object so its contents can be inspected.
contract

{'dataset_name': 'mock_data',
 'owner': 'data-team@owner.com',
 'refresh_frequency': 'daily',
 'description': 'This is a mock dataset for testing purposes.',
 'fields': [{'name': 'id',
   'type': 'integer',
   'description': 'Unique identifier for each record.',
   'nullable': False,
   'pii': False},
  {'name': 'first_name',
   'type': 'string',
   'description': 'Name of the entity.',
   'nullable': False,
   'pii': True},
  {'name': 'last_name',
   'type': 'string',
   'description': 'Last name of the entity.',
   'nullable': False,
   'pii': True},
  {'name': 'gender',
   'type': 'string',
   'description': 'Gender of the entity.',
   'nullable': False,
   'pii': True},
  {'name': 'ip_address',
   'type': 'string',
   'description': 'IP address of the entity.',
   'nullable': False,
   'pii': True},
  {'name': 'weight',
   'type': 'float',
   'description': 'Weight of the entity.',
   'nullable': False,
   'pii': False},
  {'name': 'age',
   'type': 'integer',
   'description': 'Ag

# Schema validation

In [5]:
# Build the schema from the contract and show it. print() is used so the
# schema's str() form is rendered rather than the cell's default repr.
print(build_schema(contract))

<Schema DataFrameSchema(
    columns={
        'id': <Schema Column(name=id, type=DataType(int64))>
        'first_name': <Schema Column(name=first_name, type=DataType(str))>
        'last_name': <Schema Column(name=last_name, type=DataType(str))>
        'gender': <Schema Column(name=gender, type=DataType(str))>
        'ip_address': <Schema Column(name=ip_address, type=DataType(str))>
        'weight': <Schema Column(name=weight, type=DataType(float64))>
        'age': <Schema Column(name=age, type=DataType(int64))>
        'salary': <Schema Column(name=salary, type=DataType(float64))>
        'country': <Schema Column(name=country, type=DataType(str))>
        'monthly_savings': <Schema Column(name=monthly_savings, type=DataType(float64))>
        'marital_status': <Schema Column(name=marital_status, type=DataType(str))>
        'role': <Schema Column(name=role, type=DataType(str))>
        'height': <Schema Column(name=height, type=DataType(float64))>
        'education_level': <Sc

In [6]:
# Validate the dataset against the contract; the value returned by
# validate_schema is displayed as the cell output.
validate_schema(df, contract)

'Schema validation passed'

# Rules validation

In [7]:
# Collect the rule definitions from the contract; they are passed to
# validate_rules in the next cell.
rules = get_contract_rules(contract)
# End on the bare expression (as the other cells do) so the list is
# pretty-printed instead of being dumped on a single line by print().
rules

[{'name': 'valid_age_range', 'description': 'Ensure that the age field contains valid values.', 'severity': 'medium', 'severity_point': 3, 'type': 'range_check', 'fields': ['age'], 'min_value': 0, 'max_value': 120}, {'name': 'valid_weight_range', 'description': 'Ensure that the weight field contains valid values.', 'severity': 'medium', 'severity_point': 3, 'type': 'range_check', 'fields': ['weight'], 'min_value': 0, 'max_value': 500}, {'name': 'valid_height_range', 'description': 'Ensure that the height field contains valid values.', 'severity': 'medium', 'severity_point': 3, 'type': 'range_check', 'fields': ['height'], 'min_value': 0, 'max_value': 9}, {'name': 'valid_email_format', 'description': 'Ensure that the email field contains valid email addresses.', 'severity': 'medium', 'severity_point': 3, 'type': 'regex_check', 'fields': ['email'], 'regex': '^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'}, {'name': 'valid_ip_format', 'description': 'Ensure that the IP address field co

In [8]:
# Check the dataset against the contract rules. The result is kept because
# the scorer and report cells below consume it.
rules_violations = validate_rules(df, rules)
# Display the violations found.
rules_violations

[{'field': 'age',
  'rule': 'range_check',
  'severity': 'medium',
  'severity_points': 3,
  'min_value': 0,
  'max_value': 120,
  'actual_value': np.int64(200),
  'error': 'max_value',
  'timestamp': datetime.datetime(2025, 5, 11, 21, 22, 44, 981983)},
 {'field': 'height',
  'rule': 'range_check',
  'severity': 'medium',
  'severity_points': 3,
  'min_value': 0,
  'max_value': 9,
  'actual_value': np.float64(11.0),
  'error': 'max_value',
  'timestamp': datetime.datetime(2025, 5, 11, 21, 22, 44, 983019)},
 {'field': 'ip_address',
  'rule': 'regex_check',
  'severity': 'low',
  'severity_points': 1,
  'regex': '^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$',
  'actual_value': 2    252/30.214-249
  Name: ip_address, dtype: object,
  'error': 'regex_mismatch',
  'timestamp': datetime.datetime(2025, 5, 11, 21, 22, 44, 989026)},
 {'field': 'email',
  'rule': 'null_check',
 

# Alerts validation

In [9]:
# Collect the alert definitions from the contract.
# NOTE: the next cell reassigns `alerts` to the result of validate_alerts.
alerts = get_contract_alerts(contract)
# Display the alert definitions.
alerts

[{'name': 'completness_check',
  'description': 'Ensure that no more than 5% of the records should be missing values in nullable fields.',
  'severity': 'high',
  'type': 'completeness_check',
  'threshold': 0.05,
  'fields': ['salary',
   'monthly_savings',
   'role',
   'education_level',
   'years_experience']},
 {'name': 'tags',
  'description': 'Ensure that the dataset is tagged correctly.',
  'severity': 'high',
  'type': 'tag_check',
  'tags': ['pii'],
  'fields': ['id', 'first_name', 'last_name', 'email', 'ip_address']}]

In [10]:
# Re-derive the alert definitions from the contract instead of reading
# `alerts`: this cell rebinds `alerts` to the validation result, so reading it
# on a second run would pass the violations back in as definitions.
alert_violations = validate_alerts(df, contract, get_contract_alerts(contract))
# The HTML report cell reads the violations under the name `alerts`.
alerts = alert_violations
alerts

[{'field': 'education_level',
  'alert': 'completeness_check',
  'severity': 'high',
  'column': 'education_level',
  'threshold': 0.05,
  'null_ratio': np.float64(0.102),
  'timestamp': datetime.datetime(2025, 5, 11, 21, 22, 45, 20252)},
 {'field': 'id',
  'alert': 'tag_check',
  'severity': 'high',
  'column': 'id',
  'tags': ['pii'],
  'timestamp': datetime.datetime(2025, 5, 11, 21, 22, 45, 20252)}]

# Scorer

In [11]:
# Call the scorer through its module: the result is stored under
# `score_violation`, which shadows the function imported in the first cell,
# so calling the bare name would fail when this cell is run a second time.
violation_score = scorer.score_violation(rules_violations, threshold=5)
# The report cells below read the score under this name.
score_violation = violation_score

# Report

In [12]:
# Produce the Markdown report from the rule violations and their score.
# `score_violation` here is the value assigned in the scorer cell, not the
# function imported from scorer.
# NOTE(review): presumably writes a report file -- confirm in reporter.
generate_markdown_report(rules_violations,score_violation)

In [13]:
# Produce the CSV report from the rule violations.
# NOTE(review): presumably writes a report file -- confirm in reporter.
generate_csv_report(rules_violations)

In [14]:
# Produce the HTML report from the rule violations, their score and the
# alerts. At this point `alerts` holds the validate_alerts result (reassigned
# in the alerts validation cell), and `score_violation` is the computed score.
# NOTE(review): presumably writes a report file -- confirm in reporter.
generate_html_report(rules_violations,score_violation,alerts)