In [None]:
# Part 1: Detecting Data Drift in AI/ML Models
# Objective: Understand data drift, how it affects machine learning models, and techniques to monitor it.

# Task 1: Understanding Data Drift: Study a historical dataset used in training a simple linear regression model and
# compare it with recent unseen data to detect drift.

# Task 2: Monitoring Distribution Changes: Write the code to identify features that exhibit statistical distribution differences.

# Task 3: Visualizing Data Drift: Use visualization techniques to illustrate data drift.

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt

# Part 1: Detecting Data Drift in AI/ML Models

# Task 1: Build a reference (training) sample and a deliberately shifted test sample.
# The global seed is fixed once; the four draws below must stay in this order
# for the generated arrays to be reproducible.
np.random.seed(0)

N_TRAIN = 1000
N_TEST = 300
TRUE_SLOPE = 3      # both samples share the same underlying relationship y = 3x + noise
NOISE_STD = 0.5

# Reference sample: one feature column drawn from a normal with mean 0, std 1
X_train = np.random.normal(loc=0, scale=1, size=(N_TRAIN, 1))
train_noise = np.random.normal(0, NOISE_STD, N_TRAIN)
y_train = TRUE_SLOPE * X_train[:, 0] + train_noise

# Drifted sample: same feature, but drawn with mean 0.5 and std 1.5
X_test = np.random.normal(loc=0.5, scale=1.5, size=(N_TEST, 1))
test_noise = np.random.normal(0, NOISE_STD, N_TEST)
y_test = TRUE_SLOPE * X_test[:, 0] + test_noise

# Fit an ordinary least-squares model using only the original (training) sample
model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the drifted sample and report its squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error on Drifted Data: {mse:.4f}")

# Task 2: Monitoring Distribution Changes using a two-sample KS test for feature X
ALPHA = 0.05  # significance level below which the two samples are declared different

# ks_2samp expects 1-D samples, so the single feature column is flattened first.
ks_stat, p_value = ks_2samp(X_train.ravel(), X_test.ravel())

# The p-value is printed in scientific notation: with fixed 4-decimal formatting a
# very small p-value is rendered as "0.0000", which hides its actual magnitude.
print(f"KS Statistic: {ks_stat:.4f}, P-value: {p_value:.3e}")
if p_value < ALPHA:
    print("Significant distribution difference detected: Data drift likely.")
else:
    print("No significant distribution difference detected.")

# Task 3: Visualizing Data Drift
# Side-by-side histograms of the feature in the training and drifted test samples.
train_values = X_train.ravel()
test_values = X_test.ravel()

# Use one set of bin edges for both panels (spanning the pooled range) and a shared
# x-axis, so a shift in location or spread is directly comparable between panels.
pooled_values = np.concatenate([train_values, test_values])
bin_edges = np.linspace(pooled_values.min(), pooled_values.max(), 31)  # 31 edges -> 30 bins

fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(12, 5), sharex=True)

# Distribution of training data
ax_train.hist(train_values, bins=bin_edges, alpha=0.7, label='Training Data')
ax_train.set_title('Training Data Feature Distribution')
ax_train.set_xlabel('Feature Value')
ax_train.set_ylabel('Frequency')
ax_train.legend()  # the label passed to hist() is only shown once a legend is drawn

# Distribution of test (drifted) data
ax_test.hist(test_values, bins=bin_edges, alpha=0.7, color='orange', label='Drifted Test Data')
ax_test.set_title('Drifted Test Data Feature Distribution')
ax_test.set_xlabel('Feature Value')
ax_test.set_ylabel('Frequency')
ax_test.legend()

fig.tight_layout()
plt.show()





In [None]:
# Part 2: Automating Data Quality Checks
# Objective: Use Python and data quality frameworks to automate validation.

# Task 1: Setting Up Automated Validation with Python

# Task 2: Introduction to Great Expectations: Install the great_expectations package and set up a basic project.

# Task 3: Creating Expectations with Great Expectations: Use Great Expectations to define data validation expectations for a dataset.

# Part 2: Automating Data Quality Checks

# Task 1: Basic validation using Python (example with pandas)
import pandas as pd

def basic_data_quality_checks(df, required_columns=('id', 'name', 'age')):
    """Run a few basic data-quality checks on a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect.
    required_columns : iterable of str, optional
        Column names that must all be present in ``df``. Defaults to
        ``('id', 'name', 'age')``, the schema of the sample data in this notebook.

    Returns
    -------
    dict
        Maps check name to a plain Python ``bool`` (``True`` means the check passed):

        * ``'no_missing'``    - no cell in ``df`` is null
        * ``'no_duplicates'`` - no row is an exact duplicate of an earlier row
        * ``'columns_exist'`` - every name in ``required_columns`` is a column of ``df``
    """
    # `not` on the pandas/NumPy reductions yields built-in bools, so the result
    # prints as True/False rather than a NumPy scalar repr.
    return {
        'no_missing': not df.isnull().values.any(),
        'no_duplicates': not df.duplicated().any(),
        'columns_exist': all(col in df.columns for col in required_columns),
    }

# Sample data: contains one fully duplicated row (id 3) and missing ages,
# so the duplicate and missing-value checks have something to flag.
data = {
    'id': [1, 2, 3, 3],
    'name': ['Alice', 'Bob', 'Charlie', 'Charlie'],
    'age': [25, 30, None, None],
}
df = pd.DataFrame(data)

quality_report = basic_data_quality_checks(df)
print("Basic Data Quality Checks:", quality_report)


# Task 2 & 3: Using Great Expectations for validation
# This cell uses the legacy pandas Dataset API (`ge.from_pandas`).
# NOTE(review): that API is understood to ship only with pre-1.0 releases of
# great_expectations - confirm the exact boundary against the upstream changelog.
# To run this, first install a compatible version with:
#     pip install "great_expectations<1.0"

import great_expectations as ge

# Fail early with an actionable message instead of a bare AttributeError when the
# installed release does not provide the legacy entry point used below.
if not hasattr(ge, 'from_pandas'):
    raise ImportError(
        "great_expectations "
        f"{getattr(ge, '__version__', '(unknown version)')} does not provide "
        "`from_pandas`; this cell relies on the legacy Dataset API. "
        "Try: pip install \"great_expectations<1.0\""
    )

# Wrap the pandas DataFrame so that expectation methods become available on it
ge_df = ge.from_pandas(df)

# Declare the expectations; each call is recorded on ge_df
ge_df.expect_column_to_exist('id')
ge_df.expect_column_values_to_not_be_null('id')
ge_df.expect_column_values_to_be_unique('id')
ge_df.expect_column_values_to_not_be_null('age')

# Validate the data against the recorded expectations and print the result
result = ge_df.validate()
print(result)
