In [None]:
# Data Drift Impact on Model
# Question: Use a simple linear regression model to demonstrate how data drift affects model predictions.

# 1. Train a model on the original data:
# 2. Evaluate on the drifted data:
# 3. Compare errors:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Step 1: Train a model on the original data
# Seed NumPy's global RNG so every random draw below is repeatable.
np.random.seed(0)
n_samples = 100

X_train = np.random.rand(n_samples, 1) * 10  # uniform features in [0, 10)
noise_train = np.random.randn(n_samples, 1)
y_train = 3 * X_train + 5 + noise_train  # y = 3x + 5 + standard-normal noise

model = LinearRegression().fit(X_train, y_train)

# Step 2: Evaluate on the drifted data
# Simulate data drift by shifting the feature range to [10, 20). The targets
# are generated with the same y = 3x + 5 rule, so only the inputs move.
X_drifted = np.random.rand(n_samples, 1) * 10 + 10
noise_drifted = np.random.randn(n_samples, 1)
y_drifted = 3 * X_drifted + 5 + noise_drifted

# Predictions from the single fitted model on both feature ranges
y_pred_original = model.predict(X_train)
y_pred_drifted = model.predict(X_drifted)

# Step 3: Compare errors
# The "original" error is measured on the same points used for fitting.
mse_original = mean_squared_error(y_train, y_pred_original)
mse_drifted = mean_squared_error(y_drifted, y_pred_drifted)

print(f"Mean Squared Error on original data: {mse_original:.2f}")
print(f"Mean Squared Error on drifted data: {mse_drifted:.2f}")

# Plotting: one panel per dataset, observed points plus the model's predictions
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (features, targets, predictions, scatter label, panel title, extra scatter kwargs)
panels = [
    (X_train, y_train, y_pred_original,
     "Original Data", "Model on Original Data", {}),
    (X_drifted, y_drifted, y_pred_drifted,
     "Drifted Data", "Model on Drifted Data", {"color": "orange"}),
]

for ax, (X_panel, y_panel, y_hat, point_label, title, scatter_style) in zip(axes, panels):
    ax.scatter(X_panel, y_panel, label=point_label, **scatter_style)
    ax.plot(X_panel, y_hat, color="red", label="Model Prediction")
    ax.set_title(title)
    ax.set_xlabel("X")
    ax.set_ylabel("y")
    ax.legend()

fig.tight_layout()
plt.show()



In [None]:
# Monitoring Data Distribution Changes
# Question: Use Python to monitor distribution changes in features to detect potential data drift.

# 1. Calculate feature statistics (mean and standard deviation) for both original and drifted data:
# 2. Compare statistics:
# 3. Set thresholds to detect significant drift:




In [None]:
# Automating Data Quality Checks with Python
# Question: Automate a basic data validation process using Python to ensure the dataset's
# structural integrity.

# 1. Define validation checks:
# 2. Apply validation:




In [None]:
# Introducing Great Expectations for Data Validation
# Question: Use Great Expectations to set up data validation checks for a dataset.

# 1. Install Great Expectations:
# 2. Create a new expectations suite:
# 3. Load data and generate expectations:




In [None]:
# Automating Constraint Checks with Python
# Question: Automate primary key and foreign key constraint checks using Python to ensure dataset compliance.


# 1. Assume the datasets are available as pandas DataFrames `employees_df` and `departments_df`, linked by a primary key / foreign key relationship:




In [None]:
# Advanced Data Drift Detection using Statistical Tests
# Question: Implement the Kolmogorov-Smirnov (KS) test in Python to detect data drift with a formal statistical test.

# 1. Use SciPy to perform KS test:


