## AI & Machine Learning for Data Quality
**Description**: AI and machine learning can automate and enhance data quality checks by learning patterns and identifying anomalies more effectively than static rules.

**Task 1**: Training a model to predict and flag unusual trend patterns in sales data that
deviate from historical norms.

In [None]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

# Generate synthetic sales data: 100 daily observations drawn from a normal
# distribution (mean 200, std 20). The fixed seed keeps the series reproducible.
np.random.seed(42)
dates = pd.date_range(start="2023-01-01", periods=100)
sales = np.random.normal(loc=200, scale=20, size=100)
# Inject anomalies: overwrite three positions with values far from the mean
sales[[20, 50, 75]] = [300, 50, 400]
df = pd.DataFrame({'date': dates, 'sales': sales})

# Feature engineering: add rolling mean and std over a trailing 5-row window.
# min_periods=1 lets the first rows use whatever history is available.
df['rolling_mean'] = df['sales'].rolling(window=5, min_periods=1).mean()
# The sample std of a single observation is undefined, so the first row comes
# back as NaN. A NaN feature can make the downstream model fit fail, so treat
# "no spread measurable yet" as zero spread.
df['rolling_std'] = df['sales'].rolling(window=5, min_periods=1).std().fillna(0.0)

# Fit an Isolation Forest on the raw value plus its rolling statistics.
# contamination=0.05 tells the model to flag roughly 5% of rows as outliers.
feature_columns = ['sales', 'rolling_mean', 'rolling_std']
model = IsolationForest(contamination=0.05, random_state=42)
raw_labels = model.fit_predict(df[feature_columns])

# fit_predict returns 1 for inliers and -1 for outliers; recode so 1 = anomaly
df['anomaly'] = pd.Series(raw_labels, index=df.index).map({-1: 1, 1: 0})

# Rows the model flagged; reused for both the chart overlay and the printout
anomalies = df[df['anomaly'] == 1]

# Draw the sales series as a line and overlay the flagged points in red
plt.figure(figsize=(12, 5))
plt.plot(df['date'], df['sales'], label='Sales')
plt.scatter(anomalies['date'], anomalies['sales'], color='red', label='Anomaly', s=60)
plt.title("Sales Trend with Anomaly Detection")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.legend()
plt.tight_layout()
plt.show()

# List the flagged dates and their sales values
print("\nDetected Anomalies:")
print(anomalies[['date', 'sales']])


**Task 2**: Using clustering algorithms to detect duplicate records where entries are not
exactly identical.

In [None]:
# write your code from here
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_distances

# Toy customer list containing spelling variants of the same people, so that
# exact-match deduplication would miss them
data = {
    'name': [
        'John Doe',
        'Jon Doe',
        'J. Doe',
        'Jane Smith',
        'Jane S.',
        'Smith Jane',
        'Alice Johnson',
        'Alyce Jonson',
    ],
}
df = pd.DataFrame(data)

# Represent each name by TF-IDF weights of its character 2- to 4-grams
# (word-boundary aware), so small spelling differences still share features
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4))
X = vectorizer.fit_transform(df['name'])

# Cluster on pairwise cosine distance. With min_samples=1 every record is
# assigned to a cluster, and records within eps of each other are linked.
distance_matrix = cosine_distances(X)
db = DBSCAN(
    eps=0.3,
    min_samples=1,
    metric='precomputed',
)
df['cluster'] = db.fit_predict(distance_matrix)

# Show the members of each cluster in ascending cluster-id order
for cluster, members in df.groupby('cluster', sort=True)['name']:
    print(f"\nCluster {cluster}:")
    print(members.to_list())


**Task 3**: Implementing classification models to validate data based on learned
characteristics from labeled datasets.

In [None]:
# write your code from here
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Create a mock labeled dataset of employee records.
# label == 1 marks a record with a data quality issue (an impossible age or a
# negative salary); label == 0 marks a clean record.
data = {
    'age': [25, 45, 130, 35, -1, 22, 40, 300, 33, 29],
    'salary': [50000, 80000, 60000, 40000, 100000, 30000, -5000, 90000, 45000, 48000],
    'experience': [2, 20, 5, 10, 40, 1, 15, 50, 7, 6],
    # The row with salary -5000 is flagged as well: a negative salary is a
    # data quality issue, so labelling it clean would teach the model the
    # wrong rule.
    'label': [0, 0, 1, 0, 1, 0, 1, 1, 0, 0]  # 1 indicates data quality issue
}
df = pd.DataFrame(data)

# Prepare features and target: every column except 'label' is a feature
X = df.drop('label', axis=1)
y = df['label']

# Split the data. The dataset is tiny and the flagged class is a minority, so
# an unstratified split can leave the test set with no flagged rows at all,
# which makes the report meaningless. stratify=y keeps both classes present
# in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Train classifier (fixed seed so the fitted forest is reproducible)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows. zero_division=0 reports 0 instead of emitting
# a warning when a class is never predicted, which is likely on a test set
# this small.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))
