Exercise 1: Data Quality Checker 
Real scenario: Your team receives daily CSV exports from a vendor. Before loading into the pipeline, you need to validate the data.

In [None]:
# You receive this data structure (simulate it as variables):
daily_records = [
    {"user_id": "U001", "age": 25, "revenue": 150.50, "country": "US"},
    {"user_id": "U002", "age": -5, "revenue": 200.00, "country": "UK"},  # Invalid age
    {"user_id": "U003", "age": 30, "revenue": "invalid", "country": "CA"},  # Invalid revenue
    {"user_id": "", "age": 28, "revenue": 175.25, "country": "US"},  # Missing user_id
    {"user_id": "U005", "age": 150, "revenue": 99.99, "country": ""},  # Invalid age, missing country
]

valid_records=[]
invalid_records=[]

for i in daily_records:
    if i['user_id'] !="" and 0<=i['age']<=120 and (isinstance(i['revenue'],int) or isinstance(i['revenue'],float)) and i['country'] !="" :
        valid_records.append(i)
    else:
        invalid_records.append(i)
        

print(valid_records)
print(invalid_records)


print(len(daily_records))
print(len(valid_records))
print(len(invalid_records))


[{'user_id': 'U001', 'age': 25, 'revenue': 150.5, 'country': 'US'}]
[{'user_id': 'U002', 'age': -5, 'revenue': 200.0, 'country': 'UK'}, {'user_id': 'U003', 'age': 30, 'revenue': 'invalid', 'country': 'CA'}, {'user_id': '', 'age': 28, 'revenue': 175.25, 'country': 'US'}, {'user_id': 'U005', 'age': 150, 'revenue': 99.99, 'country': ''}]
5
1
4


Exercise 2:This simulates a log file (normally you'd read from file, but start simple)


In [73]:
log_entries = [
    "2026-01-26 09:15:23 INFO Model training started",
    "2026-01-26 09:16:45 ERROR Failed to load dataset: FileNotFoundError",
    "2026-01-26 09:17:12 WARNING Low memory: 85% used",
    "2026-01-26 09:18:33 INFO Epoch 1/10 completed",
    "2026-01-26 09:19:44 ERROR CUDA out of memory",
    "2026-01-26 09:20:15 ERROR Failed to load dataset: FileNotFoundError",  # Duplicate
    "2026-01-26 09:21:05 INFO Model saved successfully",
    "2026-01-26 09:22:18 WARNING Validation accuracy below threshold",
]

#q1
times=[]
log_level=[]
msg=[]
for i in log_entries:
    s=i.split(" ",3)
    print(s)
    times.append(s[1])
    log_level.append(s[2])
    msg.append(s[-1])
print(times) 
print(log_level)
print(msg)

#q2
logs_per_level = {}

for i in log_level:
    if i not in logs_per_level:
        logs_per_level[i]=1
    else:
        logs_per_level[i]+=1

logs_per_level = {}



unique_errors = set()

for i in range(len(log_level)):
    if log_level[i] == "ERROR":
        unique_errors.add(msg[i])


#q3
first_error_index = log_level.index("ERROR")
first_error_time = times[first_error_index]

#q4

error_counts = {}

for i in range(len(log_level)):
    if log_level[i] == "ERROR":
        if msg[i] not in error_counts:
            error_counts[msg[i]] = 1
        else:
            error_counts[msg[i]] += 1

most_common_error = max(error_counts, key=error_counts.get)
most_common_error_count = error_counts[most_common_error]



# 3. Find:
#    - Time of first ERROR
#    - Most common error message
#
# 4. Print report:
#    """
#    === Log Summary ===
#    Total entries: 8
#    INFO: 3
#    WARNING: 2
#    ERROR: 3
#    
#    First error at: 09:16:45
#    Unique errors: 2
#    Most common error: Failed to load dataset: FileNotFoundError (appeared 2 times)
#    """

['2026-01-26', '09:15:23', 'INFO', 'Model training started']
['2026-01-26', '09:16:45', 'ERROR', 'Failed to load dataset: FileNotFoundError']
['2026-01-26', '09:18:33', 'INFO', 'Epoch 1/10 completed']
['2026-01-26', '09:19:44', 'ERROR', 'CUDA out of memory']
['2026-01-26', '09:20:15', 'ERROR', 'Failed to load dataset: FileNotFoundError']
['2026-01-26', '09:21:05', 'INFO', 'Model saved successfully']
['09:15:23', '09:16:45', '09:17:12', '09:18:33', '09:19:44', '09:20:15', '09:21:05', '09:22:18']
['Model training started', 'Failed to load dataset: FileNotFoundError', 'Low memory: 85% used', 'Epoch 1/10 completed', 'CUDA out of memory', 'Failed to load dataset: FileNotFoundError', 'Model saved successfully', 'Validation accuracy below threshold']


EXERCISE 3

In [3]:
# This simulates a config file (later you'll read from JSON/YAML)
training_config1 = {
    "model_name": "resnet50",
    "batch_size": 32,
    "learning_rate": 0.001,
    "epochs": 100,
    "gpu_id": 0,
    "data_path": "/data/imagenet",
    "save_checkpoints": True,
}

# Validation rules (your manager gives you these):
VALID_MODELS = ["resnet50", "vgg16", "efficientnet"]
MIN_BATCH_SIZE = 1
MAX_BATCH_SIZE = 512
MIN_LR = 0.00001
MAX_LR = 1.0
MIN_EPOCHS = 1
MAX_EPOCHS = 1000


# 2. If ALL valid: print "✓ Config valid. Safe to start training."
# 3. If ANY invalid: print "✗ Config invalid:" and list ALL issues
#    Example: "✗ Config invalid:
#             - batch_size 1024 exceeds maximum 512
#             - learning_rate 5.0 exceeds maximum 1.0"
#
# 3. Test with INVALID config:
training_config = {
    "model_name": "bert",  # Not in valid models
    "batch_size": 1024,    # Too large
    "learning_rate": 5.0,  # Too large
    "epochs": -10,         # Negative
    "gpu_id": -1,          # Negative
    "data_path": "",       # Empty
    "save_checkpoints": "yes",  # Wrong type
}

# Your job:
# 1. Check:
#    - model_name is in VALID_MODELS
#    - batch_size is between MIN and MAX
#    - learning_rate is between MIN and MAX
#    - epochs is between MIN and MAX
#    - gpu_id is 0 or positi
#    - data_path is not empty string
#    - save_checkpoints is True or False (boolean)
if training_config['model_name'] in VALID_MODELS and MIN_BATCH_SIZE<=training_config["batch_size"]<=MAX_BATCH_SIZE and MIN_LR<=training_config["learning_rate"]<=MAX_LR and MIN_EPOCHS<=training_config["epochs"]<=MAX_EPOCHS and training_config['gpu_id'] >=0 and training_config['data_path']!= "" and (training_config['save_checkpoints'] ==True or training_config['save_checkpoints'] ==False):
    print('✓ Config valid. Safe to start training.')
else:
    print('✗ Config invalid:')





✗ Config invalid:


In [None]:
# Raw data from database (simulate as list of dicts)
users = [
    {"user_id": "U001", "signup_date": "2025-01-15", "last_login": "2026-01-25", 
     "purchases": 5, "total_spent": 499.99, "email_domain": "gmail.com"},
    
    {"user_id": "U002", "signup_date": "2024-06-20", "last_login": "2026-01-20",
     "purchases": 0, "total_spent": 0, "email_domain": "yahoo.com"},
    
    {"user_id": "U003", "signup_date": "2025-11-01", "last_login": "2026-01-26",
     "purchases": 12, "total_spent": 1250.00, "email_domain": "company.com"},
    
    {"user_id": "U004", "signup_date": "2023-03-10", "last_login": "2025-12-15",
     "purchases": 3, "total_spent": 150.00, "email_domain": "gmail.com"},
]
import datetime
# Your job - calculate these features for EACH user:
# 1. days_since_signup: days between signup_date and today (2026-01-26)
# 2. days_since_last_login: days between last_login and today
# 3. average_purchase_value: total_spent / purchases (handle division by zero!)
# 4. is_active: True if last_login was within 7 days, False otherwise
# 5. is_high_value: True if total_spent > 500, False otherwise
# 6. is_enterprise_email: True if email_domain is NOT gmail/yahoo/hotmail

# Create NEW list with enriched data:
enriched_users = [
    {
        "user_id": "U001",
        "days_since_signup": 376,  # Calculate this
        "days_since_last_login": 1,
        "average_purchase_value": 99.99,
        "is_active": True,
        "is_high_value": False,
        "is_enterprise_email": False,
    },
    # ... for all users
]

# Print summary stats:
# Total users: 4
# Active users: 2
# High-value users: 1
# Enterprise emails: 1
# Average purchase value (all users): $X.XX

In [None]:
# Simulate errors collected from monitoring system
errors = [
    {"timestamp": "2026-01-26 02:15:33", "component": "data_loader", 
     "error_type": "FileNotFoundError", "severity": "HIGH"},
    
    {"timestamp": "2026-01-26 02:16:45", "component": "model_inference", 
     "error_type": "CUDA_OUT_OF_MEMORY", "severity": "CRITICAL"},
    
    {"timestamp": "2026-01-26 02:17:12", "component": "data_loader", 
     "error_type": "FileNotFoundError", "severity": "HIGH"},
    
    {"timestamp": "2026-01-26 03:22:05", "component": "api_server", 
     "error_type": "TimeoutError", "severity": "MEDIUM"},
    
    {"timestamp": "2026-01-26 03:45:18", "component": "model_inference", 
     "error_type": "InvalidInputShape", "severity": "HIGH"},
    
    {"timestamp": "2026-01-26 04:10:33", "component": "data_loader", 
     "error_type": "FileNotFoundError", "severity": "HIGH"},
]

# Generate report:
"""
=== OVERNIGHT ERROR REPORT ===
Time range: 02:15:33 - 04:10:33
Total errors: 6

By Severity:
  CRITICAL: 1
  HIGH: 4
  MEDIUM: 1

By Component:
  data_loader: 3 errors
  model_inference: 2 errors
  api_server: 1 error

Top Issues:
  1. FileNotFoundError - 3 occurrences (data_loader)
  2. CUDA_OUT_OF_MEMORY - 1 occurrence (model_inference)
  3. InvalidInputShape - 1 occurrence (model_inference)

⚠️ CRITICAL ISSUE: CUDA_OUT_OF_MEMORY at 02:16:45
   Component: model_inference

Recommendation: Check data_loader - 3 errors of same type
"""

# Your job: Generate this report programmatically
# - Count by severity
# - Count by component
# - Find most common error types
# - Identify critical errors
# - Calculate time range