# Eligibility Scoring - Data Exploration

This notebook explores the `schemes` table and performs exploratory data analysis (EDA) as groundwork for eligibility scoring.



In [None]:
import sys
import os

# Make <parent of the working directory>/shared/utils importable; the
# db_connector import below is presumably resolved from there (it is not
# imported as an installed package). The membership check keeps a re-run of
# this cell from stacking duplicate entries on sys.path.
shared_utils_dir = os.path.join(os.path.dirname(os.getcwd()), 'shared', 'utils')
if shared_utils_dir not in sys.path:
    sys.path.append(shared_utils_dir)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import yaml

from db_connector import DBConnector

# Notebook-wide plotting defaults: seaborn grid style and a wide default figure
FIGURE_SIZE = (12, 6)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = FIGURE_SIZE

# Confirmation that the setup cell ran to the end
print('âœ… Libraries imported successfully')



In [None]:
# Load configuration
# The path is relative, so it resolves against the notebook's working directory.
config_path = '../use-cases/eligibility_scoring/config/db_config.yaml'

# Decode as UTF-8 explicitly: without an encoding argument open() falls back to
# the platform's locale encoding, which can mis-read non-ASCII values in the file.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

print('âœ… Configuration loaded')
# Only host, port and database name are echoed; credentials are not printed.
print(f"Database: {config['database']['host']}:{config['database']['port']}/{config['database']['database']}")



In [None]:
# Point MLflow at the tracking URI and experiment named in the config file
mlflow_settings = config['mlflow']
experiment_name = mlflow_settings['experiment_name']

mlflow.set_tracking_uri(mlflow_settings['tracking_uri'])
mlflow.set_experiment(experiment_name)

# The URI is read back from MLflow; the experiment name is echoed from the config
print(f"âœ… MLflow tracking: {mlflow.get_tracking_uri()}")
print(f"âœ… MLflow experiment: {experiment_name}")



In [None]:
# Build the connector from the 'database' section of the config and open it.
# Exactly these five settings are forwarded as keyword arguments.
connection_keys = ('host', 'port', 'database', 'user', 'password')
db_settings = config['database']
db = DBConnector(**{key: db_settings[key] for key in connection_keys})
db.connect()



In [None]:
# Ask the connector which tables it can see and print them one per line
tables = db.list_tables()
table_total = len(tables)

print(f"ðŸ“Š Available tables ({table_total}):")
for table in tables:
    print(f"  - {table}")



In [None]:
# Fetch the connector's description of the 'schemes' table.
# NOTE(review): presumably a DataFrame of column metadata — confirm in db_connector.
schemes_info = db.get_table_info('schemes')

print("ðŸ“‹ Schemes Table Schema:")
schema_text = schemes_info.to_string()
print(schema_text)



In [None]:
# Row count reported by the connector for the 'schemes' table,
# printed with thousands separators
row_count = db.get_table_count('schemes')
row_count_message = f"ðŸ“Š Total rows in schemes table: {row_count:,}"
print(row_count_message)



In [None]:
# Load schemes data
# Row cap for the exploratory pull; change this one value to load more or fewer rows.
SCHEMES_SAMPLE_LIMIT = 10000

# int() guarantees only a number is ever interpolated into the SQL text.
# NOTE(review): the query has no ORDER BY, so which rows are returned is up to
# the database and may differ between runs — add a stable sort key if the
# sample needs to be reproducible.
query = f"SELECT * FROM schemes LIMIT {int(SCHEMES_SAMPLE_LIMIT)}"
df_schemes = db.execute_query(query)

print(f"âœ… Loaded {len(df_schemes):,} rows")
print(f"âœ… Columns: {list(df_schemes.columns)}")
# Last expression of the cell, so the first rows render as the cell output
df_schemes.head()



In [None]:
# Overview of the loaded sample: dimensions, per-column dtypes, null counts
print("ðŸ“Š Dataset Shape:")
n_rows, n_cols = df_schemes.shape
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")

print("\nðŸ“‹ Data Types:")
print(df_schemes.dtypes)

print("\nðŸ“Š Missing Values:")
missing = df_schemes.isnull().sum()
# Show only the columns that actually contain nulls
has_nulls = missing > 0
print(missing[has_nulls])



In [None]:
# describe() for every numeric column; nothing is printed when there are none
numerical_cols = df_schemes.select_dtypes(include=[np.number]).columns
if not numerical_cols.empty:
    print("ðŸ“Š Numerical Columns Summary:")
    numeric_summary = df_schemes[numerical_cols].describe()
    print(numeric_summary)



In [None]:
# Most frequent values for the object-dtype columns (at most the first ten)
categorical_cols = df_schemes.select_dtypes(include=['object']).columns
if not categorical_cols.empty:
    print("ðŸ“Š Categorical Columns:")
    inspected_cols = categorical_cols[:10]
    for col in inspected_cols:
        print(f"\n{col}:")
        # value_counts() sorts by frequency; head() keeps the top five
        top_values = df_schemes[col].value_counts().head()
        print(top_values)



In [None]:
# Record this exploration pass as an MLflow run: dataset-level facts as
# parameters, per-column mean/std of the numeric columns as metrics.
with mlflow.start_run(run_name="data_exploration_v1") as run:
    # Dataset info, sent as one batched call instead of one call per parameter
    mlflow.log_params({
        "dataset": "schemes",
        "rows": len(df_schemes),
        "columns": df_schemes.shape[1],
        # int() turns the numpy integer from .sum().sum() into a plain Python int
        "missing_values": int(df_schemes.isnull().sum().sum()),
    })

    # Collect the numeric statistics first, then log them in a single batch.
    # Iterating an empty column index is a no-op, so no length guard is needed.
    column_stats = {}
    for col in numerical_cols:
        column_stats[f"{col}_mean"] = float(df_schemes[col].mean())
        column_stats[f"{col}_std"] = float(df_schemes[col].std())
    if column_stats:
        mlflow.log_metrics(column_stats)

    print(f"âœ… Logged to MLflow run: {run.info.run_id}")
    # Strip a trailing slash so the printed URL never contains "//#/experiments"
    tracking_uri = mlflow.get_tracking_uri().rstrip('/')
    print(f"ðŸ“Š View at: {tracking_uri}/#/experiments/{run.info.experiment_id}/runs/{run.info.run_id}")



In [None]:
# Save the loaded sample as CSV in the configured output directory
output_dir = config['data']['output_path']
os.makedirs(output_dir, exist_ok=True)

# os.path.join inserts the separator when output_path has no trailing slash;
# plain string concatenation would write next to the directory, not inside it.
output_file = os.path.join(output_dir, "schemes_sample.csv")
df_schemes.to_csv(output_file, index=False)
print(f"âœ… Data saved to {output_file}")



In [None]:
# Release the database connection opened earlier in the notebook
db.disconnect()

closed_message = "âœ… Database connection closed"
print(closed_message)

