In [1]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

# 1. SETUP CONNECTION
# Load environment variables from the .env file one directory above the
# working directory; that file (or the process environment) must define DB_URL.
load_dotenv(dotenv_path="../.env")

RENDER_URL = os.getenv("DB_URL")
# os.getenv returns None when the variable is missing. Fail here with an
# actionable message instead of handing None (or an empty string) to
# create_engine, whose error would not mention the missing variable.
if not RENDER_URL:
    raise RuntimeError(
        "DB_URL is not set. Define it in ../.env or in the environment "
        "before running this cell."
    )
engine = create_engine(RENDER_URL)

# 2. SQL JOIN QUERY (Telco)
# Joins customers, services and contracts on customerid: every customers
# column plus the listed service and contract columns.
query = """
SELECT 
    c.*, 
    s.phoneservice, s.multiplelines, s.internetservice, 
    s.onlinesecurity, s.onlinebackup, s.deviceprotection, 
    s.techsupport, s.streamingtv, s.streamingmovies,
    k.tenure, k.contract, k.paperlessbilling, 
    k.paymentmethod, k.monthlycharges, k.totalcharges
FROM customers AS c
JOIN services AS s ON c.customerid = s.customerid
JOIN contracts AS k ON c.customerid = k.customerid
"""

# 3. LOAD DATA  /  4. CLEANUP
# One chain: run the query, coerce totalcharges to numeric (values that
# cannot be parsed become NaN and are then replaced by 0), and drop the
# customerid column if present since it is not a feature for the UI.
df = (
    pd.read_sql(query, engine)
    .assign(
        totalcharges=lambda frame: pd.to_numeric(
            frame['totalcharges'], errors='coerce'
        ).fillna(0)
    )
    .drop(columns=['customerid'], errors='ignore')
)

print(f"✓ Data loaded from Render. Shape: {df.shape}")
df.head()

✓ Data loaded from Render. Shape: (7043, 19)


Unnamed: 0,gender,seniorcitizen,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,tenure,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,1,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,34,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,2,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,45,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,2,Month-to-month,Yes,Electronic check,70.7,151.65


In [2]:
# =============================================================================
# ANALYZE TELCO DATA FOR STREAMLIT APP
# Find min/max for numerical features and unique options for categorical features
# =============================================================================

import json
import numpy as np

# Banner announcing the start of the analysis output.
banner = "=" * 80
print(banner)
print("ANALYZING TELCO CHURN DATA FOR STREAMLIT APP")
print(banner)

# 1. Define Feature Groups
numerical_features = ['tenure', 'monthlycharges', 'totalcharges']

# Every other column is treated as categorical; a 'churn' column, if the
# frame has one, is excluded as well.
non_categorical = set(numerical_features) | {'churn'}
categorical_features = [col for col in df.columns if col not in non_categorical]

# 2. Initialize Schema
# Two empty sections that the analysis steps below fill in.
data_schema = dict(numerical={}, categorical={})

# 3. Analyze Numerical Features
# For each numerical column, record min / max / mean as plain Python floats
# (so they serialize to JSON) and print one aligned row per feature.
rule = "-" * 80
print("\n" + rule)
print("NUMERICAL FEATURES")
print(rule)
print(f"{'Feature':<20} {'Min':<10} {'Max':<10} {'Mean':<10}")

for feature in numerical_features:
    column = df[feature]
    stats = {
        "min": float(column.min()),
        "max": float(column.max()),
        "mean": float(column.mean()),
    }
    data_schema["numerical"][feature] = stats
    print(f"{feature:<20} {stats['min']:<10.2f} {stats['max']:<10.2f} {stats['mean']:<10.2f}")

# 4. Analyze Categorical Features
# For each categorical column, store the sorted list of distinct values
# (as strings) in the schema and print it.
print("\n" + "-" * 80)
print("CATEGORICAL FEATURES")
print("-" * 80)

for feature in categorical_features:
    # Drop missing values before casting: astype(str) would otherwise turn
    # NaN/None into the literal strings 'nan'/'None', and those would be
    # written to the schema as if they were real categories.
    # Casting to str keeps numeric codes (e.g. 0/1 flags) sortable and
    # serializable alongside text values.
    unique_values = sorted(df[feature].dropna().astype(str).unique().tolist())

    data_schema["categorical"][feature] = unique_values

    print(f"{feature}: {unique_values}")

# 5. Save Schema to JSON
# The relative path assumes the notebook runs from 'notebooks/' and the
# schema belongs in the sibling 'data/' directory.
output_path = "../data/data_schema.json"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)  # create the target directory if missing

with open(output_path, mode='w') as schema_file:
    json.dump(data_schema, schema_file, indent=4)

footer = "=" * 80
print("\n" + footer)
print(f"✓ Data schema saved to {output_path}")
print(footer)

ANALYZING TELCO CHURN DATA FOR STREAMLIT APP

--------------------------------------------------------------------------------
NUMERICAL FEATURES
--------------------------------------------------------------------------------
Feature              Min        Max        Mean      
tenure               0.00       72.00      32.37     
monthlycharges       18.25      118.75     64.76     
totalcharges         0.00       8684.80    2279.73   

--------------------------------------------------------------------------------
CATEGORICAL FEATURES
--------------------------------------------------------------------------------
gender: ['Female', 'Male']
seniorcitizen: ['0', '1']
partner: ['No', 'Yes']
dependents: ['No', 'Yes']
phoneservice: ['No', 'Yes']
multiplelines: ['No', 'No phone service', 'Yes']
internetservice: ['DSL', 'Fiber optic', 'No']
onlinesecurity: ['No', 'No internet service', 'Yes']
onlinebackup: ['No', 'No internet service', 'Yes']
deviceprotection: ['No', 'No internet servic