In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Plotly Imports
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 0. CONFIGURATION & DATA LOADING
CSV_PATH = '/content/life_expectancy_data.csv'
TARGET = 'Life_Expectancy'
ID_COL = 'Country'

print(f"PREDICTING {TARGET}: FINAL FIX USING EXACT ID 'Country'")


def _build_dummy_dataset(n_rows):
    """Return a synthetic life-expectancy frame with `n_rows` rows.

    Two latent standard-normal factors (health, wealth) drive every numeric
    column, so the features correlate with the target. The global NumPy RNG
    is seeded with 42 first; the order of the random draws below is what makes
    the output reproducible, so do not reorder them.
    """
    np.random.seed(42)

    # Latent factors that the observable columns are derived from.
    health = np.random.normal(0, 1, n_rows)
    wealth = np.random.normal(0, 1, n_rows)

    # Observable features: linear in the latent factors plus Gaussian noise,
    # clipped to a fixed range where a range is given.
    adult_mortality = np.clip(150 - 30 * health + np.random.normal(0, 30, n_rows), 50, 300)
    immunization = np.clip(75 + 15 * health + np.random.normal(0, 10, n_rows), 50, 99)
    gdp_per_capita = np.exp(9 + 0.8 * wealth + np.random.normal(0, 0.5, n_rows))
    schooling = np.clip(10 + 3 * wealth + 2 * health + np.random.normal(0, 2, n_rows), 5, 18)
    health_exp = np.clip(5 + 2 * wealth + np.random.normal(0, 1.5, n_rows), 2, 12)

    # Target: depends on both latent factors and on (clipped) adult mortality.
    life_exp = np.clip(
        70 + 5 * health + 3 * wealth - 0.05 * (adult_mortality - 150) + np.random.normal(0, 3, n_rows),
        45,
        90,
    )

    # Dict values are evaluated top to bottom, so Country, Year and Status are
    # drawn from the RNG in exactly that order.
    frame = pd.DataFrame({
        'Country': np.random.choice(['USA', 'CAN', 'BRA', 'IND', 'CHN', 'NIG'], n_rows),
        'Year': np.random.randint(2000, 2016, n_rows),
        TARGET: life_exp,
        'Adult_Mortality': adult_mortality,
        'Immunization_Rate': immunization,
        'GDP_per_capita': gdp_per_capita,
        'Schooling_Years': schooling,
        'Total_Expenditure_Health': health_exp,
        'Status': np.random.choice(['Developed', 'Developing'], n_rows, p=[0.3, 0.7])
    })

    # Blank out 5% of the rows (sampled independently per column) so the
    # imputation step downstream has something to do.
    for sparse_col in ('Adult_Mortality', 'GDP_per_capita'):
        missing_rows = frame.sample(frac=0.05).index
        frame.loc[missing_rows, sparse_col] = np.nan

    return frame


# Use the CSV when it is present; otherwise fall back to the synthetic data.
try:
    df = pd.read_csv(CSV_PATH)
except FileNotFoundError:
    print(f"File not found at {CSV_PATH}. Creating a realistic dummy dataset with correlations...")
    df = _build_dummy_dataset(2000)

# Initial Cleaning
# Normalise the headers into identifier-like names. Order matters: whitespace
# must become '_' BEFORE the punctuation filter runs, because the filter's
# character class also removes spaces (doing it the other way round turns
# "Life expectancy " into "Lifeexpectancy" and the underscore step never fires).
original_cols = df.columns.tolist()
df.columns = (
    df.columns.astype(str)
    .str.strip()
    .str.replace(r'\s+', '_', regex=True)
    .str.replace(r'[^A-Za-z0-9_]+', '', regex=True)
)

# If the configured ID column is absent, fall back to the first column whose
# ORIGINAL header mentions "country". The cleaned name is taken positionally
# from df.columns, so ID_COL is guaranteed to be a real column afterwards.
# (Built-in str.replace has no `regex` keyword, so the name cannot be
# re-derived with the pandas-style call.)
if ID_COL not in df.columns:
    country_like_cols = [
        cleaned
        for raw, cleaned in zip(original_cols, df.columns)
        if 'country' in str(raw).lower()
    ]
    if country_like_cols:
        ID_COL = country_like_cols[0]

# Map the target onto its canonical name, ignoring case and underscores so that
# "LifeExpectancy", "Lifeexpectancy" and "Life_expectancy" are all recognised.
if TARGET not in df.columns:
    accepted_keys = {TARGET.replace('_', '').lower(), 'lifeexpectancy'}
    target_like_cols = [
        col for col in df.columns if col.replace('_', '').lower() in accepted_keys
    ]
    if target_like_cols:
        df = df.rename(columns={target_like_cols[0]: TARGET})

# Fail early with an actionable message instead of an opaque KeyError from dropna.
if TARGET not in df.columns:
    raise KeyError(
        f"Target column '{TARGET}' not found after cleaning. "
        f"Available columns: {list(df.columns)}"
    )

# Rows without a target value cannot be used for training or evaluation.
df = df.dropna(subset=[TARGET]).reset_index(drop=True)
print(f"Final ID Column used: {ID_COL}")
print(f"Available columns: {list(df.columns)}")


# 1. FEATURE ENGINEERING & PREPARATION

# Engineer features on a copy; `df` itself is left as cleaned above.
df_fe = df.copy()

# Log Transformation
# Each skewed column that is actually present gets a log1p companion named
# "Log_<col>"; the raw column is kept in the frame.
SKEWED_COLS = ['GDP_per_capita']
for col in (name for name in SKEWED_COLS if name in df_fe.columns):
    df_fe[f'Log_{col}'] = np.log1p(df_fe[col])

# Handle Categorical Features
# Every object-dtype column except the ID column is one-hot encoded; the first
# level of each is dropped (drop_first=True).
object_columns = df_fe.select_dtypes(include='object').columns.tolist()
categorical_features = [
    name for name in object_columns if not ID_COL or name != ID_COL
]

if len(categorical_features) > 0:
    df_fe = pd.get_dummies(df_fe, columns=categorical_features, drop_first=True)

# 2. MODEL TRAINING & EVALUATION

# Numeric columns of the engineered frame (target excluded).
numeric_features = df_fe.select_dtypes(include=np.number).columns.tolist()
numeric_features = [col for col in numeric_features if col != TARGET]

# Final Feature List: drop the target, the year, the raw skewed columns (their
# Log_ versions are used instead) and the ID column.
EXCLUDE_COLS = [TARGET, 'Year'] + SKEWED_COLS
if ID_COL:
    EXCLUDE_COLS.append(ID_COL)

feature_cols = [col for col in df_fe.columns if col not in EXCLUDE_COLS]

# Train/Test Split
X = df_fe[feature_cols]
y = df_fe[TARGET]

# The frame's index is split alongside X and y so the held-out rows can be
# looked up again in the un-engineered frame.
X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(
    X, y, df_fe.index, test_size=0.2, random_state=42
)

# test_idx holds index LABELS, so select with .loc rather than positionally.
test_df = df.loc[test_idx].reset_index(drop=True)

# 1. Imputation
# The fill values are learned from the training rows only and then applied to
# both splits, so no information from the test rows leaks into preprocessing
# (same fit-on-train / apply-to-test discipline as the scaler below).
# Note: df_fe and X themselves are left un-imputed.
impute_cols = [col for col in numeric_features if col in feature_cols]
train_means = X_train[impute_cols].mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
print("NaNs imputed with mean.")

# 2. Scaling (fit on train, apply to test)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Hyperparameter Tuning: 5-fold CV over the Ridge penalty, scored by MSE.
param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}
ridge = Ridge()
gscv = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
gscv.fit(X_train_scaled, y_train)
best_alpha = gscv.best_params_['alpha']

# 4. Final Model Training & Prediction
model_name = 'Ridge Regression (Optimized)'
model = Ridge(alpha=best_alpha)
model.fit(X_train_scaled, y_train)

# Predict each split once and reuse the result for every metric.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Optimal Ridge Alpha found: {best_alpha}")
print(f"Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}")
print(f"Train RMSE: {np.sqrt(mean_squared_error(y_train, y_train_pred)):.2f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.2f}")

PREDICTING Life_Expectancy: FINAL FIX USING EXACT ID 'Country'
File not found at /content/life_expectancy_data.csv. Creating a realistic dummy dataset with correlations...
Final ID Column used: Country
Available columns: ['Country', 'Year', 'Life_Expectancy', 'Adult_Mortality', 'Immunization_Rate', 'GDP_per_capita', 'Schooling_Years', 'Total_Expenditure_Health', 'Status']
NaNs imputed with mean.
Optimal Ridge Alpha found: 10.0
Train R²: 0.7557, Test R²: 0.6878
Train RMSE: 3.83
Test RMSE: 4.22


In [40]:
# 3. PLOTLY VISUALIZATIONS

# Hover label per test row: the ID column (or the literal "Data Point" when the
# ID column is missing), followed by " - <Year>" when a Year column exists.
if ID_COL in test_df.columns:
    hover_text_base = test_df[ID_COL].astype(str)
else:
    hover_text_base = "Data Point"

hover_text = hover_text_base
if 'Year' in test_df.columns:
    hover_text = hover_text_base + " - " + test_df['Year'].astype(str)

# A Series is reduced to its raw values so it lines up positionally with the
# prediction arrays; a plain string is passed through and broadcast to every row.
hover_column = hover_text.values if isinstance(hover_text, pd.Series) else hover_text

scatter_df = pd.DataFrame(dict(
    Actual=y_test.values,
    Predicted=y_test_pred,
    Hover_Text=hover_column,
))


# 1. Interactive Actual vs. Predicted Scatter Plot

# Shared extent of actual and predicted values; used as both endpoints of the
# y = x reference line.
min_val = min(y_test.min(), y_test_pred.min())
max_val = max(y_test.max(), y_test_pred.max())
diagonal = [min_val, max_val]

# One marker per test row, with the prepared hover label attached.
predictions_trace = go.Scatter(
    x=scatter_df['Actual'],
    y=scatter_df['Predicted'],
    mode='markers',
    name='Predictions',
    text=scatter_df['Hover_Text'],
    hoverinfo='text+x+y',
)

# Dashed red diagonal: points on this line are predicted exactly.
perfect_trace = go.Scatter(
    x=diagonal,
    y=diagonal,
    mode='lines',
    line=dict(color='red', dash='dash'),
    name='Perfect Prediction',
)

fig_scatter = go.Figure(data=[predictions_trace, perfect_trace])
fig_scatter.update_layout(
    title='Actual vs. Predicted Life Expectancy (Test Set)',
    xaxis_title='Actual Life Expectancy (Years)',
    yaxis_title='Predicted Life Expectancy (Years)',
    template="plotly_white",
    height=600,
    width=700,
)
fig_scatter.show()


# 2. Interactive Feature Coefficients Bar Chart

# Pair each fitted Ridge coefficient with its feature name and order the rows
# by absolute magnitude, largest first.
coeffs = model.coef_
feature_names = X.columns
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coeffs})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='Abs_Coefficient', ascending=False)
)

# Negative coefficients are drawn red, all others blue.
bar_colors = np.where(feature_importance_df['Coefficient'] < 0, 'red', 'blue').tolist()

coefficient_bars = go.Bar(
    y=feature_importance_df['Feature'],
    x=feature_importance_df['Coefficient'],
    orientation='h',
    marker_color=bar_colors,
)

fig_coef = go.Figure(data=[coefficient_bars])
fig_coef.update_layout(
    title='Ridge Model Feature Coefficients (Impact on Life Expectancy)',
    xaxis_title='Coefficient Value (Scaled Impact)',
    yaxis_title='Feature',
    template="plotly_white",
)
fig_coef.show()
