In [1]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


# NOTE(review): pickle.load can run arbitrary code while deserializing.
# These files are presumably written by this project's own training run — confirm
# before pointing the notebook at artifacts from any other source.
model = _load_pickle('outputs/churn_prediction_model.pkl')
print("‚úì Model loaded")

# Mapping indexed by column name; each value is used later via .transform().
label_encoders = _load_pickle('outputs/label_encoders.pkl')
print("‚úì Label encoders loaded")

# Column names used to select the model inputs from the customer table.
feature_list = _load_pickle('outputs/feature_list.pkl')
print("‚úì Feature list loaded")

# Dict read below for 'model_type', 'test_accuracy' and 'test_roc_auc'.
metadata = _load_pickle('outputs/model_metadata.pkl')
print("‚úì Model metadata loaded")

feature_importance = pd.read_csv('outputs/feature_importance.csv')
print("‚úì Feature importance loaded")

# Echo the stored model description and test metrics.
print(f"\nüìä Model Info:")
print(f"   ‚Ä¢ Type: {metadata['model_type']}")
print(f"   ‚Ä¢ Accuracy: {metadata['test_accuracy']:.2%}")
print(f"   ‚Ä¢ ROC-AUC: {metadata['test_roc_auc']:.4f}")

‚úì Model loaded
‚úì Label encoders loaded
‚úì Feature list loaded
‚úì Model metadata loaded
‚úì Feature importance loaded

üìä Model Info:
   ‚Ä¢ Type: XGBoost
   ‚Ä¢ Accuracy: 81.05%
   ‚Ä¢ ROC-AUC: 0.8658


In [None]:
# Load the customer table that will be scored.
df = pd.read_csv('../data/churn.csv')
print(f"‚úì Loaded {len(df):,} customers")

# Keep only rows with a non-negative monthly charge. Rows whose charge is NaN
# fail the comparison and are therefore dropped as well.
df = df[df['Monthly_Charge'] >= 0]
print(f"‚úì Cleaned data: {len(df):,} customers")
df_all = df.copy()

# Service columns whose 'Yes' values are counted into Total_Services.
service_cols = [
    'Phone_Service', 'Multiple_Lines', 'Online_Security', 'Online_Backup',
    'Device_Protection_Plan', 'Premium_Support', 'Streaming_TV',
    'Streaming_Movies', 'Streaming_Music', 'Unlimited_Data'
]

# Number of services set to 'Yes' per customer. The comparison and the row sum
# are vectorized over the whole frame rather than calling a Python lambda once
# per row; the resulting integer counts are the same.
df_all['Total_Services'] = (df_all[service_cols] == 'Yes').sum(axis=1)

# Ratio features. Each denominator has 1 added — presumably to avoid dividing
# by zero for zero tenure / revenue / charges; confirm against the training code.
df_all['Revenue_Per_Month'] = df_all['Total_Revenue'] / (df_all['Tenure_in_Months'] + 1)
df_all['Charge_To_Revenue_Ratio'] = df_all['Monthly_Charge'] / (df_all['Total_Revenue'] + 1)
df_all['Refund_Rate'] = df_all['Total_Refunds'] / (df_all['Total_Charges'] + 1)

# 0/1 indicator flags.
df_all['Has_Refund'] = (df_all['Total_Refunds'] > 0).astype(int)
df_all['Has_Extra_Charges'] = (df_all['Total_Extra_Data_Charges'] > 0).astype(int)

# NOTE(review): the 75th-percentile cut-off is recomputed from the data being
# scored. If the model was trained with a threshold taken from the training set,
# this flag can differ between training and scoring — confirm.
high_value_threshold = df_all['Total_Revenue'].quantile(0.75)
df_all['High_Value_Customer'] = (df_all['Total_Revenue'] > high_value_threshold).astype(int)
df_all['Service_Adoption_Rate'] = df_all['Total_Services'] / len(service_cols)

‚úì Loaded 6,418 customers
‚úì Cleaned data: 6,311 customers


In [None]:
# Model inputs, selected in the column order stored in feature_list.
X = df_all[feature_list].copy()

categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Snapshot of the raw categorical values before they are replaced by integer codes.
original_values = {col: X[col].copy() for col in categorical_cols}

for col in categorical_cols:
    X[col] = X[col].fillna('Missing')
    encoder = label_encoders[col]
    try:
        X[col] = encoder.transform(X[col])
    except ValueError:
        # transform() rejected the column (for a scikit-learn LabelEncoder this
        # means at least one value was not seen during fitting — TODO confirm
        # the encoder type). Previously the WHOLE column was overwritten with 0,
        # discarding the feature for every customer. Now known values keep their
        # real code and only the unrecognised ones receive a fallback code.
        known_labels = getattr(encoder, 'classes_', None)
        if known_labels is None:
            # No class list to build a mapping from: keep the old behavior.
            print(f"WARNING: could not encode '{col}'; column set to 0")
            X[col] = 0
            continue
        code_by_label = {label: code for code, label in enumerate(known_labels)}
        # Prefer the code of the 'Missing' placeholder when the encoder knows it;
        # otherwise fall back to 0, the value the original code used.
        fallback_code = code_by_label.get('Missing', 0)
        encoded = X[col].map(code_by_label)
        n_unseen = int(encoded.isna().sum())
        X[col] = encoded.fillna(fallback_code).astype('int64')
        print(f"WARNING: {n_unseen:,} value(s) in '{col}' unknown to its encoder; "
              f"encoded as {fallback_code}")

# Fill remaining numeric gaps with the column median.
# NOTE(review): the median comes from the data being scored, not from the
# training set — confirm this matches how the model was trained.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
for col in numerical_cols:
    if X[col].isnull().any():
        # Assign the result back explicitly: calling fillna(inplace=True) on
        # X[col] is chained assignment and does not update X under pandas
        # Copy-on-Write.
        X[col] = X[col].fillna(X[col].median())

print(f"‚úì Prepared {len(feature_list)} features")
print(f"‚úì Encoded {len(categorical_cols)} categorical features")
print(f"‚úì No missing values: {X.isnull().sum().sum() == 0}")

‚úì Prepared 33 features
‚úì Encoded 16 categorical features
‚úì No missing values: True


In [None]:
# Hard class labels for every customer.
predictions = model.predict(X)

# Per-class probabilities; column 1 is kept as the churn probability.
# NOTE(review): assumes column 1 is the churn class — confirm via model.classes_.
class_probabilities = model.predict_proba(X)
probabilities = class_probabilities[:, 1]

# Attach both results to the customer table.
for column_name, column_values in (
    ('Churn_Prediction', predictions),
    ('Churn_Probability', probabilities),
):
    df_all[column_name] = column_values

# Create risk levels
def get_risk_level(prob):
    """Map a churn probability to a risk tier label.

    Tiers are tested from the highest lower bound down, each bound inclusive:
    ``>= 0.7`` -> 'Critical', ``>= 0.5`` -> 'High', ``>= 0.3`` -> 'Medium'.
    Anything that satisfies none of the comparisons (including NaN) is 'Low'.
    """
    tiers = (
        (0.7, 'Critical'),
        (0.5, 'High'),
        (0.3, 'Medium'),
    )
    for lower_bound, label in tiers:
        if prob >= lower_bound:
            return label
    return 'Low'

# Risk tier for each customer, derived from the churn probability.
churn_probability = df_all['Churn_Probability']
df_all['Risk_Level'] = churn_probability.apply(get_risk_level)

# Human-readable label for the 0/1 prediction; any other value maps to NaN.
status_by_prediction = {0: 'Will Stay', 1: 'Will Churn'}
df_all['Predicted_Status'] = df_all['Churn_Prediction'].map(status_by_prediction)

print(f"\n‚úì Predictions completed for {len(df_all):,} customers")


‚úì Predictions completed for 6,311 customers


In [None]:
# Headline counts over all scored customers.
churn_flags = df_all['Churn_Prediction']
total_predicted_churn = churn_flags.eq(1).sum()
total_predicted_stay = churn_flags.eq(0).sum()
avg_churn_prob = df_all['Churn_Probability'].mean()

# Customers per risk tier, ordered by count (value_counts default).
risk_counts = df_all['Risk_Level'].value_counts()

# Independent copy of the rows predicted to churn; later cells add columns to it.
predicted_churners = df_all.loc[churn_flags == 1].copy()

# Revenue totals over the predicted churners only.
total_revenue_at_risk = predicted_churners['Total_Revenue'].sum()
monthly_revenue_at_risk = predicted_churners['Monthly_Charge'].sum()

print(f"\nüìä Prediction Summary:")
print(f"   ‚Ä¢ Total Customers: {len(df_all):,}")
print(f"   ‚Ä¢ Predicted to Churn: {total_predicted_churn:,} ({total_predicted_churn/len(df_all)*100:.1f}%)")
print(f"   ‚Ä¢ Predicted to Stay: {total_predicted_stay:,} ({total_predicted_stay/len(df_all)*100:.1f}%)")
print(f"   ‚Ä¢ Average Churn Probability: {avg_churn_prob:.2%}")

print(f"\nüéØ Risk Level Distribution:")
for risk, count in risk_counts.items():
    print(f"   ‚Ä¢ {risk}: {count:,} ({count/len(df_all)*100:.1f}%)")

print(f"\nüìã Predicted Churners Profile:")
print(f"   ‚Ä¢ Total Predicted Churners: {len(predicted_churners):,}")
print(f"   ‚Ä¢ Average Churn Probability: {predicted_churners['Churn_Probability'].mean():.2%}")
print(f"   ‚Ä¢ Average Monthly Charge: ${predicted_churners['Monthly_Charge'].mean():.2f}")
print(f"   ‚Ä¢ Average Tenure: {predicted_churners['Tenure_in_Months'].mean():.1f} months")
print(f"   ‚Ä¢ Average Total Revenue: ${predicted_churners['Total_Revenue'].mean():.2f}")

print(f"\nüí∞ Revenue at Risk:")
print(f"   ‚Ä¢ Total Revenue at Risk: ${total_revenue_at_risk:,.2f}")
print(f"   ‚Ä¢ Monthly Revenue at Risk: ${monthly_revenue_at_risk:,.2f}")


üìä Prediction Summary:
   ‚Ä¢ Total Customers: 6,311
   ‚Ä¢ Predicted to Churn: 2,161 (34.2%)
   ‚Ä¢ Predicted to Stay: 4,150 (65.8%)
   ‚Ä¢ Average Churn Probability: 36.28%

üéØ Risk Level Distribution:
   ‚Ä¢ Low: 3,829 (60.7%)
   ‚Ä¢ Critical: 1,934 (30.6%)
   ‚Ä¢ Medium: 321 (5.1%)
   ‚Ä¢ High: 227 (3.6%)

üìã Predicted Churners Profile:
   ‚Ä¢ Total Predicted Churners: 2,161
   ‚Ä¢ Average Churn Probability: 89.48%
   ‚Ä¢ Average Monthly Charge: $69.77
   ‚Ä¢ Average Tenure: 17.4 months
   ‚Ä¢ Average Total Revenue: $1592.92

üí∞ Revenue at Risk:
   ‚Ä¢ Total Revenue at Risk: $3,442,295.84
   ‚Ä¢ Monthly Revenue at Risk: $150,772.70


In [None]:
# First ten rows of the importance table, taken in file order.
top_importance_rows = feature_importance.iloc[:10]
top_features = top_importance_rows['Feature'].tolist()

print(f"\nüîù Top 10 Churn Predictors:")
# The printed rank is the row's index label plus one, so it relies on the
# table carrying a 0-based index.
for idx, row in top_importance_rows.iterrows():
    print(f"   {idx+1}. {row['Feature']}: {row['Importance']:.4f}")

def get_risk_factors(row):
    """Describe up to three rule-based risk factors for one customer row.

    ``row`` is any mapping-like record (e.g. a DataFrame row) providing
    'Contract', 'Tenure_in_Months', 'Total_Services', 'Monthly_Charge' and
    'Total_Refunds' by key; 'Premium_Support' is read with ``.get`` and may be
    absent. Rules are evaluated in a fixed order and the first three that fire
    are joined with ', '. If none fire, 'General Risk' is returned.
    """

    def tenure_factor(months):
        # The two tenure labels are mutually exclusive; the shorter one wins.
        if months < 6:
            return 'Short Tenure (<6 months)'
        if months < 12:
            return 'Low Tenure (<1 year)'
        return None

    # One slot per rule, in priority order; None marks a rule that did not fire.
    candidates = [
        'Month-to-Month Contract' if row['Contract'] == 'Month-to-Month' else None,
        tenure_factor(row['Tenure_in_Months']),
        'Low Service Adoption' if row['Total_Services'] < 3 else None,
        'No Premium Support' if row.get('Premium_Support') == 'No' else None,
        'High Monthly Charge' if row['Monthly_Charge'] > 80 else None,
        'Has Refunds' if row['Total_Refunds'] > 0 else None,
    ]
    fired = [label for label in candidates if label is not None]

    if not fired:
        return 'General Risk'
    # Only the three highest-priority factors are reported.
    return ', '.join(fired[:3])

# Evaluate the rule-based factors once per predicted-churner row, then attach
# the resulting text as a new column.
risk_factor_labels = predicted_churners.apply(get_risk_factors, axis=1)
predicted_churners['Top_Risk_Factors'] = risk_factor_labels

print(f"\n‚úì Risk factors identified for {len(predicted_churners):,} predicted churners")


üîù Top 10 Churn Predictors:
   1. Contract: 0.3484
   2. Charge_To_Revenue_Ratio: 0.0539
   3. Internet_Type: 0.0481
   4. Premium_Support: 0.0360
   5. Streaming_Movies: 0.0292
   6. Streaming_TV: 0.0289
   7. Streaming_Music: 0.0265
   8. Monthly_Charge: 0.0241
   9. Payment_Method: 0.0216
   10. Online_Security: 0.0212

‚úì Risk factors identified for 2,161 predicted churners


In [8]:
# Full scored customer table.
output_file_all = '../data/all_customers_predictions.csv'
df_all.to_csv(output_file_all, index=False)
print(f"‚úì All predictions saved: {output_file_all}")

# Predicted churners only (includes the columns added to that subset).
output_file_churners = '../data/predicted_churners.csv'
predicted_churners.to_csv(output_file_churners, index=False)
print(f"‚úì Predicted churners saved: {output_file_churners}")

# One-row summary table: headline figures first, then one count per risk tier.
summary_stats = {
    'total_customers': len(df_all),
    'predicted_churners': total_predicted_churn,
    'predicted_stays': total_predicted_stay,
    'churn_rate_predicted': total_predicted_churn / len(df_all),
    'avg_churn_probability': avg_churn_prob,
    'total_revenue_at_risk': total_revenue_at_risk,
    'monthly_revenue_at_risk': monthly_revenue_at_risk,
}
risk_count_columns = (
    ('critical_risk_count', 'Critical'),
    ('high_risk_count', 'High'),
    ('medium_risk_count', 'Medium'),
    ('low_risk_count', 'Low'),
)
for stat_name, risk_level in risk_count_columns:
    summary_stats[stat_name] = (df_all['Risk_Level'] == risk_level).sum()

summary_file = '../data/prediction_summary.csv'
summary_df = pd.DataFrame([summary_stats])
summary_df.to_csv(summary_file, index=False)
print(f"‚úì Summary statistics saved: {summary_file}")

‚úì All predictions saved: ../data/all_customers_predictions.csv
‚úì Predicted churners saved: ../data/predicted_churners.csv
‚úì Summary statistics saved: ../data/prediction_summary.csv


In [9]:
# Columns shown in the highest-risk customer listing.
# NOTE(review): the printed table contains per-customer identifiers and
# demographics — check the rendered output before sharing the notebook.
report_columns = [
    'Customer_ID', 'Age', 'Gender', 'Contract', 'Tenure_in_Months',
    'Monthly_Charge', 'Total_Revenue', 'Churn_Probability',
    'Risk_Level', 'Top_Risk_Factors'
]

# Ten predicted churners with the highest churn probability.
highest_probability_rows = predicted_churners.nlargest(10, 'Churn_Probability')
top_10_risk = highest_probability_rows[report_columns]

print("\n")
print(top_10_risk.to_string(index=False))



Customer_ID  Age Gender       Contract  Tenure_in_Months  Monthly_Charge  Total_Revenue  Churn_Probability Risk_Level                                                  Top_Risk_Factors
  27402-AND   25   Male Month-to-Month                35           70.80         101.40           0.999790   Critical                       Month-to-Month Contract, No Premium Support
  44755-JHA   28 Female Month-to-Month                19           80.55         124.22           0.999788   Critical  Month-to-Month Contract, No Premium Support, High Monthly Charge
  46624-WES   26 Female Month-to-Month                27           80.20          81.43           0.999778   Critical  Month-to-Month Contract, No Premium Support, High Monthly Charge
  82170-TAM   41   Male Month-to-Month                29           44.60          89.88           0.999775   Critical Month-to-Month Contract, Low Service Adoption, No Premium Support
  29609-AND   55   Male Month-to-Month                16           70.75      

In [10]:
# Closing recap in three sections: headline results, files written, next steps.
results_section = (
    f"\nüìä Results:",
    f"   ‚Ä¢ {total_predicted_churn:,} customers predicted to churn ({total_predicted_churn/len(df_all)*100:.1f}%)",
    f"   ‚Ä¢ ${total_revenue_at_risk:,.2f} total revenue at risk",
    f"   ‚Ä¢ ${monthly_revenue_at_risk:,.2f} monthly recurring revenue at risk",
)

files_section = (
    f"\nüíæ Files Saved:",
    f"   ‚Ä¢ all_customers_predictions.csv (all customers with predictions)",
    f"   ‚Ä¢ predicted_churners.csv (only predicted churners)",
    f"   ‚Ä¢ prediction_summary.csv (summary statistics)",
)

next_steps_section = (
    f"\nüéØ Next Steps:",
    f"   1. Review predicted churners in predicted_churners.csv",
    f"   2. Prioritize by Risk_Level and Churn_Probability",
    f"   3. Run churn_prediction_dashboard.py to visualize results",
    f"   4. Implement retention strategies for high-risk customers",
)

# Emit one line per print call, in section order.
for report_line in (*results_section, *files_section, *next_steps_section):
    print(report_line)


üìä Results:
   ‚Ä¢ 2,161 customers predicted to churn (34.2%)
   ‚Ä¢ $3,442,295.84 total revenue at risk
   ‚Ä¢ $150,772.70 monthly recurring revenue at risk

üíæ Files Saved:
   ‚Ä¢ all_customers_predictions.csv (all customers with predictions)
   ‚Ä¢ predicted_churners.csv (only predicted churners)
   ‚Ä¢ prediction_summary.csv (summary statistics)

üéØ Next Steps:
   1. Review predicted churners in predicted_churners.csv
   2. Prioritize by Risk_Level and Churn_Probability
   3. Run churn_prediction_dashboard.py to visualize results
   4. Implement retention strategies for high-risk customers
