In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn imbalanced-learn joblib scipy flask


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from scipy.stats import chi2_contingency
from imblearn.over_sampling import SMOTE
import joblib


In [None]:
# Path of the raw churn CSV, relative to the notebook's working directory.
DATA_PATH = "data/telco_churn.csv"

# Read the raw file into the working frame used by the following cells.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Quick sanity check on the size of what was loaded, then preview the first rows.
print("Rows, Columns:", df.shape)
df.head()


In [None]:
# Basic cleaning of the working frame. Every step is guarded so the cell can be
# re-run on an already-cleaned `df` without corrupting it.

# The customer identifier is not a feature; drop it when present.
if 'customerID' in df.columns:
    df = df.drop(columns=['customerID'])

# TotalCharges arrives as text; anything that cannot be parsed becomes NaN.
has_total_charges = 'TotalCharges' in df.columns
if has_total_charges:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

print("Missing values per column:")
print(df.isna().sum())

# Impute unparseable TotalCharges with the median. The result is assigned back
# to the column: calling fillna(..., inplace=True) on df['TotalCharges'] acts on
# an intermediate object and does not reliably update `df` (it is a no-op under
# pandas Copy-on-Write).
if has_total_charges and df['TotalCharges'].isna().any():
    df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Trim stray whitespace from the text columns *before* encoding the target, so
# a padded label such as 'Yes ' is not silently mapped to NaN.
for c in df.select_dtypes(include='object').columns:
    df[c] = df[c].str.strip()

# Encode the target only while it still holds text labels; mapping an already
# numeric column again would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes':1, 'No':0})


In [None]:
# Partition the column names by storage dtype; `num_cols` feeds the correlation
# heatmap in the EDA cell.
num_cols = list(df.select_dtypes(include=['int64','float64']).columns)
cat_cols = list(df.select_dtypes(include=['object']).columns)

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# Summary statistics for every column, numeric and categorical alike.
summary_stats = df.describe(include='all')
print(summary_stats)


In [None]:
# Exploratory figures are written to disk rather than shown inline.
os.makedirs("figures", exist_ok=True)

# 1) Class balance of the target.
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x='Churn', data=df, ax=ax)
ax.set_title("Churn Distribution")
fig.savefig("figures/churn_distribution.png")
plt.close(fig)

# 2) Tenure histogram, stacked by churn outcome.
fig, ax = plt.subplots(figsize=(8,4))
sns.histplot(data=df, x='tenure', hue='Churn', bins=30, multiple='stack', ax=ax)
ax.set_title("Tenure Distribution by Churn")
fig.savefig("figures/tenure_churn.png")
plt.close(fig)

# 3) Pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(10,8))
numeric_corr = df[num_cols].corr()
sns.heatmap(numeric_corr, annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
fig.savefig("figures/corr_heatmap.png")
plt.close(fig)


In [None]:
# Encode every remaining text column as numbers and build the model matrices.
binary_map = {'Yes':1, 'No':0}

# Gender gets its own explicit 0/1 encoding.
if 'gender' in df.columns:
    df['gender'] = df['gender'].map({'Male':1, 'Female':0})

# Text columns whose two distinct values are exactly 'Yes' / 'No' become 0/1.
yes_no_cols = [
    col
    for col in df.select_dtypes(include='object').columns
    if df[col].nunique() == 2 and set(df[col].unique()).issubset({'Yes', 'No'})
]
for col in yes_no_cols:
    df[col] = df[col].map(binary_map)

# Whatever is still text is one-hot encoded; the first level of each column is
# dropped as the reference category.
remaining_cat = [col for col in df.select_dtypes(include='object').columns if col != 'Churn']
df = pd.get_dummies(df, columns=remaining_cat, drop_first=True)

# Every column except the target is a model feature.
features = [col for col in df.columns if col != 'Churn']
X = df.loc[:, features]
y = df['Churn']


In [None]:
# Hold out 20% of the rows for evaluation, stratified on the target.
# (`train_test_split` is already imported in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

class_counts_before = y_train.value_counts().to_dict()
print("Before SMOTE:", class_counts_before)

# Oversample the training partition only; the test set is left untouched.
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

class_counts_after = pd.Series(y_train_sm).value_counts().to_dict()
print("After SMOTE:", class_counts_after)


In [None]:
from sklearn.pipeline import Pipeline

# Standardise the features, then fit a logistic regression, as one pipeline so
# the scaler's statistics come from the training data only.
scaler = StandardScaler()
lr = LogisticRegression(max_iter=1000, solver='liblinear')
pipe_lr = Pipeline(steps=[('scaler', scaler), ('clf', lr)])
pipe_lr.fit(X_train_sm, y_train_sm)

# Hard labels and positive-class probabilities on the held-out set.
y_pred_lr = pipe_lr.predict(X_test)
y_proba_lr = pipe_lr.predict_proba(X_test)[:, 1]

acc_lr = accuracy_score(y_test, y_pred_lr)
auc_lr = roc_auc_score(y_test, y_proba_lr)

print("Logistic Regression Accuracy:", acc_lr)
print("Logistic Regression AUC:", auc_lr)
print(classification_report(y_test, y_pred_lr))


In [None]:
def chi2_pvalue(cat_series, target):
    """Return the chi-square independence-test p-value for two label series.

    The two series are cross-tabulated. A table with fewer than two rows or
    fewer than two columns cannot be tested, so 1.0 is returned for it.
    """
    table = pd.crosstab(cat_series, target)
    n_rows, n_cols = table.shape
    if min(n_rows, n_cols) < 2:
        return 1.0
    _statistic, p_value, _dof, _expected = chi2_contingency(table)
    return p_value

# Rank the raw categorical columns by how strongly they are associated with
# churn (chi-square test of independence; smaller p-value = stronger evidence).
raw = pd.read_csv(DATA_PATH)
raw['Churn'] = raw['Churn'].map({'Yes':1, 'No':0})

# Columns that are stored as text in the raw file but are not categorical
# features: customerID is a row identifier (dropped in the cleaning cell) and
# TotalCharges is a numeric amount (parsed with pd.to_numeric in the cleaning
# cell). Testing them would pollute the ranking.
non_categorical = {'customerID', 'TotalCharges'}
cat_columns_raw = [
    c for c in raw.select_dtypes(include='object').columns
    if c not in non_categorical
]

chi2_results = []
for c in cat_columns_raw:
    try:
        p = chi2_pvalue(raw[c], raw['Churn'])
    except Exception as exc:
        # Still best-effort, but say which column was left out and why instead
        # of dropping it silently.
        print(f"Skipping {c!r}: chi-square test failed ({exc})")
        continue
    chi2_results.append((c, p))

chi2_results = sorted(chi2_results, key=lambda x: x[1])
print("Categorical features ranked by chi-square p-value:")
print(chi2_results)


In [None]:
# A deliberately shallow tree (depth 3, at least 50 samples per leaf), trained
# on the same oversampled data as the logistic regression.
dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=50, random_state=42)
dt.fit(X_train_sm, y_train_sm)

# Hard labels and positive-class probabilities on the held-out set.
y_pred_dt = dt.predict(X_test)
y_proba_dt = dt.predict_proba(X_test)[:, 1]

acc_dt = accuracy_score(y_test, y_pred_dt)
auc_dt = roc_auc_score(y_test, y_proba_dt)

print("Decision Tree Accuracy:", acc_dt)
print("Decision Tree AUC:", auc_dt)
print(classification_report(y_test, y_pred_dt))

# Text rendering of the fitted splits, printed and also kept next to the figures.
tree_rules = export_text(dt, feature_names=features)
print(tree_rules)

with open("figures/decision_tree_rules.txt", "w") as rules_file:
    rules_file.write(tree_rules)


In [None]:
# ROC curves of both models on the held-out set, drawn on one axes.
# (`roc_curve` is already imported in the notebook's import cell.)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba_lr)
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_proba_dt)

fig, ax = plt.subplots(figsize=(6,5))
ax.plot(fpr_lr, tpr_lr, label=f"Logistic AUC={auc_lr:.3f}")
ax.plot(fpr_dt, tpr_dt, label=f"DecisionTree AUC={auc_dt:.3f}")
# Diagonal = performance of a random classifier.
ax.plot([0,1], [0,1], '--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve Comparison")
ax.legend()
fig.savefig("figures/roc_comparison.png")
plt.close(fig)

def gains_table(y_true, y_score, groups=10):
    """Build a per-bucket response table from labels and scores.

    Rows are ordered by descending score and cut by rank into `groups`
    quantile buckets (bucket 0 holds the highest scores).

    Returns a DataFrame indexed by bucket with the columns:
    sum, count, response_rate, cum_response, cum_count, cum_response_rate.
    """
    ranked = (
        pd.DataFrame({'y_true': y_true, 'y_score': y_score})
        .sort_values('y_score', ascending=False)
        .reset_index(drop=True)
    )
    # After the reset the index is the rank, so cutting it buckets rows by rank.
    ranked['bucket'] = pd.qcut(ranked.index, groups, labels=False)

    table = ranked.groupby('bucket')['y_true'].agg(['sum', 'count'])
    table['response_rate'] = table['sum'] / table['count']

    running = table[['sum', 'count']].cumsum()
    table['cum_response'] = running['sum']
    table['cum_count'] = running['count']
    table['cum_response_rate'] = table['cum_response'] / table['cum_count']
    return table

# Cumulative response rate per score bucket for both models.
gains_lr = gains_table(y_test.values, y_proba_lr)
gains_dt = gains_table(y_test.values, y_proba_dt)

# x-axis: share of the sample covered once each bucket is included, taken from
# the table's own cumulative counts. Spacing the points with
# np.linspace(0, 100, n) would place the first bucket at 0% and shift every
# later bucket off its true percentile.
pct_lr = gains_lr['cum_count'] / gains_lr['cum_count'].iloc[-1] * 100
pct_dt = gains_dt['cum_count'] / gains_dt['cum_count'].iloc[-1] * 100

plt.figure(figsize=(6,5))
plt.plot(pct_lr, gains_lr['cum_response_rate']*100, marker='o', label='Logistic')
plt.plot(pct_dt, gains_dt['cum_response_rate']*100, marker='o', label='DecisionTree')
plt.xlabel("Percentile of sample")
plt.ylabel("Cumulative Response Rate (%)")
plt.title("Cumulative Gains Chart")
plt.legend()
plt.savefig("figures/cumulative_gains.png")
plt.close()


In [None]:
# Persist the winning model and the fully encoded dataset.
os.makedirs("models", exist_ok=True)

# Keep whichever model has the higher test AUC; a tie goes to the logistic pipeline.
if auc_lr >= auc_dt:
    best_model = pipe_lr
else:
    best_model = dt
joblib.dump(best_model, "models/best_model.joblib")

# `df` at this point is the encoded frame (features plus the Churn target).
cleaned_csv_path = "data/cleaned_telco_churn.csv"
df.to_csv(cleaned_csv_path, index=False)

print("Best model saved to models/best_model.joblib")
print("Cleaned dataset saved to data/cleaned_telco_churn.csv")


In [None]:
# Source of a minimal Flask scoring service, written out as text for later
# deployment. The service expects a JSON object whose keys are the *encoded*
# feature names saved in models/feature_list.joblib; features missing from the
# payload are filled with 0 and unknown keys are ignored.
flask_code = """
from flask import Flask, request, jsonify
import joblib
import pandas as pd

app = Flask(__name__)

# Load both artifacts once at start-up instead of on every request.
model = joblib.load('models/best_model.joblib')
features = joblib.load('models/feature_list.joblib')

@app.route('/predict', methods=['POST'])
def predict():
    # silent=True returns None (instead of raising) for a missing or invalid JSON body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object of feature values'}), 400
    # Align the payload with the training columns: same order, missing -> 0, extras dropped.
    df = pd.DataFrame([data]).reindex(columns=features, fill_value=0)
    proba = model.predict_proba(df)[:,1]
    return jsonify({'churn_probability': float(proba[0])})

if __name__ == '__main__':
    # NOTE: 0.0.0.0 listens on every network interface; restrict the host when
    # the service should not be reachable from other machines.
    app.run(host='0.0.0.0', port=5000)
"""
with open("deployment_flask_app.txt", "w") as f:
    f.write(flask_code)

# The service above needs the exact training column order, so save it next to the model.
joblib.dump(features, "models/feature_list.joblib")

print("Flask skeleton written to deployment_flask_app.txt")
print("Feature list saved to models/feature_list.joblib")
    