In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report


In [15]:
# Load the raw customer table (path is relative to the notebook's working
# directory) and preview its first five rows as the cell output.
churn_csv = "customer_churn.csv"
df = pd.read_csv(churn_csv)
df.head(5)


Unnamed: 0,customer_id,tenure_months,monthly_spend,support_tickets,contract_type,churn
0,1,51,99.62,2,Annual,0
1,2,54,102.63,2,Monthly,0
2,3,8,86.41,3,Monthly,0
3,4,27,93.3,0,Annual,0
4,5,27,99.68,2,Monthly,0


In [3]:
# Schema overview: column names, non-null counts, dtypes and memory usage
# (info() prints its report rather than returning it).
df.info()

# Summary statistics of the numeric columns, shown as the cell's output.
# NOTE(review): the recorded output lists a negative minimum for monthly_spend
# (-1.08) -- confirm whether negative spend is valid before modelling.
numeric_summary = df.describe()
numeric_summary


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      10000 non-null  int64  
 1   tenure_months    10000 non-null  int64  
 2   monthly_spend    10000 non-null  float64
 3   support_tickets  10000 non-null  int64  
 4   contract_type    10000 non-null  object 
 5   churn            10000 non-null  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


Unnamed: 0,customer_id,tenure_months,monthly_spend,support_tickets,churn
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,30.2641,79.851325,2.0153,0.2266
std,2886.89568,17.029408,20.015241,1.432505,0.418653
min,1.0,1.0,-1.08,0.0,0.0
25%,2500.75,16.0,66.0675,1.0,0.0
50%,5000.5,30.0,79.9,2.0,0.0
75%,7500.25,45.0,93.31,3.0,0.0
max,10000.0,59.0,151.99,10.0,1.0


In [16]:
# Rule-based (synthetic) churn label.
#
# WARNING: this overwrites the `churn` column loaded from the CSV with a label
# computed deterministically from tenure_months, support_tickets, monthly_spend
# and contract_type -- the same columns that are later used as model features.
# A model trained on those features is therefore recovering this rule, and the
# scores reported further down measure how well it does that, not how well it
# predicts observed churn.
#
# A customer is labelled as churned (1) when either condition holds:
#   1. tenure under 12 months AND at least 4 support tickets
#   2. monthly spend under 60 AND a monthly contract
new_and_ticket_heavy = (df["tenure_months"] < 12) & (df["support_tickets"] >= 4)

# Accept both the raw label ("Monthly") and its integer code (0) so the cell
# yields the same labels whether it runs before or after contract_type has been
# encoded; comparing only against the string silently disables rule 2 on a
# re-run after encoding.
on_monthly_contract = (df["contract_type"] == "Monthly") | (df["contract_type"] == 0)
low_spend_monthly = (df["monthly_spend"] < 60) & on_monthly_contract

df["churn"] = np.where(new_and_ticket_heavy | low_spend_monthly, 1, 0)

In [17]:
# Share of each churn class, expressed as fractions rather than raw counts.
churn_share = df["churn"].value_counts(normalize=True)
churn_share



Unnamed: 0_level_0,proportion
churn,Unnamed: 1_level_1
0,0.8761
1,0.1239


In [18]:
# Encode contract_type as an integer code: Monthly -> 0, Annual -> 1.
#
# Mapping a column that already holds the codes would turn every value into
# NaN, so the mapping is skipped when the non-null values are already codes.
# This keeps the cell safe to re-run.
contract_codes = {"Monthly": 0, "Annual": 1}
contract_col = df["contract_type"]

if not contract_col.dropna().isin(list(contract_codes.values())).all():
    encoded_contract = contract_col.map(contract_codes)

    # Values that were present but have no code would otherwise become NaN
    # silently; fail here with the offending labels instead.
    unmapped = contract_col.notna() & encoded_contract.isna()
    if unmapped.any():
        unknown = sorted(contract_col[unmapped].astype(str).unique())
        raise ValueError(f"Unexpected contract_type value(s): {unknown}")

    df["contract_type"] = encoded_contract


In [19]:
# Target vector: the churn label.
y = df.loc[:, "churn"]

# Feature matrix: every remaining column except the row identifier and the target.
non_feature_cols = ["customer_id", "churn"]
X = df.drop(non_feature_cols, axis="columns")



In [21]:
# Hold out 30% of the rows for evaluation, with a fixed seed for repeatability.
# stratify=y keeps the churn / non-churn proportions the same in both subsets.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [22]:
# Learn the scaling statistics from the training rows only, then apply that same
# fitted scaler to both subsets so no test-set statistics enter the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [23]:
# Logistic regression on the scaled training features; the iteration cap is
# set to 1000.
logreg_options = {"max_iter": 1000}
model = LogisticRegression(**logreg_options)
model.fit(X_train_scaled, y_train)


In [24]:
# predict_proba returns one column per class; column 1 is kept as the
# churn score and passed to ROC AUC against the held-out labels.
test_probabilities = model.predict_proba(X_test_scaled)
y_pred_prob = test_probabilities[:, 1]

auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
auc


np.float64(0.942875321189506)

In [25]:
# Hard class predictions on the held-out set, summarised as per-class
# precision / recall / F1.
# Emphasis is on ranking customers correctly (AUC) rather than on raw accuracy.
y_pred = model.predict(X_test_scaled)
report_text = classification_report(y_test, y_pred)
print(report_text)


              precision    recall  f1-score   support

           0       0.95      0.98      0.97      2628
           1       0.84      0.63      0.72       372

    accuracy                           0.94      3000
   macro avg       0.90      0.80      0.84      3000
weighted avg       0.94      0.94      0.94      3000



In [26]:
# Pair each feature name with its fitted coefficient. The model was trained on
# standardized features, so the coefficients are on that standardized scale.
coefficients = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": model.coef_[0],
})

# Order from the most positive to the most negative coefficient.
coefficients = coefficients.sort_values("Coefficient", ascending=False)

coefficients
# Reading the signs:
#   - lower tenure        -> higher churn score
#   - more support tickets -> higher churn score
#   - annual contract      -> lower churn score

Unnamed: 0,Feature,Coefficient
2,support_tickets,0.687918
0,tenure_months,-0.555998
3,contract_type,-1.556433
1,monthly_spend,-2.414155


In [13]:
### Business Insights

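# Customers with short tenure and many support tickets are significantly more likely to churn
# Monthly-contract customers show higher churn risk than annual-contract customers
# Low monthly spend carries the largest coefficient magnitude in the fitted model
# Retention efforts should focus on early-stage customers and service-quality improvements
# Caveat: churn here is the rule-based label defined earlier in this notebook from these same features,
# so these patterns restate that rule and should be validated against observed churn before acting on them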
