In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [27]:
# Step 1: Build a synthetic customer dataset
# No file is read here: the table is generated from random draws, so a fixed
# seed is required for the notebook to reproduce the same rows (and therefore
# the same downstream metrics) on every Restart & Run All.
SEED = 42
N_CUSTOMERS = 100
rng = np.random.default_rng(SEED)

# NOTE: Age and Tenure are drawn independently of Churn, so the features carry
# no real signal about the target; near-chance model scores are expected.
data = {
    'CustomerID': range(1, N_CUSTOMERS + 1),
    'Age': rng.integers(18, 70, size=N_CUSTOMERS),      # ages 18..69 (upper bound exclusive)
    'Tenure': rng.integers(1, 10, size=N_CUSTOMERS),    # tenure 1..9 (upper bound exclusive)
    'Churn': rng.choice([0, 1], size=N_CUSTOMERS, p=[0.7, 0.3])  # 70% retention, 30% churn
}

customer_data = pd.DataFrame(data)

In [29]:
# Step 2: Data Aggregation
# Group customers by the Churn flag and take the mean Age and mean Tenure of
# each group, so the two groups can be compared side by side.
churn_groups = customer_data.groupby('Churn')
aggregated_data = churn_groups[['Age', 'Tenure']].mean().reset_index()
print("Aggregated Data (Average Age and Tenure):\n", aggregated_data)


Aggregated Data (Average Age and Tenure):
    Churn        Age    Tenure
0      0  38.088235  5.073529
1      1  42.250000  5.531250


In [31]:
# Step 3: Data Splitting
# The target (y) is the Churn column; the features (X) are Age and Tenure.
# CustomerID is not used as a feature.
y = customer_data['Churn']
X = customer_data[['Age', 'Tenure']]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Show how many training rows fall into each class
train_class_counts = y_train.value_counts()
print("Class distribution in training set:\n", train_class_counts)

Class distribution in training set:
 Churn
0    54
1    26
Name: count, dtype: int64


In [33]:
# Step 4: Handle Class Imbalance
# Oversample with SMOTE using the training split only; the test split is
# not passed to the resampler.
smote = SMOTE(random_state=42)
resampled_split = smote.fit_resample(X_train, y_train)
X_resampled, y_resampled = resampled_split

In [35]:
# Step 5: Model Training
# Classifier: Logistic Regression with its default settings
model = LogisticRegression()

# Fit on the SMOTE-resampled training data (not the raw training split)
model.fit(X=X_resampled, y=y_resampled)

In [37]:
# Step 6: Make Predictions
# Predict a class label for every row of the held-out test features
y_pred = model.predict(X=X_test)

In [39]:
# Step 7: Evaluate Model Performance
# All three metrics compare the true test labels against the predictions.
# zero_division=0 is passed explicitly to the classification report.
class_report = classification_report(y_true=y_test, y_pred=y_pred, zero_division=0)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [41]:
# Step 8: Output the results
# Accuracy is printed to two decimals; the confusion matrix and the
# classification report are each printed after their heading.
print(f"Model Accuracy: {accuracy:.2f}")
for section_heading, section_body in (
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
):
    print(section_heading, section_body)


Model Accuracy: 0.55
Confusion Matrix:
 [[8 6]
 [3 3]]
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.57      0.64        14
           1       0.33      0.50      0.40         6

    accuracy                           0.55        20
   macro avg       0.53      0.54      0.52        20
weighted avg       0.61      0.55      0.57        20

