In [7]:
# Importing Libraries
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import pickle
import os

# Define paths
# The project root defaults to the original development location; set the
# CHURN_PROJECT_ROOT environment variable to run this notebook elsewhere.
project_root = os.environ.get('CHURN_PROJECT_ROOT', '/home/shoaib/Code/Churn_pred')
data_path = os.path.join(project_root, 'app', 'data', 'Cleaned_Telecom_Dataset_New.csv')
models_dir = os.path.join(project_root, 'app', 'models')
# All pickles produced by this cell are written into models_dir, so make sure
# it exists before the first dump.
os.makedirs(models_dir, exist_ok=True)

# Reading csv
df = pd.read_csv(data_path)
# 'Unnamed: 0' is presumably a leftover index column from an earlier export
# (TODO confirm). errors='ignore' keeps the cell runnable when it is absent.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Feature Selection (Do this before scaling)
X = df.drop('Churn', axis=1)
y = df['Churn']

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Feature Scaling (Fit on X_train only) ---
# Fitting on the training split only keeps test-set statistics out of the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save the fitted scaler
# The context manager guarantees the file is flushed and closed once the dump
# finishes, instead of leaving an open handle for the garbage collector.
scaler_path = os.path.join(models_dir, 'scaler.pkl')
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# --- SMOTEENN ---
sm = SMOTEENN(smote=SMOTE(random_state=42))
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)  # Fit on scaled training data
# NOTE(review): Xr_test below is a slice of the *resampled* training data, so
# metrics computed on it are not comparable to metrics on the untouched X_test.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# --- Decision Tree Classifier ---
# Trains the same tree configuration twice: once on the scaled training split
# and once on the SMOTEENN-resampled split, reporting and pickling each model.

# Before SMOTEENN
model_dt = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=6, min_samples_leaf=8)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)
print("\nDecision Tree Classifier - Before SMOTEENN:")
print(f"Accuracy: {model_dt.score(X_test, y_test)}")
print(classification_report(y_test, y_pred_dt))
dt_model_path = os.path.join(models_dir, 'decision_tree_model.pkl')
# Use a context manager so the pickle file is closed (and fully flushed) right away.
with open(dt_model_path, 'wb') as model_file:
    pickle.dump(model_dt, model_file)

# After SMOTEENN
# NOTE(review): Xr_test is carved out of the SMOTEENN-resampled training data,
# so the scores below are measured on resampled samples and are not directly
# comparable to the X_test scores above; consider also scoring on X_test.
model_dt_smote = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=6, min_samples_leaf=8)
model_dt_smote.fit(Xr_train, yr_train)
yr_pred_dt = model_dt_smote.predict(Xr_test)
print("\nDecision Tree Classifier - After SMOTEENN:")
print(f"Accuracy: {model_dt_smote.score(Xr_test, yr_test)}")
print(classification_report(yr_test, yr_pred_dt))
dt_smote_model_path = os.path.join(models_dir, 'decision_tree_smote_model.pkl')
with open(dt_smote_model_path, 'wb') as model_file:
    pickle.dump(model_dt_smote, model_file)

# --- Random Forest Classifier ---
# Trains a 100-tree forest on the scaled training split and again on the
# SMOTEENN-resampled split, reporting and pickling each model.

# Before SMOTEENN
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
print("\nRandom Forest Classifier - Before SMOTEENN:")
print(f"Accuracy: {model_rf.score(X_test, y_test)}")
print(classification_report(y_test, y_pred_rf))
rf_model_path = os.path.join(models_dir, 'random_forest_model.pkl')
# Use a context manager so the pickle file is closed (and fully flushed) right away.
with open(rf_model_path, 'wb') as model_file:
    pickle.dump(model_rf, model_file)

# After SMOTEENN
# NOTE(review): Xr_test is carved out of the SMOTEENN-resampled training data,
# so the scores below are measured on resampled samples and are not directly
# comparable to the X_test scores above; consider also scoring on X_test.
model_rf_smote = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf_smote.fit(Xr_train, yr_train)
yr_pred_rf = model_rf_smote.predict(Xr_test)
print("\nRandom Forest Classifier - After SMOTEENN:")
print(f"Accuracy: {model_rf_smote.score(Xr_test, yr_test)}")
print(classification_report(yr_test, yr_pred_rf))
rf_smote_model_path = os.path.join(models_dir, 'random_forest_smote_model.pkl')
with open(rf_smote_model_path, 'wb') as model_file:
    pickle.dump(model_rf_smote, model_file)

# --- Logistic Regression ---
# Fits a default logistic regression on the scaled training split and again on
# the SMOTEENN-resampled split, reporting and pickling each model.

# Before SMOTEENN
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)
print("\nLogistic Regression - Before SMOTEENN:")
print(f"Accuracy: {model_lr.score(X_test, y_test)}")
print(classification_report(y_test, y_pred_lr))
lr_model_path = os.path.join(models_dir, 'logistic_regression_model.pkl')
# Use a context manager so the pickle file is closed (and fully flushed) right away.
with open(lr_model_path, 'wb') as model_file:
    pickle.dump(model_lr, model_file)

# After SMOTEENN
# NOTE(review): Xr_test is carved out of the SMOTEENN-resampled training data,
# so the scores below are measured on resampled samples and are not directly
# comparable to the X_test scores above; consider also scoring on X_test.
model_lr_smote = LogisticRegression(random_state=42)
model_lr_smote.fit(Xr_train, yr_train)
yr_pred_lr = model_lr_smote.predict(Xr_test)
print("\nLogistic Regression - After SMOTEENN:")
print(f"Accuracy: {model_lr_smote.score(Xr_test, yr_test)}")
print(classification_report(yr_test, yr_pred_lr))
lr_smote_model_path = os.path.join(models_dir, 'logistic_regression_smote_model.pkl')
with open(lr_smote_model_path, 'wb') as model_file:
    pickle.dump(model_lr_smote, model_file)

# --- K-Nearest Neighbors Classifier ---
# Fits a 5-neighbour classifier on the scaled training split and again on the
# SMOTEENN-resampled split, reporting and pickling each model.

# Before SMOTEENN
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)
print("\nK-Nearest Neighbors Classifier - Before SMOTEENN:")
print(f"Accuracy: {model_knn.score(X_test, y_test)}")
print(classification_report(y_test, y_pred_knn))
knn_model_path = os.path.join(models_dir, 'knn_model.pkl')
# Use a context manager so the pickle file is closed (and fully flushed) right away.
with open(knn_model_path, 'wb') as model_file:
    pickle.dump(model_knn, model_file)

# After SMOTEENN
# NOTE(review): Xr_test is carved out of the SMOTEENN-resampled training data,
# so the scores below are measured on resampled samples and are not directly
# comparable to the X_test scores above; consider also scoring on X_test.
model_knn_smote = KNeighborsClassifier(n_neighbors=5)
model_knn_smote.fit(Xr_train, yr_train)
yr_pred_knn = model_knn_smote.predict(Xr_test)
print("\nK-Nearest Neighbors Classifier - After SMOTEENN:")
print(f"Accuracy: {model_knn_smote.score(Xr_test, yr_test)}")
print(classification_report(yr_test, yr_pred_knn))
knn_smote_model_path = os.path.join(models_dir, 'knn_smote_model.pkl')
with open(knn_smote_model_path, 'wb') as model_file:
    pickle.dump(model_knn_smote, model_file)

print(f"\nAll models and scaler saved to: {models_dir}")


Decision Tree Classifier - Before SMOTEENN:
Accuracy: 0.7668798862828714
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1033
           1       0.56      0.59      0.57       374

    accuracy                           0.77      1407
   macro avg       0.70      0.71      0.71      1407
weighted avg       0.77      0.77      0.77      1407


Decision Tree Classifier - After SMOTEENN:
Accuracy: 0.9260326609029779
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       468
           1       0.93      0.93      0.93       573

    accuracy                           0.93      1041
   macro avg       0.93      0.93      0.93      1041
weighted avg       0.93      0.93      0.93      1041


Random Forest Classifier - Before SMOTEENN:
Accuracy: 0.7711442786069652
              precision    recall  f1-score   support

           0       0.82      0.88      0.85      1033
           1    