In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Location of the churn CSV. The value below is a placeholder and must be
# replaced with a real path before this cell can run.
filepath = "your dataset path...!"

# Read the CSV into the working DataFrame and preview the first five rows
# (five is DataFrame.head's default, spelled out here for clarity).
df = pd.read_csv(filepath)
print(df.head(n=5))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Encode the two binary categorical columns as 0/1 so the models can consume them.
binary_encodings = {
    "ContractType": {"Monthly": 0, "Yearly": 1},
    "HasPremiumSupport": {"No": 0, "Yes": 1},
}

for column, encoding in binary_encodings.items():
    present = df[column].dropna()

    # Re-running this cell on an already-encoded column must be a no-op:
    # a second .map() pass would turn every 0/1 into NaN.
    if present.isin(list(encoding.values())).all():
        continue

    # Series.map() silently yields NaN for labels missing from the dict, so
    # report unexpected labels here instead of failing later inside model.fit().
    unexpected = present[~present.isin(list(encoding))].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unexpected values in {column!r}: {list(unexpected)}")

    df[column] = df[column].map(encoding)

# Verifying changes
print(df.head())

In [None]:
from scipy.stats import zscore

# Standardize each continuous column in place with scipy's z-score
# (subtract the column mean, divide by the column standard deviation).
# NOTE(review): the statistics are computed over the full dataset, i.e. before
# the train/test split performed later in the notebook — confirm this is intended.
for numeric_column in ("SubscriptionLength", "MonthlySpending", "TotalSpent"):
    df[numeric_column] = zscore(df[numeric_column])

# Summary statistics to sanity-check the scaled columns.
print(df.describe())

In [None]:
from sklearn.model_selection import train_test_split

# Target is the churn label; features are every column except the
# customer identifier and the label itself.
y = df["Churn"]
X = df.drop(["CustomerID", "Churn"], axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each feature partition.
print("Training Data :", X_train.shape)
print("Testing Data :", X_test.shape)

In [None]:
# Fit a logistic-regression baseline with scikit-learn's default settings
# (fit() returns the estimator itself, so construction and fitting are chained).
log_model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred_log = log_model.predict(X_test)

# Score the predictions against the true labels, one metric per variable.
log_accuracy, log_precision, log_recall, log_f1 = (
    metric(y_test, y_pred_log)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print each metric with two decimals.
print("Logistic Regression Result :")
print(f"Accuracy: {log_accuracy: .2f}")
print(f"Precision: {log_precision: .2f}")
print(f"Recall: {log_recall: .2f}")
print(f"F1 Score: {log_f1: .2f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a baseline random forest (100 trees; fixed seed for reproducibility).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred_rf = rf_model.predict(X_test)

# Score the predictions against the true labels.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)

# Print the metrics. The header previously read "Logistic Regression Result :",
# which mislabelled these random-forest numbers.
print("Random Forest Result :")
print(f"Accuracy: {rf_accuracy: .2f}")
print(f"Precision: {rf_precision: .2f}")
print(f"Recall: {rf_recall: .2f}")
print(f"F1 Score: {rf_f1: .2f}")

In [None]:
# Build a per-feature table of the logistic-regression coefficients.
# coef_[0] is the single row of signed weights, one per column of X_train
# (assumes a binary target, so coef_ has exactly one row — TODO confirm).
log_coeffs = pd.DataFrame({"Feature": X_train.columns, "Importance": log_model.coef_[0]})

# Rank by magnitude rather than signed value: a strongly negative coefficient
# is as influential as a strongly positive one, and a plain descending sort
# would push it to the bottom of an "importance" table. The sign is kept in
# the "Importance" column so the direction of each effect stays visible.
log_coeffs = log_coeffs.sort_values(by="Importance", key=abs, ascending=False)

# Displaying feature importance
print("Logistic Regression Feature Importance :")
print(log_coeffs)

In [None]:
# Collect the random forest's per-feature importances, largest first.
rf_importance = (
    pd.DataFrame({"Feature": X_train.columns, "Importance": rf_model.feature_importances_})
    .sort_values(by="Importance", ascending=False)
)

# Horizontal bar chart: one bar per feature, coloured per feature, no legend.
plt.figure(figsize=(8, 5))
sns.barplot(
    data=rf_importance,
    x="Importance",
    y="Feature",
    hue="Feature",
    palette="coolwarm",
    legend=False,
)
plt.title("Random Forest Feature Importance")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.savefig("importance_score_vs_feature.png", dpi=300, bbox_inches="tight") # Saves as PNG with high resolution
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values; the grid search evaluates every combination.
param_grid = dict(
    n_estimators=[50, 100, 200],     # number of trees
    max_depth=[None, 10, 20],        # maximum depth of each tree (None = unlimited)
    min_samples_split=[2, 5, 10],    # minimum samples required to split a node
    min_samples_leaf=[1, 2, 4],      # minimum samples required at a leaf
)

# 5-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Best HyperParameters: ", grid_search.best_params_)

In [None]:
# Rebuild a forest from the four hyperparameters selected by the grid search,
# keeping the same fixed seed used elsewhere in the notebook.
best_rf_model = RandomForestClassifier(
    random_state=42,
    **{
        name: grid_search.best_params_[name]
        for name in ("n_estimators", "max_depth", "min_samples_split", "min_samples_leaf")
    },
)

# Fit on the training partition, then predict the held-out rows
# (fit() returns the estimator, so the two calls are chained).
y_pred_best_rf = best_rf_model.fit(X_train, y_train).predict(X_test)

In [None]:
# Score the tuned random forest on the held-out rows.
# NOTE: these assignments overwrite the rf_* metrics computed for the baseline
# forest earlier in the notebook; the names are kept for compatibility.
rf_accuracy = accuracy_score(y_test, y_pred_best_rf)
rf_precision = precision_score(y_test, y_pred_best_rf)
rf_recall = recall_score(y_test, y_pred_best_rf)
rf_f1 = f1_score(y_test, y_pred_best_rf)

# Print the metrics. The header previously read "Logistic Regression Result :",
# which mislabelled these tuned random-forest numbers.
print("Tuned Random Forest Result :")
print(f"Accuracy: {rf_accuracy: .2f}")
print(f"Precision: {rf_precision: .2f}")
print(f"Recall: {rf_recall: .2f}")
print(f"F1 Score: {rf_f1: .2f}")

In [None]:
import joblib

# Persist the tuned random forest to disk so it can be reloaded without retraining.
joblib.dump(value=best_rf_model, filename="random_forest_churn_model.pkl")
print("Model Saved Successfully")

In [None]:
# Load the saved model (only load pickle files you created yourself:
# unpickling untrusted files can execute arbitrary code).
loaded_model = joblib.load("random_forest_churn_model.pkl")

# Feature names, in the column order used for the example rows
# (presumably the same order as X_train — verify against the dataset).
feature_names = ["SubscriptionLength", "MonthlySpending", "TotalSpent", "ContractType", "HasPremiumSupport"]

# Example customer, expressed in raw (unscaled) units.
new_customer = pd.DataFrame([[24, 80.5, 1932.0, 0, 1]], columns=feature_names)

# The model was trained on z-scored numeric columns, so raw values must be
# standardized with the same statistics before predicting; otherwise they fall
# far outside the range the trees were fitted on. `df` already holds scaled
# values, so the mean/std are recomputed from the source CSV. ddof=0 matches
# the default of scipy.stats.zscore used during preprocessing.
numeric_features = ["SubscriptionLength", "MonthlySpending", "TotalSpent"]
raw_numeric = pd.read_csv(filepath)[numeric_features]
new_customer[numeric_features] = (
    new_customer[numeric_features] - raw_numeric.mean()
) / raw_numeric.std(ddof=0)

# Predicting churn
prediction = loaded_model.predict(new_customer)

print("Churn Prediction: ", "Churned (1)" if prediction[0] == 1 else "Active (0)")

In [None]:
# Second example customer, expressed in raw (unscaled) units and using the
# column order given by feature_names.
new_customer = pd.DataFrame([[48, 80.5, 2932.0, 1, 0]], columns=feature_names)

# The model was trained on z-scored numeric columns, so the raw values are
# standardized with the training-data mean/std before predicting. `df` already
# holds scaled values, so the statistics are recomputed from the source CSV.
# ddof=0 matches the default of scipy.stats.zscore used during preprocessing.
numeric_features = ["SubscriptionLength", "MonthlySpending", "TotalSpent"]
raw_numeric = pd.read_csv(filepath)[numeric_features]
new_customer[numeric_features] = (
    new_customer[numeric_features] - raw_numeric.mean()
) / raw_numeric.std(ddof=0)

# Predicting churn
prediction = loaded_model.predict(new_customer)

print("Churn Prediction: ", "Churned (1)" if prediction[0] == 1 else "Active (0)")