In [15]:
import pandas as pd

#load and print first 5 values of dataset
df=pd.read_csv("Customer-Churn-Records.csv")#in the same folder so no need for path
#print(df.head())

#check for missing values in each column as that would cause problems
#print(df.isnull().sum())

#If some data is missing, we may need to fill it with averages or remove those rows.

#selecting features
df = df[['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'IsActiveMember', 'Exited']]


#split data into training and testing
from sklearn.model_selection import train_test_split  
# X represents features (values that predict) and Y represents target(value to be predicted)
X = df.drop(columns=['Exited'])  # All columns except 'Exited'
y = df['Exited']  # The column we want to predict  
#we split into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
#random_state=42 is used to make sure the train-test split is always the same every time you run the code.

#we use logistic regression
# this is because it is simple and interpretable, also works well for binary classification

from sklearn.linear_model import LogisticRegression  

# Train Logistic Regression model  
model = LogisticRegression()  
model.fit(X_train, y_train)  

print("Model Coefficients:", model.coef_)  
print("Intercept:", model.intercept_)

#output : Model Coefficients: [[-5.14805701e-03  4.26168869e-02  3.59716088e-06 -4.31534741e-04 -1.36049989e-03]]
#Age (0.0426) has the biggest positive effect → Older customers have a higher chance of churning.
#Credit Score (-0.0051) has a small negative effect → Higher credit scores slightly reduce churn, but not significantly.
#Balance (0.000003) is basically irrelevant because the coefficient is very close to zero.

#The intercept captures the baseline probability of churn when all chosen features are zero.
#If the intercept is high, it suggests that even without considering the features, there's already a high likelihood of churn.
#If the intercept is low, it suggests that churn is mostly driven by the chosen features.

#predicting and checking accuracy
# Predict churn for the first 10 customers in test data
y_pred = model.predict(X_test)  

# Print actual vs predicted values for first 10 customers
print("Actual values:   ", y_test[:10].values)  
print("Predicted values:", y_pred[:10])
from sklearn.metrics import accuracy_score  
accuracy = accuracy_score(y_test, y_pred)  
print(f"Model Accuracy: {accuracy * 100:.2f}%") 


#simulating new customer
# Define column names based on training data
feature_names = ["CreditScore", "Age", "Balance", "NumOfProducts", "IsActiveMember"]

# Take user input for each feature
new_customer = []
for feature in feature_names:
    value = float(input(f"Enter {feature}: "))  # Convert input to float
    new_customer.append(value)

# Convert to DataFrame with correct column names
new_customer_df = pd.DataFrame([new_customer], columns=feature_names)

# Predict churn (0 = stays, 1 = leaves)
prediction = model.predict(new_customer_df)
print("Will the customer churn?", "Yes" if prediction[0] == 1 else "No")

Model Coefficients: [[-5.14805701e-03  4.26168869e-02  3.59716088e-06 -4.31534741e-04
  -1.36049989e-03]]
Intercept: [-0.00017491]
Actual values:    [0 0 0 0 0 0 0 1 0 0]
Predicted values: [0 0 0 1 0 0 0 0 0 0]
Model Accuracy: 79.80%
Enter CreditScore: 1000
Enter Age: 40
Enter Balance: 12000
Enter NumOfProducts: 3
Enter IsActiveMember: 0
Will the customer churn? No
