In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Read the raw churn workbook into a DataFrame. The location is an absolute
# path under /content, so it only resolves where that directory exists.
dataset_path = '/content/customer_churn_large_dataset.xlsx'
data = pd.read_excel(dataset_path)

In [87]:
# Display count / mean / std / min / quartiles / max for the frame's columns
# as a quick sanity check of value ranges before any preprocessing.
data.describe()

Unnamed: 0,CustomerID,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,50000.5,44.02702,12.4901,65.053197,274.39365,0.49779
std,28867.657797,15.280283,6.926461,20.230696,130.463063,0.499998
min,1.0,18.0,1.0,30.0,50.0,0.0
25%,25000.75,31.0,6.0,47.54,161.0,0.0
50%,50000.5,44.0,12.0,65.01,274.0,0.0
75%,75000.25,57.0,19.0,82.64,387.0,1.0
max,100000.0,70.0,24.0,100.0,500.0,1.0


In [88]:
# Remove the columns that are not used as model inputs.
# NOTE(review): this cell rebinds `data`, so running it a second time on the
# already-reduced frame raises KeyError (the columns no longer exist).
unused_columns = ['CustomerID', 'Name', 'Age', 'Location']
data = data.drop(columns=unused_columns)

# Replace the Gender labels with integer codes. The fitted encoder is kept at
# module level because predict_churn() reuses it for new records.
label_encoder = LabelEncoder().fit(data['Gender'])
data['Gender'] = label_encoder.transform(data['Gender'])

In [89]:
# Count missing values per remaining column.
data.isna().sum()

Gender                        0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

In [90]:
# Standardize the two continuous features, one StandardScaler per column.
# Both fitted scalers stay at module level because predict_churn() applies
# them to new records.
# NOTE(review): the scalers are fitted on the full frame, before the
# train/test split done later in the notebook, so test rows contribute to the
# learned mean/std.
scaler = StandardScaler()
usage_column = data['Total_Usage_GB'].to_numpy().reshape(-1, 1)
data['Total_Usage_GB'] = scaler.fit_transform(usage_column)

scaler2 = StandardScaler()
bill_column = data['Monthly_Bill'].to_numpy().reshape(-1, 1)
data['Monthly_Bill'] = scaler2.fit_transform(bill_column)

In [91]:
# Separate the target from the features, then hold out 20% of the rows for
# evaluation. The fixed random_state keeps the split reproducible.
y = data['Churn']
X = data.drop(columns=['Churn'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [92]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier on the training split (fit() returns
# the estimator itself), then predict labels for the held-out rows.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [93]:
# Score the held-out predictions with each scalar metric in turn.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
confusion = confusion_matrix(y_test, y_pred)

# NOTE(review): the recorded run below reports accuracy of about 0.50 on a
# target whose mean is about 0.50, i.e. no better than chance.
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)
print("Confusion Matrix:")
print(confusion)

Accuracy: 0.502
Precision: 0.4960341671751068
Recall: 0.24584215300876927
F1 Score: 0.32875050545895673
Confusion Matrix:
[[7601 2478]
 [7482 2439]]


In [96]:
import joblib
# Persist the fitted classifier to the working directory. Only the model is
# written; the label encoder and scalers are not part of this file.
joblib.dump(value=model, filename='churn_prediction_model.pkl')

def predict_churn(new_data):
    """Predict churn labels for raw (un-encoded, un-scaled) customer records.

    The saved classifier is loaded from 'churn_prediction_model.pkl' and the
    module-level ``label_encoder``, ``scaler`` and ``scaler2`` fitted earlier
    in the notebook are applied to the input before prediction.

    Parameters
    ----------
    new_data : pandas.DataFrame
        One row per customer with the columns 'Gender',
        'Subscription_Length_Months', 'Monthly_Bill' and 'Total_Usage_GB'.
        'Gender' must hold labels the encoder saw during fitting. The frame
        is not modified.

    Returns
    -------
    The output of the loaded model's ``predict`` call: one predicted label
    per input row.

    Raises
    ------
    ValueError
        If 'Gender' contains a label the encoder was not fitted on.
    KeyError
        If a required column is missing from ``new_data``.
    """
    loaded_model = joblib.load('churn_prediction_model.pkl')

    # Transform a copy. Writing the encoded/scaled values back into the
    # caller's frame made a second call on the same frame fail, because the
    # encoder then received integer codes instead of the original labels.
    features = new_data.copy()

    features['Gender'] = label_encoder.transform(features['Gender'])
    # The scalers return an (n, 1) array; flatten it before storing it as a
    # single column.
    features['Total_Usage_GB'] = scaler.transform(
        features['Total_Usage_GB'].to_numpy().reshape(-1, 1)
    ).ravel()
    features['Monthly_Bill'] = scaler2.transform(
        features['Monthly_Bill'].to_numpy().reshape(-1, 1)
    ).ravel()

    # Present the columns in the order recorded at fit time, when the
    # estimator exposes it, so the result does not depend on how the caller
    # happened to order the input columns.
    expected_columns = getattr(loaded_model, 'feature_names_in_', None)
    if expected_columns is not None:
        features = features[list(expected_columns)]

    return loaded_model.predict(features)

# Example: score one hand-written customer record.
# NOTE(review): Monthly_Bill of 10.0 is below the minimum of 30.0 shown by
# data.describe() for the training data, so this input is outside the range
# the model was fitted on.
example_record = {
    'Gender': 'Female',
    'Subscription_Length_Months': 12,
    'Monthly_Bill': 10.0,
    'Total_Usage_GB': 300.0,
}
new_customer_data = pd.DataFrame([example_record])

new_data_predictions = predict_churn(new_customer_data)
print("New Customer Churn Prediction:", new_data_predictions)

New Customer Churn Prediction: [1]
