In [4]:
# Install necessary packages
!pip install xgboost joblib

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Loading the dataset from local file
data = pd.read_csv('data/telco_customer_churn.csv')

# Data preprocessing
data = data.drop(['customerID'], axis=1)
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].fillna(data['TotalCharges'].mean(), inplace=True)

# Encoding the categorical variables
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Scaling numerical features
scaler = StandardScaler()
numerical_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Feature engineering
data['Contract'] = data['Contract'] * data['tenure']
data.drop(['PhoneService', 'MultipleLines'], axis=1, inplace=True)

# Split data into training and test data
X = data.drop('Churn', axis=1)
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier()
}

results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred)
    }

results_df = pd.DataFrame(results).T
print(results_df)

# Saving the best model in pkl file
best_model = GradientBoostingClassifier()
best_model.fit(X_train, y_train)
joblib.dump(best_model, 'models/churn_model.pkl')



                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.818311   0.698305  0.552279  0.616766
Random Forest        0.791341   0.644689  0.471850  0.544892
Gradient Boosting    0.808375   0.674576  0.533512  0.595808
XGBoost              0.797019   0.640777  0.530831  0.580645


['models/churn_model.pkl']