In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

class ChurnPredictor:
    """Random-forest churn classifier wrapped around a customer DataFrame.

    The frame handed to the constructor must provide the columns ``Tenure``,
    ``Contract_Type``, ``Monthly_Charges`` and ``Churn_Flag``; these are the
    only columns this class reads.
    """

    # Ordinal codes substituted for the textual ``Contract_Type`` values.
    CONTRACT_TYPE_CODES = {'Month-to-Month': 0, 'One-Year': 1, 'Two-Year': 2}
    # Model inputs, in the order they are passed to the estimator.
    FEATURE_COLUMNS = ['Tenure', 'Contract_Type', 'Monthly_Charges']
    # Label column (0 = No Churn, 1 = Churn).
    TARGET_COLUMN = 'Churn_Flag'

    def __init__(self, data):
        """Store ``data`` and create an (unfitted) random-forest model.

        Args:
            data: pandas DataFrame holding the customer records.
        """
        self.data = data
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)

    def preprocess_data(self):
        """Build the feature matrix and target vector from ``self.data``.

        ``Contract_Type`` is encoded with ``CONTRACT_TYPE_CODES`` on a new
        frame, so ``self.data`` (and therefore the caller's DataFrame) is left
        untouched and the method can be called repeatedly with the same
        result. Values that are not keys of the mapping become NaN.

        Returns:
            tuple: ``(X, y)`` where ``X`` has the ``FEATURE_COLUMNS`` columns
            and ``y`` is the ``Churn_Flag`` column.
        """
        # ``assign`` returns a new frame and keeps the column position, so the
        # raw strings in ``self.data`` are never overwritten.
        X = self.data[self.FEATURE_COLUMNS].assign(
            Contract_Type=self.data['Contract_Type'].map(self.CONTRACT_TYPE_CODES)
        )
        y = self.data[self.TARGET_COLUMN]
        return X, y

    def split_data(self, X, y, test_size=0.3, random_state=42):
        """Split features and target into train and test partitions.

        Args:
            X: feature matrix.
            y: target vector.
            test_size: forwarded to ``train_test_split`` (default 0.3).
            random_state: seed forwarded to ``train_test_split`` (default 42).

        Returns:
            The ``train_test_split`` result, unpacked by callers as
            ``X_train, X_test, y_train, y_test``.
        """
        return train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

    def train_model(self, X_train, y_train):
        """Fit ``self.model`` on the training data."""
        self.model.fit(X_train, y_train)

    def predict_churn(self, X_test):
        """Return ``self.model.predict`` for ``X_test``."""
        return self.model.predict(X_test)

    def evaluate_performance(self, y_test, y_pred):
        """Score predictions against the true labels.

        Returns:
            tuple: ``(accuracy, precision, recall)``.
        """
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        return accuracy, precision, recall

    def calculate_churn_probability(self, X_test):
        """Return column 1 of ``self.model.predict_proba(X_test)``.

        NOTE(review): with 0/1 labels this column is presumably the churn
        class; confirm against ``self.model.classes_``.
        """
        return self.model.predict_proba(X_test)[:, 1]

    def calculate_retention_rate(self):
        """Return the fraction of rows in ``self.data`` that did not churn.

        Computed as ``(rows - sum(Churn_Flag)) / rows``. An empty frame has no
        defined rate, so NaN is returned instead of dividing by zero.
        """
        total_customers = len(self.data)
        if total_customers == 0:
            return float('nan')
        churned_customers = self.data[self.TARGET_COLUMN].sum()
        return (total_customers - churned_customers) / total_customers

# Main function to demonstrate the usage of the ChurnPredictor class
if __name__ == "__main__":
    # Load the dataset (replace with the path to your dataset).
    # Forward slashes throughout: the previous "kanak\customer" relied on "\c"
    # being an invalid escape sequence that Python happens to keep literally.
    data = pd.read_csv('C:/Users/kanak/customer churn prediction/data/customer_churn_dataset.csv')

    # Initialize the ChurnPredictor
    churn_predictor = ChurnPredictor(data)

    # Preprocess the data
    X, y = churn_predictor.preprocess_data()

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = churn_predictor.split_data(X, y)

    # Train the model
    churn_predictor.train_model(X_train, y_train)

    # Predict churn for the test set
    y_pred = churn_predictor.predict_churn(X_test)

    # Evaluate the model's performance
    accuracy, precision, recall = churn_predictor.evaluate_performance(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

    # Calculate churn probabilities
    churn_probabilities = churn_predictor.calculate_churn_probability(X_test)
    print(f"Churn Probabilities: {churn_probabilities}")

    # Calculate the customer retention rate
    retention_rate = churn_predictor.calculate_retention_rate()
    print(f"Customer Retention Rate: {retention_rate:.2f}")


Accuracy: 0.50, Precision: 0.64, Recall: 0.39
Churn Probabilities: [0.59 0.25 0.29 0.25 0.85 0.85 0.09 0.78 0.11 0.09 0.43 0.28 0.17 0.3
 0.65 0.36 0.06 0.28 0.76 0.72 0.35 0.65 0.45 0.39 0.58 0.4  0.07 0.96
 0.68 0.25]
Customer Retention Rate: 0.50


In [3]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pandas as pd

# Step 1: Load Dataset
def load_data(file_path):
    """Read a CSV into a pandas DataFrame.

    Args:
        file_path: anything ``pd.read_csv`` accepts as its first argument.

    Returns:
        The DataFrame produced by ``pd.read_csv(file_path)``.
    """
    frame = pd.read_csv(file_path)
    return frame

# Step 2: Preprocess Data
def preprocess_data(data):
    """Build the feature matrix and target vector from ``data``.

    ``Contract_Type`` is converted to numeric codes on a new frame; ``data``
    itself is not modified, so the function can be re-run on the same frame
    and always yields the same encoding. Values that are not keys of the
    mapping become NaN.

    Args:
        data: pandas DataFrame with the columns ``Tenure``, ``Contract_Type``,
            ``Monthly_Charges`` and ``Churn_Flag``.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds ``Tenure``, the encoded
        ``Contract_Type`` and ``Monthly_Charges``, and ``y`` is the
        ``Churn_Flag`` column.
    """
    # Convert Contract_Type into numerical values
    contract_codes = {'Month-to-Month': 0, 'One-Year': 1, 'Two-Year': 2}

    # Features and target selection. ``assign`` returns a new frame and keeps
    # the column order, so the caller's ``data`` keeps its original strings.
    X = data[['Tenure', 'Contract_Type', 'Monthly_Charges']].assign(
        Contract_Type=data['Contract_Type'].map(contract_codes)
    )
    y = data['Churn_Flag']

    return X, y

# Step 3: Split Data into Train and Test
def split_data(X, y, test_size=0.3, random_state=42):
    """Split features and target into train and test partitions.

    Args:
        X: feature matrix.
        y: target vector.
        test_size: forwarded to ``train_test_split`` (default 0.3).
        random_state: seed forwarded to ``train_test_split`` (default 42).

    Returns:
        The ``train_test_split`` result, unpacked by callers as
        ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

# Step 4: Use TPOT to Train and Evaluate Models
def train_with_tpot(X_train, X_test, y_train, y_test,
                    generations=5, population_size=20, random_state=42,
                    export_path='best_model_pipeline.py'):
    """Fit a ``TPOTClassifier`` and report its test-set metrics.

    Args:
        X_train, y_train: data the TPOT search is fitted on.
        X_test, y_test: held-out data used for the reported metrics.
        generations: forwarded to ``TPOTClassifier`` (default 5).
        population_size: forwarded to ``TPOTClassifier`` (default 20).
        random_state: seed forwarded to ``TPOTClassifier`` (default 42).
        export_path: file the best pipeline's code is exported to; pass
            ``None`` to skip the export.

    Returns:
        tuple: ``(accuracy, precision, recall)`` on the test set.

    Side effects:
        Prints the fitted pipeline and the metrics, and writes
        ``export_path`` unless it is ``None``.
    """
    tpot = TPOTClassifier(
        verbosity=2,
        generations=generations,
        population_size=population_size,
        random_state=random_state,
    )
    tpot.fit(X_train, y_train)

    # Evaluate the model on the test set
    y_pred = tpot.predict(X_test)

    # Calculate accuracy, precision, and recall
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"Best pipeline: {tpot.fitted_pipeline_}")
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

    # Export the best model pipeline code to a Python file, unless disabled
    if export_path is not None:
        tpot.export(export_path)

    return accuracy, precision, recall

# Main function to run the AutoML process
if __name__ == "__main__":
    # Load the dataset (replace with the path to your dataset).
    # Goes through load_data so the loading step defined above is the single
    # place that reads the CSV.
    data = load_data('C:/Users/kanak/customer churn prediction/data/customer_churn_dataset.csv')

    # Preprocess the data
    X, y = preprocess_data(data)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Use AutoML (TPOT) to train the best model and evaluate its performance
    accuracy, precision, recall = train_with_tpot(X_train, X_test, y_train, y_test)

    print(f"Final Model Performance - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")


                                                                             
Generation 1 - Current best internal CV score: 0.6142857142857143
                                                                             
Generation 2 - Current best internal CV score: 0.6142857142857143
                                                                             
Generation 3 - Current best internal CV score: 0.6428571428571428
                                                                             
Generation 4 - Current best internal CV score: 0.6428571428571428
                                                                              
Generation 5 - Current best internal CV score: 0.6571428571428573
                                                                              
Best pipeline: XGBClassifier(MaxAbsScaler(input_matrix), learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=100, n_jobs=1, subsample=0.35000000000000003, verbosity=0)
Best pipeline: Pipe