DATA PREPROCESSING

In [2]:
import pandas as pd

# Load the customer churn dataset from the Excel export.
data = pd.read_excel('/content/customer_churn_large_dataset.xlsx')

# Initial data exploration
print(data.head())  # View the first few rows
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" after the report).
data.info()  # Check data types and missing values
print(data.describe())  # Summary statistics



   CustomerID        Name  Age  Gender     Location  \
0           1  Customer_1   63    Male  Los Angeles   
1           2  Customer_2   62  Female     New York   
2           3  Customer_3   24  Female  Los Angeles   
3           4  Customer_4   36  Female        Miami   
4           5  Customer_5   46  Female        Miami   

   Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn  
0                          17         73.36             236      0  
1                           1         48.76             172      0  
2                           5         85.47             460      0  
3                           3         97.94             297      1  
4                          19         58.14             266      0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID               

In [3]:
# Check for missing data: tally the rows by their per-column "is missing" pattern.
# A single all-False pattern means no column contains a missing value.
data.isna().value_counts()

CustomerID  Name   Age    Gender  Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
False       False  False  False   False     False                       False         False           False    100000
dtype: int64

In [6]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state keeps the split identical between runs.
from sklearn.model_selection import train_test_split

y = data['Churn']                 # target column
X = data.drop(columns='Churn')    # every other column is a candidate feature
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Feature scaling: standardise the numeric feature columns.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
non_numeric_columns = ['Name','Gender','Location']

# X_train / X_test are rebound to scaled arrays at the end of this cell, so
# reading them here would fail with AttributeError when the cell is re-run.
# Rebuild the numeric frames from the untouched feature frame X instead,
# selecting rows through the index labels of y_train / y_test, which
# train_test_split keeps aligned with X_train / X_test. (This relies on the
# frame's index labels being unique.)
X_train_numeric = X.loc[y_train.index].drop(columns=non_numeric_columns)
X_test_numeric = X.loc[y_test.index].drop(columns=non_numeric_columns)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows so no test-set statistics leak into training.
X_train = scaler.fit_transform(X_train_numeric)
X_test = scaler.transform(X_test_numeric)

In [9]:
# Train a panel of baseline classifiers and compare them on the held-out test set.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb

# Seed passed to every estimator below that accepts a random_state and uses
# randomness during fitting, so the reported metrics are reproducible.
SEED = 42

# Maps model name -> dict of test-set metrics.
results = {}

# Initialize models (the remaining estimators are left at their defaults).
models = {
    'Random Forest' : RandomForestClassifier(random_state=SEED),
    "logistic": LogisticRegression(),
    'MLP':MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=SEED),
    'Decision Tree': DecisionTreeClassifier(random_state=SEED),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'XGBoost': xgb.XGBClassifier(random_state=SEED)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # zero_division=0 keeps the score at 0 (without a warning) if a model
    # never predicts the positive class.
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1 Score': f1_score(y_test, y_pred, zero_division=0),
    }

# Evaluate the model's performance
print(results)


{'Random Forest': {'Accuracy': 0.4978, 'Precision': 0.4935716525556601, 'Recall': 0.4759600846688842, 'F1 Score': 0.4846059113300493}, 'logistic': {'Accuracy': 0.4994, 'Precision': 0.4928335170893054, 'Recall': 0.3153915935893559, 'F1 Score': 0.3846342962507683}, 'MLP': {'Accuracy': 0.50305, 'Precision': 0.49911833855799376, 'Recall': 0.5135571010986796, 'F1 Score': 0.5062347856326693}, 'Decision Tree': {'Accuracy': 0.5011, 'Precision': 0.49713135379969803, 'Recall': 0.4978328797500252, 'F1 Score': 0.4974818694601128}, 'K-Nearest Neighbors': {'Accuracy': 0.49935, 'Precision': 0.4953600968327618, 'Recall': 0.49501058361052314, 'F1 Score': 0.4951852785480212}, 'Support Vector Machine': {'Accuracy': 0.50365, 'Precision': 0.4996425166825548, 'Recall': 0.4226388468904344, 'F1 Score': 0.45792606345219244}, 'XGBoost': {'Accuracy': 0.5003, 'Precision': 0.49624060150375937, 'Recall': 0.4856365285757484, 'F1 Score': 0.49088130412633724}}


In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Define a parameter grid with hyperparameter options.
# Every element of a grid list is ONE candidate value, so each network layout
# must be written as a tuple. A bare list such as [50, 50] would be read as two
# identical candidates (a single hidden layer of 50 units each), doubling the
# search time without ever trying a two-layer network.
param_grid = {
    'hidden_layer_sizes': [(50,), (50, 50)],
    'activation': ['relu', 'logistic', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
}

# Create the GridSearchCV object with the parameter grid (5-fold cross-validation).
# The MLP is seeded so that differences between candidates come from the
# hyperparameters rather than from random weight initialisation.
grid_search = GridSearchCV(MLPClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_model = grid_search.best_estimator_


In [20]:
# Simulate model deployment
def predict_churn(new_customer_data):
    """Predict churn for new customers using the fitted scaler and tuned model.

    Parameters
    ----------
    new_customer_data : pandas.DataFrame
        One row per customer, holding the feature columns the global
        ``scaler`` was fitted on. When the scaler exposes
        ``feature_names_in_`` and all of those columns are present, the input
        is reduced to exactly those columns in the fitted order, so column
        order and extra columns in the caller's frame do not matter.
        Otherwise the input is passed to the scaler unchanged.

    Returns
    -------
    The value returned by ``best_model.predict`` for the scaled features.
    """
    expected_columns = getattr(scaler, 'feature_names_in_', None)
    available_columns = getattr(new_customer_data, 'columns', None)
    if expected_columns is not None and available_columns is not None:
        expected_columns = list(expected_columns)
        # Only realign when every fitted column is present; if something is
        # missing, leave the input untouched and let the scaler report it.
        if set(expected_columns).issubset(available_columns):
            new_customer_data = new_customer_data[expected_columns]

    new_customer_data_scaled = scaler.transform(new_customer_data)
    churn_prediction = best_model.predict(new_customer_data_scaled)
    return churn_prediction

# Example usage: score one hypothetical customer.
# Only the numeric columns are supplied; Name, Gender and Location are left out
# because the scaler was fitted after those columns were dropped.
new_customer_data_dict = {
    'CustomerID': [1234],
    'Age': [45],
    'Subscription_Length_Months': [19],
    'Monthly_Bill': [500],
    'Total_Usage_GB': [60],
}

# Each dict key becomes a column; each one-element list becomes the single row.
new_customer_data = pd.DataFrame.from_dict(new_customer_data_dict)
churn_prediction = predict_churn(new_customer_data)
print(churn_prediction)

[1]
