In [None]:
pip install pycaret



In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', None)

import plotly.express as px #for visualization
import matplotlib.pyplot as plt #for visualization

# Read the dataset; the path is relative, so the CSV must sit in the working directory
data = pd.read_csv("customer_churn_large_dataset.csv")


#Get overview of the data
def dataoveriew(df, message):
    """Print a short structural summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    message : str
        Heading printed before the summary.

    Prints, in order: the heading, the row count, the column count, the list
    of column names, the total number of missing cells and the number of
    distinct values per column. Returns None.
    """
    # The strings previously contained a bare "n" where a "\n" escape was
    # intended, which printed e.g. "nNumber of features:".
    print(f'{message}:\n')
    print('Number of rows: ', df.shape[0])
    print("\nNumber of features:", df.shape[1])
    print("\nData Features:")
    print(df.columns.tolist())
    # Sum of the per-column null counts, i.e. missing cells in the whole frame.
    print("\nMissing values:", df.isnull().sum().values.sum())
    print("\nUnique values:")
    print(df.nunique())

dataoveriew(data, 'Overview of the dataset')

Overview of the dataset:n
Number of rows:  100000
nNumber of features: 9
nData Features:
['CustomerID', 'Name', 'Age', 'Gender', 'Location', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB', 'Churn']
nMissing values: 0
nUnique values:
CustomerID                    100000
Name                          100000
Age                               53
Gender                             2
Location                           5
Subscription_Length_Months        24
Monthly_Bill                    7001
Total_Usage_GB                   451
Churn                              2
dtype: int64


In [None]:
data.dtypes


CustomerID                      int64
Name                           object
Age                             int64
Gender                         object
Location                       object
Subscription_Length_Months      int64
Monthly_Bill                  float64
Total_Usage_GB                  int64
Churn                           int64
dtype: object

In [None]:
# Coerce the usage column to float64. Any value that cannot be parsed as a
# number (a blank placeholder such as ' ', an empty string, stray text) becomes
# NaN instead of making the cast raise, so the isnull() check that follows
# reports it.
data['Total_Usage_GB'] = pd.to_numeric(data['Total_Usage_GB'], errors='coerce').astype('float64')

In [None]:
data.isnull().sum()

CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

In [None]:
from pycaret.classification import *
# Exclude the identifier columns from the feature set. The CSV column is spelled
# 'CustomerID'; the previous 'customerID' matched no column, so the ID was still
# passed to the model as a numeric feature. 'Name' is unique per row as well and
# carries no generalisable signal.
# session_id pins the random seed so the split and CV folds are repeatable.
s = setup(
    data,
    target='Churn',
    ignore_features=['CustomerID', 'Name'],
    session_id=42,
)

Unnamed: 0,Description,Value
0,Session id,1440
1,Target,Churn
2,Target type,Binary
3,Original data shape,"(100000, 9)"
4,Transformed data shape,"(100000, 13)"
5,Transformed train set shape,"(70000, 13)"
6,Transformed test set shape,"(30000, 13)"
7,Ignore features,1
8,Ordinal features,1
9,Numeric features,5


In [None]:
best_model = compare_models(sort='AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.5022,0.5025,0.0,0.0,0.0,0.0,0.0,5.758
lr,Logistic Regression,0.5022,0.5011,0.0,0.0,0.0,0.0,0.0,4.293
dt,Decision Tree Classifier,0.5022,0.5,0.0,0.0,0.0,0.0,0.0,2.046
qda,Quadratic Discriminant Analysis,0.5018,0.5,0.1,0.0498,0.0665,0.0,0.0,1.82
ada,Ada Boost Classifier,0.5022,0.5,0.0,0.0,0.0,0.0,0.0,2.177
gbc,Gradient Boosting Classifier,0.5022,0.5,0.0,0.0,0.0,0.0,0.0,6.828
lda,Linear Discriminant Analysis,0.5022,0.5,0.0,0.0,0.0,0.0,0.0,1.972
xgboost,Extreme Gradient Boosting,0.5022,0.5,0.0,0.0,0.0,0.0,0.0,3.381
dummy,Dummy Classifier,0.5022,0.5,0.0,0.0,0.0,0.0,0.0,1.675
lightgbm,Light Gradient Boosting Machine,0.5022,0.4988,0.0,0.0,0.0,0.0,0.0,4.59


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Tune for the same metric the model was selected on (compare_models(sort='AUC'));
# without `optimize`, tune_model searches for the best Accuracy instead.
tuned_best_model = tune_model(best_model, optimize='AUC')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5023,0.5,0.0,0.0,0.0,0.0,0.0
1,0.5023,0.5,0.0,0.0,0.0,0.0,0.0
2,0.5023,0.5,0.0,0.0,0.0,0.0,0.0
3,0.5023,0.5,0.0,0.0,0.0,0.0,0.0
4,0.5023,0.5,0.0,0.0,0.0,0.0,0.0
5,0.5021,0.5,0.0,0.0,0.0,0.0,0.0
6,0.5021,0.5,0.0,0.0,0.0,0.0,0.0
7,0.5021,0.5,0.0,0.0,0.0,0.0,0.0
8,0.5021,0.5,0.0,0.0,0.0,0.0,0.0
9,0.5021,0.5,0.0,0.0,0.0,0.0,0.0


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [None]:
# create a custom function
def calculate_profit(y, y_pred, tp_gain=5000 - 1000, fp_loss=1000):
    """Total profit implied by a set of binary predictions.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1), in the same order as ``y``.
    tp_gain : number, default 5000 - 1000
        Amount added for every true positive (predicted 1, actual 1).
    fp_loss : number, default 1000
        Amount subtracted for every false positive (predicted 1, actual 0).

    Returns
    -------
    numpy scalar
        ``tp_gain * #TP - fp_loss * #FP``. Predictions of 0 contribute nothing.
    """
    # Compare by position, not by index label: two pandas objects with
    # different indexes would otherwise be aligned on their labels and the
    # element-wise comparison would pair up the wrong rows.
    actual = np.asarray(y)
    predicted = np.asarray(y_pred)

    flagged = predicted == 1
    tp = np.where(flagged & (actual == 1), tp_gain, 0)
    fp = np.where(flagged & (actual == 0), -fp_loss, 0)
    return np.sum([tp, fp])
# add metric to PyCaret: id 'profit', display name 'Profit', scored by calculate_profit.
# The display name is what a later compare_models(sort='Profit') call refers to.
add_metric('profit', 'Profit', calculate_profit)

Name                                                        Profit
Display Name                                                Profit
Score Function       <function calculate_profit at 0x794728a13400>
Scorer                               make_scorer(calculate_profit)
Target                                                        pred
Args                                                            {}
Greater is Better                                             True
Multiclass                                                    True
Custom                                                        True
Name: profit, dtype: object

In [None]:
best_model = compare_models(sort='Profit')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Profit,TT (Sec)
et,Extra Trees Classifier,0.4965,0.4966,0.68,0.4954,0.5671,-0.0055,-0.0055,7068200.0,1.517
svm,SVM - Linear Kernel,0.5001,0.0,0.4999,0.4184,0.3341,0.0001,0.0004,5210700.0,0.9
knn,K Neighbors Classifier,0.4963,0.4937,0.4927,0.494,0.4934,-0.0075,-0.0075,5108800.0,2.501
nb,Naive Bayes,0.5001,0.4971,0.2703,0.4963,0.3495,-0.0018,-0.002,2810300.0,1.716
qda,Quadratic Discriminant Analysis,0.5018,0.5,0.1,0.0498,0.0665,0.0,0.0,1042500.0,1.272
lr,Logistic Regression,0.5022,0.5011,0.0,0.0,0.0,0.0,0.0,0.0,2.718
dt,Decision Tree Classifier,0.5022,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.326
ridge,Ridge Classifier,0.5022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.489
rf,Random Forest Classifier,0.5022,0.5025,0.0,0.0,0.0,0.0,0.0,0.0,1.619
ada,Ada Boost Classifier,0.5022,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.269


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Finalize the model chosen by the last compare_models call before persisting it
final_best = finalize_model(best_model)
# save model to disk
# NOTE(review): the artifact is named 'diamond-pipeline' although the target is Churn.
# The serving cell loads it by this exact name, so rename both places together or not at all.
save_model(final_best, 'diamond-pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['CustomerID', 'Age',
                                              'Subscription_Length_Months',
                                              'Monthly_Bill', 'Total_Usage_GB'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean',
                                                               verbose='deprecate...
                  ExtraTreesClassifier(bootstrap=False, ccp

In [None]:
pip install "fastapi[all]"

Collecting fastapi[all]
  Downloading fastapi-0.101.1-py3-none-any.whl (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.8/65.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting starlette<0.28.0,>=0.27.0 (from fastapi[all])
  Downloading starlette-0.27.0-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting email-validator>=2.0.0 (from fastapi[all])
  Downloading email_validator-2.0.0.post2-py3-none-any.whl (31 kB)
Collecting httpx>=0.23.0 (from fastapi[all])
  Downloading httpx-0.24.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.4/75.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic-extra-types>=2.0.0 (from fastapi[all])
  Downloading pydantic_extra_types-2.1.0-py3-none-any.whl (16 kB)
Collecting pydantic-settings>=2.0.0 (from fastapi[all])
  Downloading pydantic_settings-2.0.3-py3-none-an

In [None]:
import pandas as pd
# The pipeline saved earlier was built with pycaret.classification, so load and
# score it with the classification helpers rather than the regression ones.
from pycaret.classification import load_model, predict_model
from fastapi import FastAPI
import uvicorn
import nest_asyncio

# Create the app object
app = FastAPI()

# Load trained Pipeline
model = load_model('diamond-pipeline')

# Define predict function
@app.post('/predict')
def predict(
    age: int,
    gender: str,
    location: str,
    subscription_length_months: int,
    monthly_bill: float,
    total_usage_gb: float,
    customer_id: int = 0,
    name: str = '',
):
    """Score one customer with the saved churn pipeline.

    The parameters are read by FastAPI as query parameters; the type
    annotations make FastAPI convert and validate them, so numeric columns
    reach the pipeline as numbers rather than strings.

    Parameters
    ----------
    age, gender, location, subscription_length_months, monthly_bill, total_usage_gb
        Values for the corresponding columns of the training CSV.
    customer_id, name
        Optional identifier columns. They are forwarded only so the frame has
        every non-target column of the training CSV.
        NOTE(review): assumes predict_model drops columns the pipeline was not
        fitted on -- confirm for the installed PyCaret version.

    Returns
    -------
    dict
        ``{'prediction': <int label>}`` for the single submitted row.
    """
    # Column names must match the training data exactly (see the CSV header).
    features = pd.DataFrame([{
        'CustomerID': customer_id,
        'Name': name,
        'Age': age,
        'Gender': gender,
        'Location': location,
        'Subscription_Length_Months': subscription_length_months,
        'Monthly_Bill': monthly_bill,
        'Total_Usage_GB': total_usage_gb,
    }])

    predictions = predict_model(model, data=features)

    # PyCaret 3.x names the output column 'prediction_label'; 2.x used 'Label'.
    label_column = 'prediction_label' if 'prediction_label' in predictions.columns else 'Label'
    return {'prediction': int(predictions[label_column].iloc[0])}

if __name__ == "__main__":
    # Apply nest_asyncio to allow running asyncio event loop in Jupyter or similar environments
    nest_asyncio.apply()

    # Use uvicorn to run the FastAPI app
    uvicorn.run(app, host="127.0.1.1", port=8000)



Transformation Pipeline and Model Successfully Loaded


INFO:     Started server process [516]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.1.1:8000 (Press CTRL+C to quit)
