# Data Science Automation
 


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [64]:
# Load the data
df = pd.read_csv(r'C:\Users\rames\Downloads\Karthik\updated_churn_data.csv')
columns_to_drop = ['Tenure_Category', 'MonthlyCharges_log', 'TotalCharges_log']
df = df.drop(columns=columns_to_drop)
df.head()


Unnamed: 0,tenure,PhoneService,Contract,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,0,2,29.85,29.85,0
1,34,1,1,3,56.95,1889.5,0
2,2,1,0,3,53.85,108.15,1
3,45,0,1,0,42.3,1840.75,0
4,2,1,0,2,70.7,151.65,1


We are using pycaret to find an ML algorithm that performs best on the data


In [65]:
pip install pycaret

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\rames\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [66]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)


We are choosing Precision metric as it optimizes the model to minimize false positives, which is crucial when false alarms.

In [67]:
from pycaret.classification import *
clf_setup = setup(data=df, target='Churn', session_id=123, 
                  preprocess=True, normalize=True, 
                  remove_multicollinearity=True, multicollinearity_threshold=0.95, 
                  ignore_features=['customerID'])
# Compare all models based on precision
best_model = compare_models(sort='Precision')
print(best_model)
evaluate_model(best_model)
tuned_model = tune_model(best_model, optimize='Precision')
evaluate_model(tuned_model)



Unnamed: 0,Description,Value
0,Session id,123
1,Target,Churn
2,Target type,Binary
3,Original data shape,"(7043, 7)"
4,Transformed data shape,"(7043, 7)"
5,Transformed train set shape,"(4930, 7)"
6,Transformed test set shape,"(2113, 7)"
7,Ignore features,1
8,Numeric features,6
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7913,0.8184,0.4526,0.657,0.5345,0.4061,0.4186,0.008
gbc,Gradient Boosting Classifier,0.7931,0.8391,0.4955,0.6429,0.558,0.4264,0.4334,0.053
lr,Logistic Regression,0.7919,0.8317,0.5024,0.6394,0.5609,0.4274,0.4338,0.036
ada,Ada Boost Classifier,0.7874,0.8384,0.484,0.6298,0.5461,0.4108,0.4175,0.026
lightgbm,Light Gradient Boosting Machine,0.7905,0.8289,0.5261,0.6253,0.5705,0.4335,0.4369,0.125
lda,Linear Discriminant Analysis,0.7836,0.8184,0.4886,0.6186,0.5446,0.4054,0.4111,0.008
svm,SVM - Linear Kernel,0.772,0.8046,0.4488,0.6086,0.4964,0.3593,0.3733,0.009
rf,Random Forest Classifier,0.773,0.7978,0.4787,0.5892,0.527,0.3802,0.3843,0.057
knn,K Neighbors Classifier,0.7667,0.7797,0.4848,0.5722,0.5234,0.3708,0.3738,0.197
et,Extra Trees Classifier,0.7544,0.7724,0.4809,0.5431,0.5091,0.3463,0.3481,0.043


RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, positive=False, random_state=123, solver='auto',
                tol=0.0001)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.783,0.835,0.4538,0.6211,0.5244,0.3882,0.3962
1,0.8134,0.812,0.5077,0.7021,0.5893,0.4726,0.483
2,0.7728,0.802,0.4275,0.6022,0.5,0.3584,0.3673
3,0.7972,0.8144,0.4885,0.6598,0.5614,0.4333,0.4415
4,0.7809,0.7997,0.4656,0.6162,0.5304,0.3912,0.3977
5,0.7748,0.8095,0.4733,0.5962,0.5277,0.3824,0.3868
6,0.7951,0.8382,0.4122,0.6923,0.5167,0.3972,0.4187
7,0.7931,0.8222,0.4046,0.6883,0.5096,0.3895,0.4116
8,0.8174,0.8282,0.4962,0.7303,0.5909,0.4789,0.4937
9,0.787,0.8253,0.3817,0.6757,0.4878,0.3662,0.39


Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

We are saving the best Precision model onto to disk


In [68]:
from pycaret.classification import save_model

save_model(tuned_model, 'best_churn_model')



Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['tenure', 'PhoneService',
                                              'Contract', 'PaymentMethod',
                                              'MonthlyCharges', 'TotalCharges'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',...
                                     transformer=RemoveMulticollinearity(threshold=0.95))),
            

Now, we are creating a Python script with a function that takes a pandas dataframe as an input and returns the probability of churn for each row in the dataframe.

In [87]:
from pycaret.classification import load_model, predict_model

df = pd.read_csv(r'C:\Users\rames\Downloads\Karthik\updated_churn_data.csv')
df.replace([np.inf, -np.inf], np.nan, inplace=True)

clf_setup = setup(data=df, target='Churn', session_id=123)

# Compare models and select the one that supports predict_proba
best_model = compare_models(sort='Precision')

# Check if the best model supports predict_proba
if hasattr(best_model, 'predict_proba'):
    print(f"The best model is: {best_model.__class__.__name__}")
else:
    print("The best model does not support predict_proba.")
    
# Predict probabilities using the chosen model
def predict_churn_probability(df):
    # Load the pre-trained model
    model = load_model('best_churn_model')  # Adjust this based on your saved model

    # Check if the required columns exist
    required_columns = ['tenure', 'PhoneService', 'Contract', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges', 'Tenure_Category', 'MonthlyCharges_log',
       'TotalCharges_log']
    for col in required_columns:
        if col not in df.columns:
            raise KeyError(f"Column '{col}' not found in DataFrame.")

    # Predict probabilities of churn
    probabilities = model.predict_proba(df[required_columns])[:, 1]  # Assuming positive class is at index 1
    
    # Add the probabilities to the DataFrame
    df['Churn_Probability'] = probabilities
    
    return df



Unnamed: 0,Description,Value
0,Session id,123
1,Target,Churn
2,Target type,Binary
3,Original data shape,"(7043, 10)"
4,Transformed data shape,"(7043, 13)"
5,Transformed train set shape,"(4930, 13)"
6,Transformed test set shape,"(2113, 13)"
7,Numeric features,8
8,Categorical features,1
9,Rows with missing values,0.2%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7966,0.8353,0.4481,0.6766,0.538,0.4147,0.4297,0.011
lr,Logistic Regression,0.7957,0.8382,0.4848,0.6567,0.5566,0.4281,0.437,0.277
lda,Linear Discriminant Analysis,0.7939,0.8353,0.4779,0.6549,0.5513,0.4219,0.4315,0.011
gbc,Gradient Boosting Classifier,0.7925,0.8394,0.5008,0.639,0.5603,0.4275,0.4335,0.088
ada,Ada Boost Classifier,0.788,0.8388,0.4902,0.6304,0.5501,0.4146,0.421,0.037
lightgbm,Light Gradient Boosting Machine,0.7858,0.8303,0.5176,0.6143,0.5611,0.421,0.4241,0.122
svm,SVM - Linear Kernel,0.7225,0.6962,0.4203,0.6136,0.4227,0.272,0.3234,0.013
rf,Random Forest Classifier,0.7757,0.7997,0.4916,0.5935,0.5368,0.3908,0.3943,0.063
knn,K Neighbors Classifier,0.7665,0.7458,0.4412,0.578,0.4997,0.3513,0.3571,0.215
et,Extra Trees Classifier,0.7615,0.7768,0.4848,0.5591,0.5178,0.3608,0.3632,0.058


The best model does not support predict_proba.


In [70]:
print(df.columns)


Index(['tenure', 'PhoneService', 'Contract', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges', 'Churn', 'Tenure_Category', 'MonthlyCharges_log',
       'TotalCharges_log'],
      dtype='object')


Creating the function that takes a pandas dataframe as an input and returns the probability of churn for each row in the dataframe

  Using the Python function should print out the predictions for new data, while also looking at the true values for the new data.


In [91]:
from pycaret.classification import load_model, predict_model

def predict_churn_probabilities(df):

    # Load the trained churn model
    model = load_model('best_churn_model')  # Ensure this model is saved previously

    # Predict on the DataFrame and return predictions including probabilities
    predictions = predict_model(model, data=df)

    # Return the DataFrame with original features and probabilities
    print("\nPredictions DataFrame:\n", predictions)
    if 'Score' in predictions.columns and 'Label' in predictions.columns:
        return predictions[['Score', 'Label']]
    elif 'prediction_score' in predictions.columns and 'prediction_label' in predictions.columns:
        return predictions[['prediction_score', 'prediction_label']]
    else:
        raise ValueError("Expected columns are not found in the predictions DataFrame.")

if __name__ == "__main__":
    # Load new churn data
    new_data_path = 'new_churn_data.csv'
    new_data = pd.read_csv(r'C:\Users\rames\Downloads\Karthik\new_churn_data.csv')

    # Display the first few rows of the new data for inspection
    print("New data loaded:\n", new_data.head())

    # Get the churn probabilities
    churn_probabilities = predict_churn_probabilities(new_data)

    # Print the probabilities and predictions
    print("\nChurn probabilities and predictions:\n", churn_probabilities)

    # True values for the new data
    true_values = [1, 0, 0, 1, 0]  # True values for the new data
    print("\nTrue Values:\n", true_values)


New data loaded:
    customerID  tenure  PhoneService  Contract  PaymentMethod  MonthlyCharges  \
0  9305-CKSKC      22             1         0              2           97.40   
1  1452-KNGVK       8             0         1              1           77.30   
2  6723-OKKJM      28             1         0              0           28.25   
3  7832-POPKP      62             1         0              2          101.70   
4  6348-TACGU      10             0         0              1           51.15   

   TotalCharges  charge_per_tenure  
0        811.70          36.895455  
1       1701.95         212.743750  
2        250.90           8.960714  
3       3106.56          50.105806  
4       3440.97         344.097000  
Transformation Pipeline and Model Successfully Loaded



Predictions DataFrame:
    customerID  tenure  PhoneService  Contract  PaymentMethod  MonthlyCharges  \
0  9305-CKSKC      22             1         0              2       97.400002   
1  1452-KNGVK       8             0         1              1       77.300003   
2  6723-OKKJM      28             1         0              0       28.250000   
3  7832-POPKP      62             1         0              2      101.699997   
4  6348-TACGU      10             0         0              1       51.150002   

   TotalCharges  charge_per_tenure  prediction_label  prediction_score  
0    811.700012          36.895454                 0            0.5743  
1   1701.949951         212.743744                 0            0.8786  
2    250.899994           8.960714                 0            0.8878  
3   3106.560059          50.105808                 0            0.6541  
4   3440.969971         344.096985                 0            0.7141  

Churn probabilities and predictions:
    prediction_sco

To make the dataset more accurate we are cleaning it by dropping missing values 

In [97]:
import pandas as pd
import numpy as np

def clean_data(df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    print("NaN values before dropping:", df.isna().sum())
    df.dropna(inplace=True)

    print("NaN values after dropping:", df.isna().sum())
    for col in df.select_dtypes(include=[np.number]).columns:
        if df[col].max() > 1e6:  # Adjust threshold based on your domain knowledge
            print(f"Capping values in column {col} at 1e6.")
            df[col] = np.clip(df[col], None, 1e6)  # Cap values to a maximum of 1e6
    return df

if __name__ == "__main__":
    # Load the initial churn data for training
    train_data_path = 'your_initial_data.csv'  # Replace with your actual data path
    train_data = pd.read_csv


Testing the Python module and function on the new data file

In [108]:
import numpy as np
from pycaret.classification import load_model, predict_model, setup, create_model, save_model

def clean_data(df):
    """
    Cleans the DataFrame by handling infinite values, NaNs, and outliers.

    Parameters:
    df (pd.DataFrame): The DataFrame to clean.

    Returns:
    pd.DataFrame: The cleaned DataFrame.
    """
    # Replace infinite values with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Check for and count NaN values
    print("NaN values before dropping:", df.isna().sum())
    
    # Drop rows with NaN values
    df.dropna(inplace=True)

    # Check for and count remaining NaN values
    print("NaN values after dropping:", df.isna().sum())

    # Detect and cap extreme values (example threshold)
    for col in df.select_dtypes(include=[np.number]).columns:
        if df[col].max() > 1e6:  # Adjust threshold based on your domain knowledge
            print(f"Capping values in column {col} at 1e6.")
            df[col] = np.clip(df[col], None, 1e6)  # Cap values to a maximum of 1e6

    return df

def train_gradient_boosting_model(df):
    """
    Trains a Gradient Boosting model on the given DataFrame and saves the model.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing customer data for training.
    """
    # Setup PyCaret with the data
    clf = setup(data=df, target='Churn', session_id=123)

    # Create and train the Gradient Boosting model
    gb_model = create_model('gbc')  # 'gbc' stands for Gradient Boosting Classifier

    # Save the trained model
    save_model(gb_model, 'gradient_boosting_model')

def predict_churn_probabilities(df):
    """
    Predicts the churn probabilities for each row in the given DataFrame.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing customer data for prediction.
    
    Returns:
    pd.DataFrame: A DataFrame with the original data and the predicted churn probabilities.
    """
    # Load the trained churn model
    model = load_model('gradient_boosting_model')

    # Predict on the DataFrame and return predictions
    predictions = predict_model(model, data=df)

    # Print the predictions DataFrame to check its structure
    print("\nPredictions DataFrame:\n", predictions)

    # Return relevant columns
    return predictions[['Score', 'Label']]

if __name__ == "__main__":
    # Load the initial churn data for training
    train_data_path = r'C:\Users\rames\Downloads\Karthik\updated_churn_data.csv' 
    train_data = pd.read_csv(train_data_path)

    # Clean the training data
    train_data = clean_data(train_data)

    # Train the Gradient Boosting model
    train_gradient_boosting_model(train_data)

    # Load new churn data for predictions
    new_data_path = r'C:\Users\rames\Downloads\Karthik\new_churn_data.csv'
    new_data = pd.read_csv(new_data_path)

    # Clean the new data
    new_data = clean_data(new_data)

    # Display the first few rows of the new data for inspection
    print("New data loaded:\n", new_data.head())

    # Optionally, drop columns that are not needed for prediction
    columns_to_drop = ['Tenure_Category', 'MonthlyCharges_log', 'TotalCharges_log']  # Adjust as necessary
    new_data = new_data.drop(columns=[col for col in columns_to_drop if col in new_data.columns], errors='ignore')

    # Get the churn probabili


NaN values before dropping: tenure                 0
PhoneService           0
Contract               0
PaymentMethod          0
MonthlyCharges         0
TotalCharges           0
Churn                  0
Tenure_Category        0
MonthlyCharges_log     0
TotalCharges_log      11
dtype: int64
NaN values after dropping: tenure                0
PhoneService          0
Contract              0
PaymentMethod         0
MonthlyCharges        0
TotalCharges          0
Churn                 0
Tenure_Category       0
MonthlyCharges_log    0
TotalCharges_log      0
dtype: int64


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Churn
2,Target type,Binary
3,Original data shape,"(7032, 10)"
4,Transformed data shape,"(7032, 13)"
5,Transformed train set shape,"(4922, 13)"
6,Transformed test set shape,"(2110, 13)"
7,Numeric features,8
8,Categorical features,1
9,Preprocess,True


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8012,0.8244,0.4962,0.6701,0.5702,0.4446,0.4531
1,0.7931,0.8315,0.5038,0.6408,0.5641,0.431,0.4364
2,0.8333,0.8883,0.5385,0.7609,0.6306,0.5271,0.5402
3,0.7947,0.8357,0.4385,0.6706,0.5302,0.4062,0.4212
4,0.7703,0.7971,0.4885,0.5818,0.5311,0.3806,0.3831
5,0.7967,0.8373,0.5496,0.6372,0.5902,0.456,0.4582
6,0.7907,0.8382,0.4962,0.6373,0.5579,0.4236,0.4293
7,0.7947,0.8526,0.4962,0.65,0.5628,0.4318,0.4385
8,0.7967,0.8373,0.4962,0.6566,0.5652,0.4359,0.4432
9,0.8049,0.8619,0.542,0.6636,0.5966,0.4697,0.4739


Transformation Pipeline and Model Successfully Saved
NaN values before dropping: customerID           0
tenure               0
PhoneService         0
Contract             0
PaymentMethod        0
MonthlyCharges       0
TotalCharges         0
charge_per_tenure    0
dtype: int64
NaN values after dropping: customerID           0
tenure               0
PhoneService         0
Contract             0
PaymentMethod        0
MonthlyCharges       0
TotalCharges         0
charge_per_tenure    0
dtype: int64
New data loaded:
    customerID  tenure  PhoneService  Contract  PaymentMethod  MonthlyCharges  \
0  9305-CKSKC      22             1         0              2           97.40   
1  1452-KNGVK       8             0         1              1           77.30   
2  6723-OKKJM      28             1         0              0           28.25   
3  7832-POPKP      62             1         0              2          101.70   
4  6348-TACGU      10             0         0              1           51.15   


In [109]:
from pycaret.classification import load_model, predict_model

# Step 2: Load your trained model
model = load_model('best_churn_model')

# Step 3: Predict on the new churn data
# Use predict_model to make predictions
predictions = predict_model(model, data=new_data)

# Step 4: Display the predictions
print(predictions)


Transformation Pipeline and Model Successfully Loaded


   customerID  tenure  PhoneService  Contract  PaymentMethod  MonthlyCharges  \
0  9305-CKSKC      22             1         0              2       97.400002   
1  1452-KNGVK       8             0         1              1       77.300003   
2  6723-OKKJM      28             1         0              0       28.250000   
3  7832-POPKP      62             1         0              2      101.699997   
4  6348-TACGU      10             0         0              1       51.150002   

   TotalCharges  charge_per_tenure  prediction_label  prediction_score  
0    811.700012          36.895454                 0            0.5743  
1   1701.949951         212.743744                 0            0.8786  
2    250.899994           8.960714                 0            0.8878  
3   3106.560059          50.105808                 0            0.6541  
4   3440.969971         344.096985                 0            0.7141  


The Above are the predictions that the Python module was able to make based on the trained data. Giving us prediction scores ranging from 0.5743 to 0.8786


# Summary:
In this assignment, we worked on using PyCaret to check churn predictions, which helped us to observe the customer churn probabilities based on the features. We are choosing Precision metric as it optimizes the model to minimize false positives, which is crucial when false alarms.
The prediction scores ranged from 0.5743 to 0.8786 indicating the model's confidence in these predictions, with higher scores suggesting a greater likelihood of retention. By  looking at these prediction scores, the Telecom business can identify customers who are likely to churn those with lower scores and can make targeted strategies to retain them.