The first part of the project is of course our database modeling. our next module will be our webapp, built with flask

In [1]:
import pandas as pd
import seaborn as sns
import math #idk if i need it yet
import matplotlib.pyplot as plt
import numpy as np
import sklearn.preprocessing
from scipy.stats import chi2_contingency
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from collections import Counter

# getting the df into our code
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn-main.csv")

# removing columns that clearly provide no use
df.drop(columns=["gender", "customerID"], inplace=True)

# let's remind outselves that our goal is to predict if our classmates may churn from their own phone plans
# clearly we'll have no way to tell if they end up churning our not, but the experiment is more social/fun than scientific. 
# for this objective however, we need to make sure that our TelCo IBM Kaggle dataset lines up roughly with our classmates
# thus, we'll remove columns they might not know on the spot, as well as columns irrelevant to them such as 'OnlineSecurity'
df.drop(columns=["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", 
                   "PaperlessBilling", "TotalCharges"], inplace=True)
# of course we can assume our classmates are not Senior Citizens, and I'd like to hope most don't have kids in undergrad, 
# but leaving those variables in can help us make the most of our dataset, and we can pre-configure them to false when we ask them.

# renaming tenure for clarity's sake
df = df.rename(columns={"tenure": "Months_Tenure"})

# Manually map certain columns to have their 'Yes' and 'No' to be converted to 1 and 0, respectively
yes_no_columns = [
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "Churn"
]

# Replace 'Yes' with 1 and 'No' with 0 in each specified column
for column in yes_no_columns:
    df[column].replace({'Yes': 1, 'No': 0}, inplace=True)

# consolidate the no/yes values for simplification's sake
df["MultipleLines"].replace({"No phone service": 0}, inplace=True)
df["InternetService"].replace({'DSL': 1, 'Fiber optic': 1}, inplace=True)

# Define function to categorize tenure
def categorize_tenure(Months_Tenure):
    if Months_Tenure <= 6:
        return '0-6'
    elif Months_Tenure <= 18:
        return '7-18'
    elif Months_Tenure <= 36:
        return '19-36'
    elif Months_Tenure <= 72:
        return '37-72'

# Apply the function to the 'Months_Tenure' column and overwrite it with categorical values
df['Months_Tenure'] = df['Months_Tenure'].apply(categorize_tenure)

# Define function to categorize MonthlyCharges
def categorize_monthly_charges(charge):
    if charge < 25:
        return '0-25'
    elif charge < 50:
        return '25-50'
    elif charge < 75:
        return '50-75'
    elif charge < 100:
        return '75-100'
    else:
        return '100+'
    
# Apply the function to the 'MonthlyCharges' column and overwrite it with categorical values
df['MonthlyCharges'] = df['MonthlyCharges'].apply(categorize_monthly_charges)

# checking all the unique values in our data
#unique_per_column = {col: df[col].unique() for col in df.columns}
#for column, values in unique_per_column.items():
#    print(f"{column}: {values}")

# seperating the types of columns before we encode the categorical variables for easier work later
categorical_columns = ["Months_Tenure", "Contract", "PaymentMethod", "MonthlyCharges"]
numerical_columns = df.drop(columns=categorical_columns)

# Initialize encoder
encoder = sklearn.preprocessing.OneHotEncoder(sparse_output=False)

# Fit and transform
encoded_array = encoder.fit_transform(df[categorical_columns])

# Convert back to DataFrame
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_columns))

# combining the numerical and categorical dfs
df = pd.concat([numerical_columns, encoded_df], axis=1)

# covering all dtype to int for clarity
df = df.astype(int)

# Function to calculate Cramér's V, which will help us analyze which variables are important 
# (through their strength of associaton in a heatmap)
def cramers_v(x, y):
    contingency_table = pd.crosstab(x, y)
    chi2 = chi2_contingency(contingency_table)[0]
    n = contingency_table.sum().sum()
    return np.sqrt(chi2 / (n * (min(contingency_table.shape) - 1)))

# Calculate Cramér's V for all variables against Churn (they're all binary)
binary_columns = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'Months_Tenure_0-6',
       'Months_Tenure_19-36', 'Months_Tenure_37-72', 'Months_Tenure_7-18',
       'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'MonthlyCharges_0-25', 'MonthlyCharges_100+', 'MonthlyCharges_25-50',
       'MonthlyCharges_50-75', 'MonthlyCharges_75-100'] 
cramers_results = {col: cramers_v(df['Churn'], df[col]) for col in binary_columns}

# Convering the results into a df and displaying it as a barplot shows us 
CramerDf = pd.DataFrame(cramers_results.items(), columns=["Variable", "Cramer's V"])
CramerDf = CramerDf.sort_values(by="Cramer's V")
#sns.catplot(x="Cramer's V", y="Variable", data=CramerDf, kind="bar")
#plt.show()

# the plot shows us that several variables are relatively irrelevant and worth dropping
# we also need to drop at least one category from each categorical variable we performed onehotencoding on
df.drop(columns=["MonthlyCharges_25-50", "MonthlyCharges_50-75", "PhoneService","MultipleLines", "Months_Tenure_19-36", "PaymentMethod_Mailed check"])

# now we can finally prepare our logistic regression to measure the probability of churn given our binary variables.

X = df [['SeniorCitizen', 'Partner', 'Dependents', 'InternetService', 'Months_Tenure_0-6', 
    'Months_Tenure_37-72', 'Months_Tenure_7-18', 'Contract_Month-to-month', 'Contract_One year', 
    'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)', 'PaymentMethod_Credit card (automatic)',
    'PaymentMethod_Electronic check', 'MonthlyCharges_0-25', 
    'MonthlyCharges_100+', 'MonthlyCharges_75-100']]
y = df['Churn']

# Step 1: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Check Class Distribution
print(f"Class distribution before SMOTE: {Counter(y_train)}")

# Step 3: Apply SMOTE to Balance the Dataset (i added this 2nd, after testing default logistic regression)
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
print(f"Class distribution after SMOTE: {Counter(y_train_sm)}")

# Step 4: Train Logistic Regression Model
log_model = LogisticRegression()
log_model.fit(X_train_sm, y_train_sm)

# Step 5: Make Predictions
# Predicted probabilities
y_pred_probs = log_model.predict_proba(X_test)[:, 1]

# Predicted classes with a custom threshold (I added this 3rd, to balance the model a little more)
threshold = 0.6
y_pred_custom = (y_pred_probs >= threshold).astype(int)

# Step 6: Evaluate the Model
# Default threshold evaluation
print("Logistic Regression Metrics (Default Threshold):")
print(f"Accuracy: {accuracy_score(y_test, log_model.predict(X_test))}")
print(f"AUC: {roc_auc_score(y_test, y_pred_probs)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, log_model.predict(X_test))}")

# Custom threshold evaluation. In our business case, we'd want to be a less strict than the model. 
print("\nLogistic Regression Metrics (Custom Threshold 0.6):")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_custom)}")
print(classification_report(y_test, y_pred_custom))

# Step 7: Model Coefficients
print("\nLogistic Regression Coefficients:")
print("Intercept:", log_model.intercept_)
print("Coefficients:", log_model.coef_)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].replace({'Yes': 1, 'No': 0}, inplace=True)
  df[column].replace({'Yes': 1, 'No': 0}, inplace=True)
  df["MultipleLines"].replace({"No phone service": 0}, inplace=True)
  df["InternetService"].replace({'DSL': 1, 'Fiber optic': 1}, inplace=True)


Class distribution before SMOTE: Counter({0: 4138, 1: 1496})
Class distribution after SMOTE: Counter({0: 4138, 1: 4138})
Logistic Regression Metrics (Default Threshold):
Accuracy: 0.7359829666430092
AUC: 0.8473195524133861
Confusion Matrix:
[[724 312]
 [ 60 313]]

Logistic Regression Metrics (Custom Threshold 0.6):
Confusion Matrix:
[[819 217]
 [ 92 281]]
              precision    recall  f1-score   support

           0       0.90      0.79      0.84      1036
           1       0.56      0.75      0.65       373

    accuracy                           0.78      1409
   macro avg       0.73      0.77      0.74      1409
weighted avg       0.81      0.78      0.79      1409


Logistic Regression Coefficients:
Intercept: [0.67335711]
Coefficients: [[ 0.11530056  0.06050222 -0.47939529  0.310115    1.17514769 -0.34801774
   0.23683483 -0.92931593 -1.93593759 -3.27136096 -0.12306059 -0.20074088
   0.38102869 -0.94640383  1.03457548  0.48418353]]


Here, we can set up our flask app for a fun and interactive display of our project. Unfortunately we do need inline html and css which we'll need to have as strings. So the first code module will be declaring the html and css we need later.

In [2]:
INDEX_STYLE = """

* {
    font-family: sans-serif
}


h1 {
    text-align: center;
}

h3 {
    text-align: center;
    margin-bottom: 50px;
}

#questions {
    display: flex;
    flex-direction: column;

    align-items: flex-start;

    gap: 3px;
    margin-bottom: 50px;
}

.question_and_answers {
    display: flex;
    width: 100%;
    align-items: center;
    justify-content: flex-start;
}

.question_and_answers p {
    font-size: 30px;
    margin-left: 40px;
    margin-right: 70px;


    flex-shrink: 0;
    width: 550px;

}

input[type="radio"] {
    appearance: none;
    padding: 20px 50px;
    background-color: #E0EDF5;
    cursor: pointer;
    display: inline-block;
    text-align: center;
}

input[type="radio"]:checked {
    background-color: #FFBD59;
}

input[type="radio"]:hover {
    background-color: #3E8FBE;
}

/* COPIE COLLE */

.button {
    float: left;
    margin: 0 5px 0 0;
    width: 180px;
    height: 60px;
    position: relative;
  }
  
.button label,
.button input {
    display: block;
    position: absolute;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
  }
  
.button label {
    display: flex;
    justify-content: center;
    align-items: center;
    cursor: pointer;
    z-index: 90;
    line-height: 1.8em;
  }
  
#button_div{
    display: flex;
    align-items: center;
    justify-content: center;
}

#submit_button {
    color: black;
    font-size: 20px;

    border: none;
    background-color: #FFE7C2;

    padding: 20px;

    height: 8°px;
    width: 200px;

}

#submit_button:hover {
    background-color: #FF9D0A;
    cursor: pointer;
}

"""

RESULTS_STYLE = """

body{
    display: grid;
    place-items: center;
    height: 97vh;
}

.wrapper{
    font-size: 150px;
    line-height: 1em;
    text-align: center;
    overflow: hidden;
    font-weight: 900;

    margin-top: 175px;
}

.number-area{
    display: inline-block;
    overflow: hidden;
    width: 0.6em;
    height: 150px;
    color: #3E8FBE
}

.num {
    word-break: break-all;
    display: block;
    width: 0;
    padding: 0 1.6em 0 0;
    margin: 0;
    overflow: inherit;
    animation: animate 0.5s steps(10) forwards infinite;
}

.num::before{
    content: attr(data-attr);
    display: inline-block;
    width: 100%;
    height: auto;
}

@keyframes animate{
    100%{
        transform: translate3d(0,-10em, 0);
    }
}

.n1{
   animation-iteration-count: 4; 
}

.n2{
    animation-iteration-count: 8; 

}

#text {
    font-size: 25px;

    opacity: 0; 
    animation: fadeIn 2s ease-in 2s forwards;
    color: #3E8FBE
}


@keyframes fadeIn {
    to {
        opacity: 1; /* Le texte devient visible */
    }
}

button{
    font-size: 20px;

    background-color: #FFE7C2;
    border: none;

    /*margin-right: 30px;*/
    padding: 15px 25px;

    opacity: 0; 
    animation: fadeIn 2s ease-in 3s forwards;

    transition: box-shadow 0.1s ease;

}

button:hover{
    cursor: pointer;

    background-color: #FFBD59;
    box-shadow: 2px 4px 6px rgba(0, 0, 0, 0.3);
}

"""

INDEX_HTML = f"""

<!DOCTYPE html>
<html>
<head>
    <title>Questions</title>
    <style>
            {INDEX_STYLE}
        </style> 
</head>
<body>
    <h1>Will you cut the cord?</h1>
    <h3>Answer our 4 questions to know the answer!</h3>

    <form action="/get_value_from_answers" method="post">
        
    <div id="questions">
        <div class="question_and_answers"> 
            <p>For how long have you been subscribed?</p>

            <div class="button">
                <input type="radio" id="tenure1" name="tenure" value = "0-6">
                <label for="tenure1">0-6 months</label>
            </div>

            <div class="button">
                <input type="radio" id="tenure2" name="tenure" value = "7-18">
                <label for="tenure2">7-18 months</label>
            </div>

            <div class="button">
                <input type="radio" id="tenure3" name="tenure" value = "19+">
                <label for="tenure3">More than 19 months</label>
            </div>

        </div>

        <div class="question_and_answers">
            <p>How often do your renew your contract?</p>

            <div class="button">
                <input type="radio" id="contract_type1" name="contract_type" value = "month_to_month">
                <label for="contract_type1">Every month</label>
            </div>

            <div class="button">
                <input type="radio" id="contract_type2" name="contract_type" value = "one_year">
                <label for="contract_type2">Every year</label>
            </div>

            <div class="button">
                <input type="radio" id="contract_type3" name="contract_type" value = "two_year">
                <label for="contract_type3">Every 2 years</label>
            </div>
        </div>

        <div class="question_and_answers">
            <p>What is your payment method?</p>

            <div class="button">
                <input type="radio" id="payment_method1" name="payment_method" value="bank_transfer">
                <label for="payment_method1">Bank transfer</label>
            </div>

            <div class="button">
                <input type="radio" id="payment_method2" name="payment_method" value="credit_card">
                <label for="payment_method2">Credit card</label>
            </div>

            <div class="button">
                <input type="radio" id="payment_method3" name="payment_method" value="electronic_check">
                <label for="payment_method3">Electronic check</label>
            </div>
        </div>

        <div class="question_and_answers">
            <p>What is your monthly payment?</p>

            <div class="button">
                <input type="radio" id="monthly_payment1" name="monthly_payment" value ="0-74">
                <label for="monthly_payment1">0-74$</label>
            </div>

            <div class="button">
                <input type="radio" id="monthly_payment2" name="monthly_payment" value ="75-100">
                <label for="monthly_payment2">75$-100$</label>
            </div>

            <div class="button">
                <input type="radio" id="monthly_payment3" name="monthly_payment" value ="100+">
                <label for="monthly_payment3">More than 100$</label>
            </div>
        </div>
    </div>

    <div id="button_div">
        <button type="submit" id="submit_button">Tell me if I'll cut the cord!</button>
    </div>
    
    </form>
</body>
</html>

"""

RESULTS_HTML = f"""

<!DOCTYPE html>
<html>
    <head>
        <title>Home page</title>
        <style>
            {RESULTS_STYLE}
        </style>
    </head>
    <body>
        <div class="content">
            <div class="wrapper">
                <div class="number-area">
                    <span class="num n1" data-attr="5741278934">{{{{first_digit}}}}</span>
                </div>
                <div class="number-area">
                    <span class="num n2" data-attr="4785125986">3{{{{second_digit}}}}</span>
                </div>
            </div>
    
            <p id="text">is your likelihood to leave your operator during the next month</p>
        </div>

        <form action="/home">

            <button type="submit">Restart</button>

        </form>

    </body>
</html>

"""

Now with that out of the way, let's start the flask app. To do this in jupyter notebook, we also need to enable threading, but first let's just set up the flask app.

In [3]:
from flask import Flask, redirect, url_for, render_template_string,request
from math import exp

#Run the app : python -m flask --app .\app.py run (to be written in the terminal)

app = Flask(__name__)
 
@app.route("/")
@app.route("/home")
def home():
    return render_template_string(INDEX_HTML) #Define the landing page of the app


def sigmoid(logit):
    '''
    The sigmoid function turns any regular equation into a probabilistic function
    It is very useful and even mandatory for many regressions, such as a logistic regression for example

    It returns values from 0 to 1 only
    '''

    return 1/ (1+ exp(-logit)) 


def get_digits_from_probability(probability):

    percentage = int(100*probability)

    if percentage < 10:
        first_digit = 0
        second_digit = percentage
    else:
        first_digit = percentage // 10
        second_digit = percentage % 10

    return first_digit, second_digit

def calculate_churn_probability(tenure, contract_type, payment_method, monthly_payment):

    '''
    The function uses the answers of the user to compute its probability to churn
    It thus returns a prevision

    This prevision is made possible using the logistic regression made from our original database
    It uses the same coefficients for the prevision

    Here, we assumed some answers for the user to make it more interactive. 

    '''
    
    intercept = 0.67335711

    senior_citizen = 0 * 0.06050222     #We assume the students are not senior citizens, so the value is 0
    partner = 0 * 0.06050222            #We assume the students do not currently live with a partner, so the value is 0
    dependents = 0 * -0.47939529        #We assume the students have no children or parents to take care of at home, so the value is 0
    internet_service = 1*0.310115       #1 is for fiber optic. We assume the students have access to fiber optic instead of DSL

    
    #Tenure, contract_type, payment_method, monthly_payment are categorical variables that had been encoded as 3 booleans for the regression
    #So for each of them, we look up for the case entered by the users and encode the answer to be used in the prevision

    match tenure:

        case '0-6':
            months_tenure_0_6 = 1 * 1.17514769
            months_tenure_7_18 = 0
            months_tenure_37_72 = 0

        case '7-18':
            months_tenure_0_6 = 0
            months_tenure_7_18 = 1 * 0.23683483
            months_tenure_37_72 = 0

        case '19+':
            months_tenure_0_6 = 0
            months_tenure_7_18 = 0
            months_tenure_37_72 = 1 * (-0.34801774)


    match contract_type:

        case 'month_to_month':
            contract_month_to_month = 1 * (-0.92931593)
            contract_one_year = 0
            contract_two_year = 0

        case 'one_year':
            contract_month_to_month = 0
            contract_one_year = 1 * (-1.93593759)
            contract_two_year = 0

        case 'two_year':
            contract_month_to_month = 0
            contract_one_year = 0
            contract_two_year = 1 * (-3.27136096)


    match payment_method:

        case 'bank_transfer':
            payment_method_bank_transfer = 1 * (-0.12306059)
            payment_method_credit_card = 0
            payment_method_electronic_check = 0
        
        case 'credit_card':
            payment_method_bank_transfer = 0
            payment_method_credit_card = 1 * (-0.20074088)
            payment_method_electronic_check = 0

        case 'electronic_check':
            payment_method_bank_transfer = 0
            payment_method_credit_card = 0
            payment_method_electronic_check = 1 * 0.38102869

        
    match monthly_payment:

        case '0-74':
            monthly_charges_0_74 = 1 * (-0.94640383)
            monthly_charges_75_100 = 0
            monthly_charges_100 = 0

        case '75-100':
            monthly_charges_0_74 = 0
            monthly_charges_75_100 = 1 * 0.48418353
            monthly_charges_100 = 0

        case '100+':
            monthly_charges_0_74 = 0
            monthly_charges_75_100 = 0
            monthly_charges_100 = 1 * 1.03457548

    logit = (
        intercept +
        senior_citizen +
        partner +
        dependents +
        internet_service +
        months_tenure_0_6 +
        months_tenure_7_18 +
        months_tenure_37_72 +
        contract_month_to_month +
        contract_one_year +
        contract_two_year +
        payment_method_bank_transfer +
        payment_method_credit_card +
        payment_method_electronic_check +
        monthly_charges_0_74 +
        monthly_charges_75_100 +
        monthly_charges_100
    )
    
    probability = sigmoid(logit)

    #Logit doesn't directly return the probability
    #We have to apply the sigmoïde function to turn the result into a probability and, thus be able to interpret it

    return probability


@app.route('/get_value_from_answers',methods=['POST','GET'])

def get_value_from_answers():

    '''
    This function is used to turn the churn probability going from 0 to 100, into a tupple of length 2

    Index 0 : The first digit (ten)
    Index 1 : The second digit (unit)

    It is used to simplify to process of animating the answer on the result.html template, using css only

    Example:
        93 returns (9,3)
        9 returns (0,9)
    '''

    tenure = request.form['tenure']
    contract_type = request.form['contract_type']
    payment_method = request.form['payment_method']
    monthly_payment = request.form['monthly_payment']

    churn_probability = calculate_churn_probability(tenure,
                                                    contract_type, 
                                                    payment_method, 
                                                    monthly_payment)
    
    first_digit = get_digits_from_probability(churn_probability)[0]
    second_digit = get_digits_from_probability(churn_probability)[1]

    return render_template_string(RESULTS_HTML, first_digit = first_digit, second_digit=second_digit)

@app.route('/restart_questions')
def restart_questions():
    return render_template_string(INDEX_HTML)


Now, finally the flask app is defined, and we can run it.

In [None]:
# Run Flask in a Background Thread
import threading

def run_app():
    app.run(debug=False, use_reloader=False)

flask_thread = threading.Thread(target=run_app)
flask_thread.start()

# Display the app in an iframe
from IPython.display import HTML

HTML('<iframe src="http://127.0.0.1:5000" width=800 height=600></iframe>')

# do note that it will be prettier if you open the localhost address above in your own browser.



 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [28/Nov/2024 17:57:40] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2024 17:57:44] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2024 17:57:44] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [28/Nov/2024 17:59:13] "POST /get_value_from_answers HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2024 17:59:14] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [28/Nov/2024 17:59:23] "GET /home HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2024 17:59:23] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [28/Nov/2024 18:01:29] "POST /get_value_from_answers HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2024 18:01:29] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [28/Nov/2024 20:58:54] "GET /home HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2024 20:58:54] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [28/Nov/2024 20:59:18] "POST /get_value_from_answers HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2024 20:59:18] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [28/No