In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings 
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data/telco_customer_churn_preprocessed.csv')

In [3]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
print(df['gender'].unique())

['Female' 'Male']


In [5]:
for col in df.columns:
    print(f"Column {col} data type: {df[col].dtype}")

Column gender data type: object
Column SeniorCitizen data type: int64
Column Partner data type: object
Column Dependents data type: object
Column tenure data type: int64
Column PhoneService data type: object
Column MultipleLines data type: object
Column InternetService data type: object
Column OnlineSecurity data type: object
Column OnlineBackup data type: object
Column DeviceProtection data type: object
Column TechSupport data type: object
Column StreamingTV data type: object
Column StreamingMovies data type: object
Column Contract data type: object
Column PaperlessBilling data type: object
Column PaymentMethod data type: object
Column MonthlyCharges data type: float64
Column TotalCharges data type: float64
Column Churn data type: object


In [6]:
for col in df.columns:
    print(f"Column {col} values: {df[col].unique()}")

Column gender values: ['Female' 'Male']
Column SeniorCitizen values: [0 1]
Column Partner values: ['Yes' 'No']
Column Dependents values: ['No' 'Yes']
Column tenure values: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
Column PhoneService values: ['No' 'Yes']
Column MultipleLines values: ['No phone service' 'No' 'Yes']
Column InternetService values: ['DSL' 'Fiber optic' 'No']
Column OnlineSecurity values: ['No' 'Yes' 'No internet service']
Column OnlineBackup values: ['Yes' 'No' 'No internet service']
Column DeviceProtection values: ['No' 'Yes' 'No internet service']
Column TechSupport values: ['No' 'Yes' 'No internet service']
Column StreamingTV values: ['No' 'Yes' 'No internet service']
Column StreamingMovies values: ['No' 'Yes' 'No internet service']
Column Contract values: ['Month-to-month' 'One year' 'Two y

In [7]:
binary_columns = ["gender", "Partner", "Dependents", "PhoneService", "PaperlessBilling"]
multi_value_columns = ["MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]
dummies_columns = ["Contract", "PaymentMethod"]
numeric_columns = ["tenure", "MonthlyCharges", "TotalCharges"]

In [8]:
def preprocess_binary_columns(df):
    df = df.copy()
    for col in binary_columns:
        df[col] = df[col].apply(lambda x: 1 if x == "Yes" else 0)
    return df

def preprocess_multi_value_columns(df):
    # One-hot encoding
    df = df.copy()
    for col in multi_value_columns:
        # If value is "Yes", then 1, else 0
        df[col] = df[col].apply(lambda x: 1 if x == "Yes" else 0)
    return df

def preprocess_dummies_columns(df):
    df = df.copy()
    df = pd.get_dummies(df, columns=dummies_columns)
    # Convert all columns to 1 and 0
    for col in df.columns:
        # if col starts with Contract or PaymentMethod
        if col.startswith("Contract") or col.startswith("PaymentMethod"):
            df[col] = df[col].apply(lambda x: 1 if x == 1 else 0)
    return df

def preprocess_numeric_columns(df):
    df = df.copy()
    for col in numeric_columns:
        df[col] = df[col].astype(float)
    scaler = StandardScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
    return df

def preprocess_target_column(df):
    df = df.copy()
    df['Churn'] = df['Churn'].apply(lambda x: 1 if x == "Yes" else 0)
    return df

def preprocess_df(df):
    df = df.copy()
    df = preprocess_binary_columns(df)
    df = preprocess_multi_value_columns(df)
    df = preprocess_dummies_columns(df)
    df = preprocess_numeric_columns(df)
    df = preprocess_target_column(df)
    return df

In [9]:
df_processed = preprocess_df(df)

In [10]:
df_processed.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,MonthlyCharges,TotalCharges,Churn,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,-1.277445,0,0,0,0,1,...,-1.160323,-0.992611,0,1,0,0,0,0,1,0
1,0,0,0,0,0.066327,1,0,0,1,0,...,-0.259629,-0.172165,0,0,1,0,0,0,0,1
2,0,0,0,0,-1.236724,1,0,0,1,1,...,-0.36266,-0.958066,1,1,0,0,0,0,0,1
3,0,0,0,0,0.514251,0,0,0,1,0,...,-0.746535,-0.193672,0,0,1,0,1,0,0,0
4,0,0,0,0,-1.236724,1,0,0,0,0,...,0.197365,-0.938874,1,1,0,0,0,0,1,0


In [11]:
X = df_processed.drop('Churn', axis=1)
y = df_processed['Churn']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (5634, 24)
y_train shape: (5634,)
X_test shape: (1409, 24)
y_test shape: (1409,)


In [29]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=42)

model.fit(X_train, y_train)

In [33]:
y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred)}")
print(f"AUC: {roc_auc_score(y_test, y_pred)}")
print(f"Confusion matrix:\n {confusion_matrix(y_test, y_pred)}")
print(f"Classification report:\n {classification_report(y_test, y_pred)}")

Accuracy: 0.8090844570617459
Precision: 0.674496644295302
Recall: 0.5388739946380697
F1: 0.5991058122205662
AUC: 0.7226223255043631
Confusion matrix:
 [[939  97]
 [172 201]]
Classification report:
               precision    recall  f1-score   support

           0       0.85      0.91      0.87      1036
           1       0.67      0.54      0.60       373

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409



In [34]:
# Cross validation
from sklearn.model_selection import cross_val_score

scoring_list = ["accuracy", "precision", "recall", "f1", "roc_auc"]

for scoring in scoring_list:
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring=scoring)
    print(f"Scoring: {scoring}")
    print(f"Scores: {scores}")
    print(f"Mean: {scores.mean()}")
    print(f"Standard deviation: {scores.std()}")
    print()

Scoring: accuracy
Scores: [0.81011535 0.80922804 0.79503106 0.78970719 0.79662522]
Mean: 0.8001413709355856
Standard deviation: 0.008116632529023502

Scoring: precision
Scores: [0.66798419 0.69090909 0.64782609 0.62857143 0.67857143]
Mean: 0.662772444946358
Standard deviation: 0.022188033493196955

Scoring: recall
Scores: [0.56521739 0.5083612  0.49832776 0.51333333 0.44481605]
Mean: 0.5060111482720178
Standard deviation: 0.038388515106629675

Scoring: f1
Scores: [0.61231884 0.58574181 0.56332703 0.56513761 0.53737374]
Mean: 0.572779807188758
Standard deviation: 0.025030767272054697

Scoring: roc_auc
Scores: [0.85797263 0.84499257 0.83651827 0.83052197 0.83756617]
Mean: 0.8415143204294667
Standard deviation: 0.009427135762665184



In [38]:
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings('ignore', category=FutureWarning, module='xgboost')

param_grid = {
    "learning_rate": [0.01, 0.1, 0.2, 1],
    "max_depth": [3, 4, 5, 6, 7],
    "n_estimators": [10, 20, 50, 100, 150],
}

gb_model = GradientBoostingClassifier(random_state=42)

grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best score: 0.8006736001992116
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}


In [39]:
# Using best parameters
gb_model = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=50, random_state=42)

gb_model.fit(X_train, y_train)

y_pred = gb_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred)}")
print(f"AUC: {roc_auc_score(y_test, y_pred)}")
print(f"Confusion matrix:\n {confusion_matrix(y_test, y_pred)}")
print(f"Classification report:\n {classification_report(y_test, y_pred)}")

Accuracy: 0.808374733853797
Precision: 0.6832740213523132
Recall: 0.514745308310992
F1: 0.5871559633027523
AUC: 0.714418986201828
Confusion matrix:
 [[947  89]
 [181 192]]
Classification report:
               precision    recall  f1-score   support

           0       0.84      0.91      0.88      1036
           1       0.68      0.51      0.59       373

    accuracy                           0.81      1409
   macro avg       0.76      0.71      0.73      1409
weighted avg       0.80      0.81      0.80      1409



In [40]:
# Save model
import pickle

with open('model/gb_model.pkl', 'wb') as file:
    pickle.dump(gb_model, file)

In [41]:
# Load model
with open('model/gb_model.pkl', 'rb') as file:
    gb_model = pickle.load(file)

In [42]:
import boto3

s3 = boto3.client('s3')

s3.upload_file("model/gb_model.pkl", "mlops-python", "models/gradient_boosting/gb_model.pkl")

In [43]:
# Read model from S3
s3.download_file("mlops-python", "models/gradient_boosting/gb_model.pkl", "model/gb_model_s3.pkl")

with open('model/gb_model_s3.pkl', 'rb') as file:
    gb_model_s3 = pickle.load(file)

y_pred = gb_model_s3.predict(X_test)

In [25]:
file = "s3://mlops-python/synthetic_data/2023-09-19-04-46-42.csv"

df = pd.read_csv(file)

In [26]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,1,No,No,22,Yes,Yes,Fiber optic,Yes,Yes,Yes,No,No,No,Month-to-month,No,Mailed check,105.13634,2624.346791,No
1,Male,0,No,No,58,Yes,Yes,Fiber optic,Yes,No,No,No,No,Yes,One year,No,Electronic check,88.218598,790.576462,Yes
2,Male,1,Yes,No,0,Yes,Yes,DSL,No,Yes,Yes,Yes,No,No,One year,No,Credit card (automatic),62.750437,8684.8,No
3,Male,0,No,No,33,Yes,Yes,No,No internet service,No internet service,No internet service,No,No internet service,Yes,One year,No,Bank transfer (automatic),21.777537,5427.033536,No
4,Male,0,No,Yes,1,Yes,Yes,No,No internet service,Yes,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),25.824823,680.152635,No


In [27]:
csv_file = s3.get_object(Bucket="mlops-python", Key="synthetic_data/2023-09-19-04-46-42.csv")

df = pd.read_csv(csv_file['Body'])

In [28]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,1,No,No,22,Yes,Yes,Fiber optic,Yes,Yes,Yes,No,No,No,Month-to-month,No,Mailed check,105.13634,2624.346791,No
1,Male,0,No,No,58,Yes,Yes,Fiber optic,Yes,No,No,No,No,Yes,One year,No,Electronic check,88.218598,790.576462,Yes
2,Male,1,Yes,No,0,Yes,Yes,DSL,No,Yes,Yes,Yes,No,No,One year,No,Credit card (automatic),62.750437,8684.8,No
3,Male,0,No,No,33,Yes,Yes,No,No internet service,No internet service,No internet service,No,No internet service,Yes,One year,No,Bank transfer (automatic),21.777537,5427.033536,No
4,Male,0,No,Yes,1,Yes,Yes,No,No internet service,Yes,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),25.824823,680.152635,No
