In [1]:
import pandas as pd

In [2]:
# Read in the dataset.
data = pd.read_csv("./data/customer_churn.csv")

In [3]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Remove columns that you believe are not relevant.
data = data.drop("customerID", axis=1)

In [5]:
# The data type of column `TotalCharges` is an object. However, this should be a float.
# If you tryed to turn this entire column into a float using .astype(float), this would not work.
# Figure out why this is the case, delete the rows that are throwing an error, and turn this column into a float.

problematic_indices = []
for idx, row in data.iterrows():
    charges = row["TotalCharges"]
    try:
        float(charges)
    except:
        problematic_indices.append(idx)
        
data = data.drop(problematic_indices, axis=0)


In [None]:
# Show Filter -- as well

data["TotalCharges"] = data["TotalCharges"].astype(float)

In [6]:
# One-hot Encode the categorical variables except for `Churn`.
def _transform_column_into_dummies(dataframe, name_of_column):
    dummies = pd.get_dummies(dataframe[name_of_column], prefix=f"{name_of_column} = ")
    
    dataframe = pd.concat([dataframe, dummies], axis=1)
    
    dataframe = dataframe.drop([name_of_column], axis=1)
    
    return dataframe

for column in data.columns:
    if column == "Churn":
        continue
    elif data[column].dtype == object:
        data = _transform_column_into_dummies(data, column)
    

In [7]:
# For the `Churn` column, transform a value of No to 0 and a value of Yes to 1
# Ex: If data.iloc[0]["Churn"] is "No", turn this value into a 0.

data["Churn"] = data["Churn"].replace("No", 0)
data["Churn"] = data["Churn"].replace("Yes", 1)
data["Churn"] = data["Churn"].astype(int)

In [8]:
# Split the dataset into a singular training/testing split using a random_state of 42 and using "Churn" as the target.
from sklearn.model_selection import train_test_split

X = data.drop("Churn", axis=1)
y = data["Churn"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Train your model, predict on the test set, and get the F1-Score (feel free to use the scikit-learn f1-score metric).
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

lr = LogisticRegression(max_iter=200)
lr.fit(X_train, y_train)
preds = lr.predict(X_test)
single_f1 = f1_score(y_test, preds)
print(f"F1-Score: {single_f1}")

F1-Score: 0.5605839416058395


In [10]:
# Define a K-fold CV class with the number of folds being 5, random state of 42, and shuffle being True.
from sklearn.model_selection import KFold

num_folds = 5
random_seed = 42

kfold = KFold(
    n_splits=num_folds,
    random_state=random_seed,
    shuffle=True
)

In [11]:
# Define a new Logistic Regression Model and define the X and Y's.
X = data.drop("Churn", axis=1)
y = data["Churn"]

lr = LogisticRegression(max_iter=300)

In [12]:
# For each of the 5 folds, get the F1 Score. Metrics: https://scikit-learn.org/stable/modules/model_evaluation.html
# Note: A list (or np.array()) should be returned.
from sklearn.model_selection import cross_val_score

results = cross_val_score(
    lr, X, y, 
    cv=kfold,
    scoring="f1"
)

## Theoretical Question

1. Intuitively, what do you think these 5 different scores represent?

The different f1 scores for each fold.

2. With this cross-validated model, say that someone asks you for a single F1-Score. What would you respond with?

The mean of the F1-Scores.

In [13]:
## Challenge: Without using Scikit-Learn's cross validation function, can you create your own?
# Note: The random library may be helpful depending on your implementation.

import random
import numpy as np

num_folds = 5

for idx, row in data.iterrows():
    data.loc[idx, "fold"] = random.randint(0, num_folds)
    
f1_scores = []
for fold in range(num_folds):
    
    testing_set = data[data["fold"] == fold]
    training_set = data[data["fold"] != fold]
    
    X_train = training_set.drop("Churn", axis=1)
    y_train = training_set["Churn"]
    
    X_test = testing_set.drop("Churn", axis=1)
    y_test = testing_set["Churn"]
    
    lr = LogisticRegression(max_iter=300)
    lr.fit(X_train, y_train)
    preds = lr.predict(X_test)
    curr_f1_score = f1_score(y_test, preds)
    f1_scores.append(curr_f1_score)

print(f"My CV score: {np.mean(f1_scores)}")

My CV score: 0.5930809647073727
