In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score, KFold

In [2]:
# Read in the dataset.
data = pd.read_csv("./data/customer_churn.csv")

In [4]:
# Remove columns that you believe are not relevant.
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [19]:
data.shape

(7043, 21)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [40]:
# The data type of column `TotalCharges` is an object. However, this should be a float.
# If you tried to turn this entire column into a float using .astype(float), this would not work.
# Figure out why this is the case, delete the rows that are throwing an error, and turn this column into a float.

In [12]:
# data['TotalCharges'].astype(float)
# new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
# ValueError: could not convert string to float: ' '
# I think this means there is an issue with a blank cell 

In [None]:
# filipa's solution
# data.loc[cond_tcharge_nonnumb, "TotalCharges"] = data.loc[cond_tcharge_nonnumb, "TotalCharges"].replace(" ", 0)

In [6]:
data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [17]:
data.eq(' ').sum() + data.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [None]:
# Tried to use the .dropna() to remove rows with blanks in TotalCharges, but this 
# did not work

In [5]:
data = data.replace(' ', None)

In [6]:
# why does this not show none values in TotalCharges?
data.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [7]:
data = data.dropna()

In [8]:
data.shape

(7032, 21)

In [9]:
data['TotalCharges_Float'] = data['TotalCharges'].astype(float)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customerID          7032 non-null   object 
 1   gender              7032 non-null   object 
 2   SeniorCitizen       7032 non-null   int64  
 3   Partner             7032 non-null   object 
 4   Dependents          7032 non-null   object 
 5   tenure              7032 non-null   int64  
 6   PhoneService        7032 non-null   object 
 7   MultipleLines       7032 non-null   object 
 8   InternetService     7032 non-null   object 
 9   OnlineSecurity      7032 non-null   object 
 10  OnlineBackup        7032 non-null   object 
 11  DeviceProtection    7032 non-null   object 
 12  TechSupport         7032 non-null   object 
 13  StreamingTV         7032 non-null   object 
 14  StreamingMovies     7032 non-null   object 
 15  Contract            7032 non-null   object 
 16  PaperlessBi

In [11]:
data.shape

(7032, 22)

In [12]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,TotalCharges_Float
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,29.85
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,1889.5
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,108.15
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,1840.75
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,151.65


In [13]:
# Now I will remove columns that I think are not relevant
data = data.drop(["customerID", "TotalCharges"], axis=1)

In [14]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn,TotalCharges_Float
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,No,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,No,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,Yes,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,No,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,Yes,151.65


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   gender              7032 non-null   object 
 1   SeniorCitizen       7032 non-null   int64  
 2   Partner             7032 non-null   object 
 3   Dependents          7032 non-null   object 
 4   tenure              7032 non-null   int64  
 5   PhoneService        7032 non-null   object 
 6   MultipleLines       7032 non-null   object 
 7   InternetService     7032 non-null   object 
 8   OnlineSecurity      7032 non-null   object 
 9   OnlineBackup        7032 non-null   object 
 10  DeviceProtection    7032 non-null   object 
 11  TechSupport         7032 non-null   object 
 12  StreamingTV         7032 non-null   object 
 13  StreamingMovies     7032 non-null   object 
 14  Contract            7032 non-null   object 
 15  PaperlessBilling    7032 non-null   object 
 16  PaymentMeth

In [18]:
# One-hot Encode the categorical variables except for `Churn`.
def _transform_column_into_dummies(dataframe, name_of_column):
    dummies = pd.get_dummies(dataframe[name_of_column], prefix=f"{name_of_column} = ")
    
    dataframe = pd.concat([dataframe, dummies], axis=1)
    
    dataframe = dataframe.drop([name_of_column], axis=1)
    
    return dataframe

In [19]:
data = _transform_column_into_dummies(data, "gender")
data = _transform_column_into_dummies(data, "SeniorCitizen")
data = _transform_column_into_dummies(data, "Partner")
data = _transform_column_into_dummies(data, "Dependents")
data = _transform_column_into_dummies(data, "PhoneService")
data = _transform_column_into_dummies(data, "MultipleLines")
data = _transform_column_into_dummies(data, "InternetService")
data = _transform_column_into_dummies(data, "OnlineSecurity")
data = _transform_column_into_dummies(data, "OnlineBackup")
data = _transform_column_into_dummies(data, "DeviceProtection")
data = _transform_column_into_dummies(data, "TechSupport")
data = _transform_column_into_dummies(data, "StreamingTV")
data = _transform_column_into_dummies(data, "StreamingMovies")
data = _transform_column_into_dummies(data, "Contract")
data = _transform_column_into_dummies(data, "PaperlessBilling")
data = _transform_column_into_dummies(data, "PaymentMethod")

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 47 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   tenure                                      7032 non-null   int64  
 1   MonthlyCharges                              7032 non-null   float64
 2   Churn                                       7032 non-null   object 
 3   TotalCharges_Float                          7032 non-null   float64
 4   gender = _Female                            7032 non-null   bool   
 5   gender = _Male                              7032 non-null   bool   
 6   SeniorCitizen = _0                          7032 non-null   bool   
 7   SeniorCitizen = _1                          7032 non-null   bool   
 8   Partner = _No                               7032 non-null   bool   
 9   Partner = _Yes                              7032 non-null   bool   
 10  Dependents = _No 

In [21]:
data.head()

Unnamed: 0,tenure,MonthlyCharges,Churn,TotalCharges_Float,gender = _Female,gender = _Male,SeniorCitizen = _0,SeniorCitizen = _1,Partner = _No,Partner = _Yes,...,StreamingMovies = _Yes,Contract = _Month-to-month,Contract = _One year,Contract = _Two year,PaperlessBilling = _No,PaperlessBilling = _Yes,PaymentMethod = _Bank transfer (automatic),PaymentMethod = _Credit card (automatic),PaymentMethod = _Electronic check,PaymentMethod = _Mailed check
0,1,29.85,No,29.85,True,False,True,False,False,True,...,False,True,False,False,False,True,False,False,True,False
1,34,56.95,No,1889.5,False,True,True,False,True,False,...,False,False,True,False,True,False,False,False,False,True
2,2,53.85,Yes,108.15,False,True,True,False,True,False,...,False,True,False,False,False,True,False,False,False,True
3,45,42.3,No,1840.75,False,True,True,False,True,False,...,False,False,True,False,True,False,True,False,False,False
4,2,70.7,Yes,151.65,True,False,True,False,True,False,...,False,True,False,False,False,True,False,False,True,False


In [46]:
# For the `Churn` column, transform a value of No to 0 and a value of Yes to 1
# Ex: If data.iloc[0]["Churn"] is "No", turn this value into a 0.

In [22]:
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

In [23]:
data.head()

Unnamed: 0,tenure,MonthlyCharges,Churn,TotalCharges_Float,gender = _Female,gender = _Male,SeniorCitizen = _0,SeniorCitizen = _1,Partner = _No,Partner = _Yes,...,StreamingMovies = _Yes,Contract = _Month-to-month,Contract = _One year,Contract = _Two year,PaperlessBilling = _No,PaperlessBilling = _Yes,PaymentMethod = _Bank transfer (automatic),PaymentMethod = _Credit card (automatic),PaymentMethod = _Electronic check,PaymentMethod = _Mailed check
0,1,29.85,0,29.85,True,False,True,False,False,True,...,False,True,False,False,False,True,False,False,True,False
1,34,56.95,0,1889.5,False,True,True,False,True,False,...,False,False,True,False,True,False,False,False,False,True
2,2,53.85,1,108.15,False,True,True,False,True,False,...,False,True,False,False,False,True,False,False,False,True
3,45,42.3,0,1840.75,False,True,True,False,True,False,...,False,False,True,False,True,False,True,False,False,False
4,2,70.7,1,151.65,True,False,True,False,True,False,...,False,True,False,False,False,True,False,False,True,False


In [55]:
# Split the dataset into a singular training/testing split using a random_state of 42 and using "Churn" as the target.

In [24]:
X = data.drop(["Churn"], axis =1)
y = data["Churn"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [1]:
# Train your model, predict on the test set, and get the F1-Score (feel free to use the scikit-learn f1-score metric).

In [25]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter = 1000)
lr.fit(X_train, y_train)

In [26]:
lr.intercept_

array([-0.13591423])

In [27]:
lr.coef_

array([[-5.97607760e-02,  4.26322852e-03,  3.10301611e-04,
        -5.10570398e-02, -8.25341995e-02, -1.92608679e-01,
         5.90174397e-02, -8.50194814e-02, -4.85717579e-02,
         5.13993716e-02, -1.84990611e-01,  1.30200248e-01,
        -2.63791487e-01, -2.26236172e-01,  1.30200248e-01,
        -3.75553151e-02, -3.77706253e-01,  3.47546938e-01,
        -1.03431925e-01,  2.13051276e-01, -1.03431925e-01,
        -2.43210590e-01,  5.17100565e-02, -1.03431925e-01,
        -8.18693708e-02,  2.16148613e-03, -1.03431925e-01,
        -3.23208004e-02,  2.06657443e-01, -1.03431925e-01,
        -2.36816758e-01, -1.08065022e-01, -1.03431925e-01,
         7.79057081e-02, -1.17510945e-01, -1.03431925e-01,
         8.73516304e-02,  6.15808514e-01, -2.15829448e-01,
        -5.33570306e-01, -2.05456887e-01,  7.18656475e-02,
        -3.87441575e-02, -1.93918873e-01,  2.03938345e-01,
        -1.04866553e-01]])

In [28]:
y_pred = lr.predict(X_test)

In [29]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred)

0.5630498533724341

In [30]:
# Define a K-fold CV class with the number of folds being 5, random state of 42, and shuffle being True.
from sklearn.model_selection import KFold

num_folds = 5
random_seed = 42

kfold = KFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=random_seed,
)

In [3]:
# Define a new Logistic Regression Model and define the X and Y's.

In [None]:
# not sure of this step - Filipa confirmed in lab this is a mistake

In [31]:
# For each of the 5 folds, get the F1 Score. Metrics: https://scikit-learn.org/stable/modules/model_evaluation.html
# Note: A list (or np.array()) should be returned.
from sklearn.model_selection import cross_val_score

In [32]:
results = cross_val_score(
    estimator = lr, 
    X = X_train, 
    y = y_train, 
    cv = kfold,
    scoring="f1" 
)

In [33]:
results

array([0.60915493, 0.61896243, 0.60833333, 0.58699473, 0.58906526])

In [34]:
# average performance
round(np.mean(results),3)

0.603

In [35]:
# std
round(np.std(results), 3)

0.012

In [93]:
0.6-(2*0.012)

0.576

## Theoretical Question

1. Intuitively, what do you think these 5 different scores represent?

f1_score of five logistic regression models run using k-fold cross validation

2. With this cross-validated model, say that someone asks you for a single F1-Score. What would you respond with?

Perhaps the f1_score that is closest to the mean

In [5]:
## Challenge: Without using Scikit-Learn's cross validation function, can you create your own?
# Note: The random library may be helpful depending on your implementation.