# Step 1 - Import and Explore

In [1]:
import pandas as pd

In [2]:
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [5]:
df = pd.read_csv('Churn.csv')

In [6]:
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5.0,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10.0,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7.0,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3.0,75075.31,2,1,0,92888.52,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             9091 non-null float64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [8]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,9091.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,4.99769,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.894723,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,2.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


With an initial look at the data, we might want to consider standardizing some of the numerical variables. I'll dive into this a bit more in the next section. Before that, I don't see RowNumber as an important variable for this task, nor is CustomerId or Surname -none of these would logically have an impact on how long they stay with the bank. 

This leaves only Gender and Geography in as a categorical variable, which I'll want to create dummy variables for so whatever algorithm we use can process it. 

Another variable of note is Tenure, which has missing data. Due to the nature of the metrics, it should be safe to assume these missing values should be simply 0s in our case, which I will go ahead and replace them with. 

# Step 2 - Clean and Preprocess Data

In [9]:
df['Tenure'].fillna(0, inplace=True)

In [10]:
df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)

In [11]:
df = pd.get_dummies(df, columns=['Geography', 'Gender'], drop_first=True)

In [12]:
df.tail()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
9995,771,39,5.0,0.0,2,1,0,96270.64,0,0,0,1
9996,516,35,10.0,57369.61,1,1,1,101699.77,0,0,0,1
9997,709,36,7.0,0.0,1,0,1,42085.58,1,0,0,0
9998,772,42,3.0,75075.31,2,1,0,92888.52,1,1,0,1
9999,792,28,0.0,130142.79,1,1,0,38190.78,0,0,0,0


In [13]:
X = df.drop('Exited', axis=1)
y = df['Exited']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=47)

In [15]:
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.50, random_state=47)

With each of the numeric columns downsampled and the categorical columns one-hot-encoded, we are now able to properly train and test a model. Our features (X), and target (y) are ready to be input into a variety of models to find the best possible accuracy. 

We've also created an initial training set, validation set, and final testing set. With all this in place, we are going to explore three different potential models and tune accordingly. 

# Step 3 - Hyperparameter Tuning and Model Selection

In [16]:
model = LogisticRegression(random_state=47, solver='liblinear')
model.fit(X_train, y_train)
score = model.score(X_valid, y_valid)
print('Accuracy:', score)

Accuracy: 0.7865


In [17]:
prob_valid = model.predict_proba(X_valid)
prob_one_valid = prob_valid[:, 1]

auc_roc = roc_auc_score(y_valid, prob_one_valid)

print('AUC ROC:', auc_roc)

AUC ROC: 0.6562818691724942


In [18]:
predicted_valid = model.predict(X_valid)

f1 = f1_score(y_valid, predicted_valid)
print('F1 Score', f1)

F1 Score 0.10855949895615866


In [19]:
for depth in range(1, 7):
    model = DecisionTreeClassifier(max_depth=depth, random_state=47)
    model.fit(X_train, y_train)
    score = model.score(X_valid, y_valid)
    print("max_depth=", depth, ":", score)

max_depth= 1 : 0.792
max_depth= 2 : 0.842
max_depth= 3 : 0.831
max_depth= 4 : 0.848
max_depth= 5 : 0.8535
max_depth= 6 : 0.8555


In [20]:
prob_valid = model.predict_proba(X_valid)
prob_one_valid = prob_valid[:, 1]

auc_roc = roc_auc_score(y_valid, prob_one_valid)

print('AUC ROC:', auc_roc)

AUC ROC: 0.8291751651126652


In [21]:
predicted_valid = model.predict(X_valid)

f1 = f1_score(y_valid, predicted_valid)
print('F1 Score', f1)

F1 Score 0.5587786259541986


In [22]:
for estims in range(10, 61, 10):
    model = RandomForestClassifier(random_state=47, n_estimators=estims)
    model.fit(X_train, y_train)
    score = model.score(X_valid, y_valid)
    print("n_estimators=", estims, ":", score)

n_estimators= 10 : 0.844
n_estimators= 20 : 0.856
n_estimators= 30 : 0.8585
n_estimators= 40 : 0.855
n_estimators= 50 : 0.8585
n_estimators= 60 : 0.857


In [23]:
model = RandomForestClassifier(random_state=47, n_estimators=50)
model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=50, random_state=47)

In [24]:
prob_valid = model.predict_proba(X_valid)
prob_one_valid = prob_valid[:, 1]

auc_roc = roc_auc_score(y_valid, prob_one_valid)

print('AUC ROC:', auc_roc)

AUC ROC: 0.8395394752816627


In [25]:
predicted_valid = model.predict(X_valid)

f1 = f1_score(y_valid, predicted_valid)
print('F1 Score', f1)

F1 Score 0.5844346549192365


Since we were aiming for an F1 of at least 0.59, we have failed thus far to reach our goal. While we will definitely want to explore some feature normalization using both a standard scaler and downsampling, we will first look to further tune the random forest model by testing some different max_depths for our tree.  

In [26]:
for depth in range(10, 101, 10):
    model = RandomForestClassifier(random_state=47, n_estimators=50, max_depth=depth)
    model.fit(X_train, y_train)
    predicted_valid = model.predict(X_valid)
    f1 = f1_score(y_valid, predicted_valid)
    print("max_depth=", depth, ":", f1)

max_depth= 10 : 0.5574803149606299
max_depth= 20 : 0.5970149253731344
max_depth= 30 : 0.5844346549192365
max_depth= 40 : 0.5844346549192365
max_depth= 50 : 0.5844346549192365
max_depth= 60 : 0.5844346549192365
max_depth= 70 : 0.5844346549192365
max_depth= 80 : 0.5844346549192365
max_depth= 90 : 0.5844346549192365
max_depth= 100 : 0.5844346549192365


In [27]:
model = RandomForestClassifier(random_state=47, n_estimators=50, max_depth=20)
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=20, n_estimators=50, random_state=47)

In [28]:
model.score(X_test, y_test)

0.8595

In [29]:
prob_test = model.predict_proba(X_test)
prob_one_test = prob_test[:, 1]

auc_roc = roc_auc_score(y_test, prob_one_test)
print(auc_roc)

0.823242556219711


In [30]:
predicted_test = model.predict(X_test)

f1 = f1_score(y_test, predicted_test)
print(f1)

0.5629860031104199


After our hyperparameter tuning, we are able to achieve an f1 score of 0.59 with a max_depth of 20 and n_estimators at 50 for our Random Forest Classifier. However, as you might expect with a testing set, this score falls on the test data.  

With that in mind, we'll look at balancing our numeric classes to see if we can obtain a better score. 

# Step 4 - Training with Standard Scaler

In [31]:
numeric = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [32]:
scaler = StandardScaler()

In [33]:
scaler.fit(X_train[numeric])

StandardScaler()

In [34]:
X_train_scaled = scaler.transform(X_train[numeric])
X_valid_scaled = scaler.transform(X_valid[numeric])

In [35]:
model = RandomForestClassifier(random_state=47, n_estimators=50, max_depth=20)
model.fit(X_train_scaled, y_train)

RandomForestClassifier(max_depth=20, n_estimators=50, random_state=47)

In [36]:
prob_valid = model.predict_proba(X_valid_scaled)
prob_one_valid = prob_valid[:, 1]

auc_roc = roc_auc_score(y_valid, prob_one_valid)

print('AUC ROC:', auc_roc)

AUC ROC: 0.8043513561091686


In [37]:
predicted_valid = model.predict(X_valid_scaled)

f1 = f1_score(y_valid, predicted_valid)
print('F1 Score:', f1)

F1 Score: 0.4809384164222873


It appears that, if we stay with our original model, a standard scaler actually reduces our accuracy. With that in mind, we will try downsampling to rebalance our classes next. 

# Step 5 - Training with Downsampling

In [38]:
def downsample(X, y, fraction):
    X_zeros = X[y == 0]
    X_ones = X[y == 1]
    y_zeros = y[y == 0]
    y_ones = y[y == 1]

    X_downsampled = pd.concat(
        [X_zeros.sample(frac=fraction, random_state=47)] + [X_ones])
    y_downsampled = pd.concat(
        [y_zeros.sample(frac=fraction, random_state=47)] + [y_ones])
    
    X_downsampled, y_downsampled = shuffle(
        X_downsampled, y_downsampled, random_state=47)
    
    return X_downsampled, y_downsampled

In [39]:
X_downsampled, y_downsampled = downsample(X_train, y_train, 0.5)

In [40]:
model = RandomForestClassifier(random_state=47, n_estimators=50, max_depth=20)
model.fit(X_downsampled, y_downsampled)

RandomForestClassifier(max_depth=20, n_estimators=50, random_state=47)

In [41]:
prob_valid = model.predict_proba(X_valid)
prob_one_valid = prob_valid[:, 1]

auc_roc = roc_auc_score(y_valid, prob_one_valid)

print('AUC ROC:', auc_roc)

AUC ROC: 0.8490577347999223


In [42]:
predicted_valid = model.predict(X_valid)

f1 = f1_score(y_valid, predicted_valid)
print('F1 Score:', f1)

F1 Score: 0.6028368794326242


In [43]:
predicted_test = model.predict(X_test)

f1 = f1_score(y_test, predicted_test)
print(f1)

0.5797807551766139


Before moving forward, we want to consider our hyperparameter tuning on the random forest classifier. Downsampling does improve our performance, and reworking the model parameters based on this may result in further improvements. 

Prior this however, we will attempt upsampling. 

# Step 6 - Upsampling the Data

In [44]:
def upsample(X, y, repeat):
    X_zeros = X[y == 0]
    X_ones = X[y == 1]
    y_zeros = y[y == 0]
    y_ones = y[y == 1]

    X_upsampled = pd.concat([X_zeros] + [X_ones] * repeat)
    y_upsampled = pd.concat([y_zeros] + [y_ones] * repeat)
    
    X_upsampled, y_upsampled = shuffle(
        X_upsampled, y_upsampled, random_state=47)
    
    return X_upsampled, y_upsampled

In [45]:
X_upsampled, y_upsampled = upsample(X_train, y_train, 100)

In [46]:
model = RandomForestClassifier(random_state=47, n_estimators=50, max_depth=20)
model.fit(X_upsampled, y_upsampled)

RandomForestClassifier(max_depth=20, n_estimators=50, random_state=47)

In [47]:
prob_valid = model.predict_proba(X_valid)
prob_one_valid = prob_valid[:, 1]

auc_roc = roc_auc_score(y_valid, prob_one_valid)

print('AUC ROC:', auc_roc)

AUC ROC: 0.836015655351593


In [48]:
predicted_valid = model.predict(X_valid)

f1 = f1_score(y_valid, predicted_valid)
print('F1 Score:', f1)

F1 Score: 0.5849056603773585


In [49]:
predicted_test = model.predict(X_test)

f1 = f1_score(y_test, predicted_test)
print('Test f1 Score:', f1)

Test f1 Score: 0.5646766169154229


While we could try adjusting our hyperparameters further, it appears downsampling worked better overall than upsampling. For this reason, we'll proceed to test some different parameters with our downsamping. 

# Step 7 - Trying New Hyperparameters

In [50]:
for estims in range(100, 1001, 100):
    model = RandomForestClassifier(random_state=47, n_estimators=estims)
    model.fit(X_downsampled, y_downsampled)
    predicted_valid = model.predict(X_valid)
    f1 = f1_score(y_valid, predicted_valid)
    print("n_estimators=", estims, ":", f1)

n_estimators= 100 : 0.6063454759106932
n_estimators= 200 : 0.6103286384976525
n_estimators= 300 : 0.608695652173913
n_estimators= 400 : 0.6081871345029239
n_estimators= 500 : 0.6072684642438452
n_estimators= 600 : 0.6081871345029239
n_estimators= 700 : 0.6065573770491802
n_estimators= 800 : 0.6051401869158878
n_estimators= 900 : 0.6051401869158878
n_estimators= 1000 : 0.6107226107226108


In [51]:
for depth in range(10, 101, 10):
    model = RandomForestClassifier(random_state=47, n_estimators=1000, max_depth=depth)
    model.fit(X_downsampled, y_downsampled)
    predicted_valid = model.predict(X_valid)
    f1 = f1_score(y_valid, predicted_valid)
    print("max_depth=", depth, ":", f1)

max_depth= 10 : 0.6179640718562875
max_depth= 20 : 0.6157461809635723
max_depth= 30 : 0.6107226107226108
max_depth= 40 : 0.6107226107226108
max_depth= 50 : 0.6107226107226108
max_depth= 60 : 0.6107226107226108
max_depth= 70 : 0.6107226107226108
max_depth= 80 : 0.6107226107226108
max_depth= 90 : 0.6107226107226108
max_depth= 100 : 0.6107226107226108


Since higher max depth seems to have little impact, I wanted to further test this with max depths lower than 20. 

In [52]:
for depth in range(2, 19, 2):
    model = RandomForestClassifier(random_state=47, n_estimators=1000, max_depth=depth)
    model.fit(X_downsampled, y_downsampled)
    predicted_valid = model.predict(X_valid)
    f1 = f1_score(y_valid, predicted_valid)
    print("max_depth=", depth, ":", f1)

max_depth= 2 : 0.3699633699633699
max_depth= 4 : 0.5885558583106267
max_depth= 6 : 0.6183699870633894
max_depth= 8 : 0.6262376237623762
max_depth= 10 : 0.6179640718562875
max_depth= 12 : 0.6148325358851674
max_depth= 14 : 0.6124852767962309
max_depth= 16 : 0.6130177514792899
max_depth= 18 : 0.6226415094339622


In [53]:
model = RandomForestClassifier(random_state=47, n_estimators=1000, max_depth=8)
model.fit(X_downsampled, y_downsampled)

RandomForestClassifier(max_depth=8, n_estimators=1000, random_state=47)

With a max_depth of 8 and number of estimators set at 1000, our f1 score of .62 on the initial data suggests we might achieve our target score on test data. Knowing this, we proceed to test our data. 

# Step 8 - Testing Our Final Model

In [54]:
model.score(X_test, y_test)

0.841

In [55]:
prob_test = model.predict_proba(X_test)
prob_one_test = prob_test[:, 1]

auc_roc = roc_auc_score(y_test, prob_one_test)
print('Final Auc_Roc', auc_roc)

Final Auc_Roc 0.8422258856944861


In [56]:
predicted_test = model.predict(X_test)

f1 = f1_score(y_test, predicted_test)
print('Test F1', f1)

Test F1 0.5943877551020408


With our data preprocessing and model hyperparameter testing, we are able to maintain an F1 of .59 and AUC of 0.84 while also keeping a 0.84 accuracy score on the model. This should give us a better way to predict customer churn and help devise strategies to retain these customers longer. 