# Imbalanced Classification Project

## 1. Import the required libraries and load data

In [23]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score

# Load the dataset into a DataFrame (requires network access).
# NOTE(review): this is a shortened URL, so the file it resolves to cannot be
# verified from here — consider pinning a direct link or a local copy.
churn = pd.read_csv("https://bit.ly/2XZK7Bo")



## 2. Data cleaning and Preparation:
a. View first few records
b. Check the shape of data
c. Check for and deal with missing values
d. Check for and deal with duplicates

In [24]:
# Preview the first rows to sanity-check the column names and sample values

churn.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [25]:
# Size of the data before any cleaning, as a (rows, columns) tuple

churn.shape

(10000, 14)

In [26]:
# Count the missing values in each column

churn.isnull().sum()

RowNumber            0
CustomerId           0
Surname              0
CreditScore          0
Geography            0
Gender               0
Age                  0
Tenure             909
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
dtype: int64

In [27]:
# Discard every row that contains a missing value, then confirm that no nulls remain

churn = churn.dropna()

churn.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [28]:
# Count the rows that are exact duplicates of an earlier row

churn.duplicated().sum()

0

## 3. Splitting the data into training, validation and test sets

In [29]:
# Rescale the two monetary columns into units of 100,000. The result is written
# to a new frame instead of back into `churn`, so re-running this cell does not
# divide the values a second time.
churn_scaled = churn.assign(
    Balance=churn['Balance'] / 100000,
    EstimatedSalary=churn['EstimatedSalary'] / 100000,
)

# Remove the identifier/text columns and the target from the feature matrix.
# NOTE(review): RowNumber is still kept as a feature although it looks like a
# plain row identifier — confirm whether it should be dropped as well.
features = churn_scaled.drop(
    ['CustomerId', 'Surname', 'Geography', 'Gender', 'Exited'], axis=1
)
target = churn_scaled['Exited']

# Splitting the data into training, validation and test sets (ratio 60:20:20 used)
# First split into 2, training (60%) and remainder (40%)

features_train, features_remainder, target_train, target_remainder = train_test_split(
    features, target, test_size=0.4, random_state=12345
)

# Then split the remainder in half to get the validation and test sets

features_valid, features_test, target_valid, target_test = train_test_split(
    features_remainder, target_remainder, test_size=0.5, random_state=12345
)

# Print the shape of each split: features first, then targets
for split in (features_train, features_valid, features_test,
              target_train, target_valid, target_test):
    print(split.shape)



(5454, 9)
(1818, 9)
(1819, 9)
(5454,)
(1818,)
(1819,)


## 4. Check class balances

In [30]:
# Count the observations in each target class across the whole cleaned dataset
# (not just the training split)

churn['Exited'].value_counts()

0    7237
1    1854
Name: Exited, dtype: int64

Positive observations are far fewer than negative observations, accounting for only about 0.2 of the total (1,854 of 9,091).

## 5. Train the models without taking into consideration class imbalance

In [31]:
# Decision Tree: sweep max_depth from 1 to 10 and print the validation accuracy for each depth

for depth in range(1, 11):
    model = DecisionTreeClassifier(max_depth=depth, random_state=12345).fit(
        features_train, target_train
    )
    predicted_valid = model.predict(features_valid)
    print(accuracy_score(target_valid, predicted_valid))



0.7926292629262927
0.8305830583058306
0.8333333333333334
0.8509350935093509
0.8558855885588559
0.8498349834983498
0.8415841584158416
0.828932893289329
0.8267326732673267
0.8338833883388339


Testing the Decision Tree with max depths from 1 to 10, a max_depth of 5 gives the best validation accuracy of 0.8559.

In [32]:
# Random Forest: sweep n_estimators from 10 to 100 in steps of 10 (max_depth fixed at 5)
# and print the validation accuracy for each setting

for estimator in range(10, 110, 10):
    model = RandomForestClassifier(
        n_estimators=estimator, max_depth=5, random_state=12345
    ).fit(features_train, target_train)
    predicted_valid = model.predict(features_valid)
    print(accuracy_score(target_valid, predicted_valid))

0.8487348734873488
0.8487348734873488
0.8536853685368537
0.8558855885588559
0.8542354235423543
0.8509350935093509
0.8536853685368537
0.8547854785478548
0.8536853685368537
0.8547854785478548


Testing the Random Forest model with 10 to 100 estimators and a max_depth of 5, we find the best validation accuracy of 0.8559 at 40 trees.

In [33]:
# Logistic Regression on the original (imbalanced) training set; prints the validation accuracy

model = LogisticRegression(solver='liblinear', random_state=12345).fit(
    features_train, target_train
)
predicted_valid = model.predict(features_valid)
print(accuracy_score(target_valid, predicted_valid))

0.8036303630363036


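Logistic Regression gives a validation accuracy of 0.8036, which is barely above the roughly 0.80 that always predicting the majority class would achieve on data this imbalanced.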

## 6. Addressing the Class Imbalance:

### a) Upsampling

In [34]:
# Make the observations of the rare class less rare by repeating them several times over

def upsample(features, target, repeat):
    """Oversample the positive class by repeating its rows.

    Rows whose target equals 1 are included ``repeat`` times after the rows
    whose target equals 0, and the combined data is then shuffled with a
    fixed seed. Rows with any other target value are left out.

    Args:
        features: DataFrame of feature rows; assumed to share its index
            with ``target``.
        target: Series of class labels.
        repeat: number of copies of the positive-class rows to include.

    Returns:
        Tuple ``(features_upsampled, target_upsampled)``.
    """
    negative_mask = target == 0
    positive_mask = target == 1

    # One copy of the negatives followed by `repeat` copies of the positives
    feature_parts = [features[negative_mask]] + [features[positive_mask]] * repeat
    target_parts = [target[negative_mask]] + [target[positive_mask]] * repeat

    # Shuffle features and labels together so the repeated rows are not left in one contiguous run
    features_upsampled, target_upsampled = shuffle(
        pd.concat(feature_parts), pd.concat(target_parts), random_state=12345
    )
    return features_upsampled, target_upsampled


# Include the positive-class rows of the training set 10 times.
# NOTE(review): the printed shapes (5454 rows before, 15588 after) imply about
# 1126 positive and 4328 negative training rows, so repeat=10 makes the positive
# class the majority (~2.6:1); a repeat of about 4 would balance the classes.
features_upsampled, target_upsampled = upsample(
    features_train, target_train, 10
)

print(features_upsampled.shape)
print(target_upsampled.shape)

(15588, 9)
(15588,)


#### Train the different models using the upsampled data

In [35]:
# Decision Tree (max_depth=2) fitted on the upsampled training set; prints the validation F1

model = DecisionTreeClassifier(max_depth=2, random_state=12345).fit(
    features_upsampled, target_upsampled
)
predicted_valid = model.predict(features_valid)
print('F1:', f1_score(target_valid, predicted_valid))

F1: 0.5100788781770377


In [36]:
# Random Forest (110 trees, max_depth=15) fitted on the upsampled training set; prints the validation F1

model = RandomForestClassifier(
    n_estimators=110, max_depth=15, random_state=12345
).fit(features_upsampled, target_upsampled)
predicted_valid = model.predict(features_valid)
print('F1:', f1_score(target_valid, predicted_valid))

F1: 0.5830429732868757


In [37]:
# Logistic Regression fitted on the upsampled training set; prints the validation F1

model = LogisticRegression(solver='liblinear', random_state=12345).fit(
    features_upsampled, target_upsampled
)
predicted_valid = model.predict(features_valid)
print('F1:', f1_score(target_valid, predicted_valid))

F1: 0.3977961432506887


The best F1 of 0.583 is obtained using the Random Forest model with 110 estimators and a max_depth of 15.

### b) Downsampling:

In [38]:
# Make the observations of the most frequent class less frequent by applying a fraction:

def downsample(features, target, fraction):
    """Undersample the negative class by keeping a random fraction of its rows.

    A ``fraction`` of the rows whose target equals 0 is drawn with a fixed
    seed and placed in front of all rows whose target equals 1. The result
    is not shuffled: the sampled negatives come first, then the positives.
    Rows with any other target value are left out.

    Args:
        features: DataFrame of feature rows; assumed to share its index
            with ``target``.
        target: Series of class labels.
        fraction: share of the negative-class rows to keep (passed to
            ``sample(frac=...)``).

    Returns:
        Tuple ``(features_downsampled, target_downsampled)``.
    """
    is_negative = target == 0
    is_positive = target == 1

    # Both draws use the same seed on equally long inputs, which picks the same
    # row positions and so keeps the sampled features and labels aligned.
    kept_features = features[is_negative].sample(frac=fraction, random_state=12345)
    kept_target = target[is_negative].sample(frac=fraction, random_state=12345)

    features_downsampled = pd.concat([kept_features, features[is_positive]])
    target_downsampled = pd.concat([kept_target, target[is_positive]])

    return features_downsampled, target_downsampled


# Keep 10% of the negative-class rows of the training set.
# NOTE(review): the printed shape (1559 rows), together with the roughly 1126
# positive training rows implied by the upsampling output, means only about 433
# negatives are kept, so the positive class becomes the majority; a fraction of
# about 0.26 would balance the classes.
features_downsampled, target_downsampled = downsample(
    features_train, target_train, 0.1
)

print(features_downsampled.shape)
print(target_downsampled.shape)



(1559, 9)
(1559,)


#### Train the different models using the downsampled data

In [39]:
# Decision Tree (max_depth=4) fitted on the downsampled training set; prints the validation F1

model = DecisionTreeClassifier(max_depth=4, random_state=12345).fit(
    features_downsampled, target_downsampled
)
predicted_valid = model.predict(features_valid)
print('F1:', f1_score(target_valid, predicted_valid))




F1: 0.4705046197583511


In [40]:
# Random Forest (50 trees, max_depth=15) fitted on the downsampled training set; prints the validation F1

model = RandomForestClassifier(
    n_estimators=50, max_depth=15, random_state=12345
).fit(features_downsampled, target_downsampled)
predicted_valid = model.predict(features_valid)
print('F1:', f1_score(target_valid, predicted_valid))

F1: 0.45448676565526147


In [41]:
# Logistic Regression fitted on the downsampled training set; prints the validation F1

model = LogisticRegression(solver='liblinear', random_state=12345).fit(
    features_downsampled, target_downsampled
)
predicted_valid = model.predict(features_valid)
print('F1:', f1_score(target_valid, predicted_valid))

F1: 0.3901907356948229


All three models give poorer F1 scores when downsampling is used to address the class imbalance than when upsampling is used. Upsampling therefore addresses the class imbalance better in this case. The best model is the Random Forest.

## 7. Check the AUC-ROC:

In [42]:
# AUC-ROC on the validation set for the Random Forest trained on the upsampled data.

model = RandomForestClassifier(
    n_estimators=110, max_depth=15, random_state=12345
).fit(features_upsampled, target_upsampled)

# roc_auc_score needs scores rather than hard labels, so take column 1 of
# predict_proba as the score for class 1
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
auc_roc = roc_auc_score(target_valid, probabilities_one_valid)

print(auc_roc)

0.8277905300806065


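The AUC-ROC of the model is 0.8278, well above the 0.5 expected from a random classifier. It is not directly comparable with the F1 of 0.5830: AUC-ROC measures how well the predicted probabilities rank the two classes across all thresholds, whereas F1 is computed from the class predictions at a single threshold.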

## 8. Finally test the model using the test data set.

In [43]:
# Final check: Random Forest trained on the upsampled training data, scored by F1 on the held-out test set

model = RandomForestClassifier(
    n_estimators=110, max_depth=15, random_state=12345
).fit(features_upsampled, target_upsampled)
predicted_test = model.predict(features_test)
print('F1:', f1_score(target_test, predicted_test))

F1: 0.5429292929292929


An F1 score of 0.5429 is obtained when the model is tested using the test data set.