In [1]:
# Importing python libraries
#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set(style = 'darkgrid')
import requests
from io import StringIO
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import export_graphviz 
from IPython.display import Image  
import pydotplus
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
import warnings
warnings.filterwarnings(action = 'ignore')

In [2]:
df = pd.read_csv('../notebook/cleaned_donor_data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19372 entries, 0 to 19371
Data columns (total 50 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   control_number               19372 non-null  int64  
 1   donated                      19372 non-null  int64  
 2   amount_donated               19372 non-null  float64
 3   months_since_origin          19372 non-null  int64  
 4   donor_age                    19372 non-null  int64  
 5   in_house                     19372 non-null  int64  
 6   urbanicity                   19372 non-null  int64  
 7   ses                          19372 non-null  int64  
 8   cluster_code                 19372 non-null  int64  
 9   home_owner                   19372 non-null  int64  
 10  donor_gender                 19372 non-null  int64  
 11  income_group                 19372 non-null  int64  
 12  published_phone              19372 non-null  int64  
 13  overlay_source  

In [4]:
df.head()

Unnamed: 0,control_number,donated,amount_donated,months_since_origin,donor_age,in_house,urbanicity,ses,cluster_code,home_owner,...,lifetime_gift_range,lifetime_max_gift_amt,lifetime_min_gift_amt,last_gift_amt,card_prom_12,number_prom_12,months_since_last_gift,months_since_first_gift,file_avg_gift,file_card_gift
0,5,0,0.0,101,87,0,5,5,28,0,...,15.0,20.0,5.0,15.0,5,12,26,92,8.49,7
1,12,1,10.0,137,79,0,1,2,45,0,...,20.0,25.0,5.0,17.0,7,21,7,122,14.72,12
2,37,0,0.0,113,75,0,2,1,11,0,...,23.0,28.0,5.0,19.0,11,32,6,105,16.75,16
3,38,0,0.0,92,60,0,4,2,4,0,...,14.0,17.0,3.0,15.0,11,33,6,92,11.76,12
4,41,0,0.0,101,74,0,1,2,49,1,...,20.0,25.0,5.0,25.0,6,19,18,92,8.83,3


In [5]:
# I'm dropping these two columns as they aren't needed for the work going forward
df.drop(['amount_donated'], axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,control_number,donated,months_since_origin,donor_age,in_house,urbanicity,ses,cluster_code,home_owner,donor_gender,...,lifetime_gift_range,lifetime_max_gift_amt,lifetime_min_gift_amt,last_gift_amt,card_prom_12,number_prom_12,months_since_last_gift,months_since_first_gift,file_avg_gift,file_card_gift
0,5,0,101,87,0,5,5,28,0,1,...,15.0,20.0,5.0,15.0,5,12,26,92,8.49,7
1,12,1,137,79,0,1,2,45,0,1,...,20.0,25.0,5.0,17.0,7,21,7,122,14.72,12
2,37,0,113,75,0,2,1,11,0,0,...,23.0,28.0,5.0,19.0,11,32,6,105,16.75,16
3,38,0,92,60,0,4,2,4,0,0,...,14.0,17.0,3.0,15.0,11,33,6,92,11.76,12
4,41,0,101,74,0,1,2,49,1,0,...,20.0,25.0,5.0,25.0,6,19,18,92,8.83,3


In [7]:
df.set_index('control_number', inplace = True)

# Data Modelling

## RandomForest Classifier

In [8]:
# Selecting the independent variables and the target variable
#
y = df['donated']
X = df.drop('donated', axis = 1)


# Splitting the data into training and testing sets
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Standardising the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Instantiating the model
#
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)

# Making predictions
#
y_pred = rf.predict(X_test)

# Measuring the accuracy of the model
#
print(f'The accuracy score is: {accuracy_score(y_test, y_pred)} and the f1 score is {f1_score(y_test, y_pred)}')
print('\n')
print(f'{classification_report(y_test, y_pred)}')
confusion_matrix(y_test, y_pred)

The accuracy score is: 0.7465588437715073 and the f1 score is 0.06713109563014566


              precision    recall  f1-score   support

           0       0.75      0.99      0.85      4336
           1       0.51      0.04      0.07      1476

    accuracy                           0.75      5812
   macro avg       0.63      0.51      0.46      5812
weighted avg       0.69      0.75      0.65      5812



array([[4286,   50],
       [1423,   53]], dtype=int64)

## Feature Importance

In [None]:
# Checking for the most important features that contribute most in predicting the target
# Creating a dataframe of features and their respective importances
#
impo_df = pd.DataFrame({'feature': X.columns, 'importance': np.round(rf.feature_importances_, 4)}).set_index('feature').sort_values(by = 'importance', ascending = False)
impo_df

### Visualising the most important features

In [None]:
# Creating a bar chart of feature importance in descending order
#
impo_df = impo_df[:20].sort_values(by = 'importance', ascending = True)
impo_df.plot(kind = 'barh', figsize = (10, 10), color = 'purple')
plt.legend(loc = 'center right')
plt.title('Bar chart showing feature importance', color = 'indigo', fontsize = 14)
plt.xlabel('Features', fontsize = 12, color = 'indigo')
plt.show()

*From the above feature importance barchart, it can be observed that:*

*   The most important feature in predicting whether a person will donate or not is **median home value**


### Remodelling with the most important features only

In [None]:
print(impo_df)

In [None]:
# Selecting only important features and the y variable
#

y = df['donated']
X = df[['recent_avg_card_gift_amt','months_since_last_gift','number_prom_12','lifetime_card_prom', 'recent_avg_gift_amt',
'months_since_first_gift', 'lifetime_prom', 'recent_response_prop', 'cluster_code', 'lifetime_gift_amount', 'lifetime_avg_gift_amt', 
'file_avg_gift', 'pct_attribute3', 'pct_attribute2', 'donor_age', 'pct_attribute4', 'pct_owner_occupied', 'median_household_income', 
'per_capita_income', 'median_home_value']]

# Splitting the data into training and testing sets
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Instantiating the model
#
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)

# Making predictions
#
y_pred = rf.predict(X_test)

# Measuring the accuracy of the model
#
print(f'The accuracy score is: {accuracy_score(y_test, y_pred)} and the f1 score is {f1_score(y_test, y_pred)}')
print('\n')
print(f'{classification_report(y_test, y_pred)}')
confusion_matrix(y_test, y_pred)

*There is no  marginal decrease in f1 score*

### Parameter Tuning

In [None]:
# Previewing the parameters to tune
#
RandomForestClassifier()

In [None]:
# Creating a dictionary of parameters to tune
#
params = {'n_estimators': [10, 20, 30, 50, 100],
         'max_depth': [1, 2, 3, 4, 5]}

# Setting the number of folds to 10 and instantiating the model
# 
kfold = KFold(n_splits=10, shuffle=True)
model = RandomForestClassifier()

search = GridSearchCV(model, param_grid=params, scoring = 'f1', cv = kfold)

# Fitting the grid search with the X and the y variables
#
search.fit(X, y)

# Checking for the best parameters
#
print(f'The best parameters are: {search.best_params_}')

In [None]:
# Applying the best parameters to the model
#
# Selecting only important features and the y variable
#
y = df['donated']
X = df[['recent_avg_card_gift_amt','months_since_last_gift','number_prom_12','lifetime_card_prom', 'recent_avg_gift_amt',
'months_since_first_gift', 'lifetime_prom', 'recent_response_prop', 'cluster_code', 'lifetime_gift_amount', 'lifetime_avg_gift_amt', 
'file_avg_gift', 'pct_attribute3', 'pct_attribute2', 'donor_age', 'pct_attribute4', 'pct_owner_occupied', 'median_household_income', 
'per_capita_income', 'median_home_value']]

# Splitting the data into training and testing sets
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Instantiating the model
#
rf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state = 0)
rf.fit(X_train, y_train)

# Making predictions
#
y_pred = rf.predict(X_test)

# Measuring the accuracy of the model
#
print(f'The accuracy score is: {accuracy_score(y_test, y_pred)} and the f1 score is {f1_score(y_test, y_pred)}')
print('\n')
print(f'{classification_report(y_test, y_pred)}')
confusion_matrix(y_test, y_pred)

*Parameter tuning hasn't decreased or increased the f1 score*

### Cross Validation to check for the stability of the model

In [None]:
# Performing cross validation of ten folds
#
scores = cross_val_score(rf, X, y, scoring = 'f1', cv = 10)

# Calculating the mean of the cross validation scores
#
print(f'Mean of cross validation scores is {scores.mean()}')

# Calculating the variance of the cross validation scores from the mean
#
print(f'Standard deviation of the cross validation scores is {scores.std()}')

*The true f1 score of the model is approximately 99%, with a variance of 0.000413.*

## AdaBoostClassifier

In [None]:
# Selecting the independent variables and the target variable
#
# Selecting the independent variables and the target variable
#
y = df['donated']
X = df.drop('donated', axis = 1)

# Splitting the data into training and testing sets
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Instantiating the model
#
ada = AdaBoostClassifier(random_state = 0)
ada.fit(X_train, y_train)

# Making predictions
#
y_pred = ada.predict(X_test)

# Measuring the accuracy of the model
#
acc_1 = accuracy_score(y_test, y_pred)
f1_1 = f1_score(y_test, y_pred)

print(f'The accuracy score is: {acc_1} and the f1 score is {f1_1}')
print('\n')
print(f'{classification_report(y_test, y_pred)}')
confusion_matrix(y_test, y_pred)

In [None]:
# Checking for the most important features that contribute most in predicting the target
# Creating a dataframe of features and their respective importances
#
impo_df = pd.DataFrame({'feature': X.columns, 'importance': ada.feature_importances_}).set_index('feature').sort_values(by = 'importance', ascending = False)
impo_df

### Visualising the most important features

In [None]:
# Creating a bar chart of feature importance in descending order
#
impo_df = impo_df[:26].sort_values(by = 'importance', ascending = True)
impo_df.plot(kind = 'barh', figsize = (10, 7), color = 'purple')
plt.legend(loc = 'center right')
plt.title('Bar chart showing feature importance', color = 'indigo', fontsize = 14)
plt.xlabel('Features', fontsize = 12, color = 'indigo')
plt.show()

In [None]:
impo_df['importance'].to_list


In [None]:

[['mor_hit_rate', 'file_card_gift', 'months_since_origin', 'file_avg_gift', 'recent_response_prop', 'recent_avg_gift_amt', 
'recent_card_response_prop', 'months_since_first_gift', 'months_since_last_prom_resp', 'months_since_last_gift', 'lifetime_card_prom',
'pct_attribute4', 'card_prom_12lifetime_min_gift_amt', 'cluster_code', 'number_prom_12', 'income_group', 'recent_star_status',
'median_household_income', 'lifetime_max_gift_amt', 'pct_owner_occupied', 'frequency_status_97nk', 'last_gift_amt', 'donor_age', 
'median_home_value', 'recent_avg_card_gift_am']]


*When compairing the most important features between the AdaBoostClassifier and RandomForestClassier; Ada uses fewer features and makes a better prediction. Ada only uses 10 features while RandomForest uses 12 features*

### Remodelling with the most important features only

In [None]:
# Selecting only important features and the y variable
#
y = df['donated']
X = df[['mor_hit_rate', 'file_card_gift', 'months_since_origin', 'file_avg_gift', 'recent_response_prop', 'recent_avg_gift_amt', 
'recent_card_response_prop', 'months_since_first_gift', 'months_since_last_prom_resp', 'months_since_last_gift', 'lifetime_card_prom',
'pct_attribute4', 'card_prom_12','lifetime_min_gift_amt', 'cluster_code', 'number_prom_12', 'income_group', 'recent_star_status',
'median_household_income', 'lifetime_max_gift_amt', 'pct_owner_occupied', 'frequency_status_97nk', 'last_gift_amt', 'donor_age', 
'median_home_value', 'recent_avg_card_gift_amt']]

# Splitting the data into training and testing sets
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Instantiating the model
#
ada = AdaBoostClassifier(random_state = 0)
ada.fit(X_train, y_train)

# Making predictions
#
y_pred = ada.predict(X_test)

# Measuring the accuracy of the model
#
print(f'The accuracy score is: {accuracy_score(y_test, y_pred)} and the f1 score is {f1_score(y_test, y_pred)}')
print('\n')
print(f'{classification_report(y_test, y_pred)}')
confusion_matrix(y_test, y_pred)

### Hyperparameter Tuning

In [None]:
# Previewing the parameters to tune
#
AdaBoostClassifier()

In [None]:
# Creating a dictionary of parameters to tune
#
params = {'n_estimators': [10, 20, 30, 50, 100],
         'learning_rate': [1, 2, 3, 4, 5]}

# Setting the number of folds to 10 and instantiating the model
# 
kfold = KFold(n_splits=10, shuffle=True)
model = AdaBoostClassifier()

search = GridSearchCV(model, param_grid=params, scoring = 'f1', cv = kfold)

# Fitting the grid search with the X and the y variables
#
search.fit(X, y)

# Checking for the best parameters
#
print(f'The best parameters are: {search.best_params_}')

In [None]:
# Selecting only important features and the y variable
#
y = df['donated']
X = df[['mor_hit_rate', 'file_card_gift', 'months_since_origin', 'file_avg_gift', 'recent_response_prop', 'recent_avg_gift_amt', 
'recent_card_response_prop', 'months_since_first_gift', 'months_since_last_prom_resp', 'months_since_last_gift', 'lifetime_card_prom',
'pct_attribute4', 'card_prom_12','lifetime_min_gift_amt', 'cluster_code', 'number_prom_12', 'income_group', 'recent_star_status',
'median_household_income', 'lifetime_max_gift_amt', 'pct_owner_occupied', 'frequency_status_97nk', 'last_gift_amt', 'donor_age', 
'median_home_value', 'recent_avg_card_gift_amt']]
# Splitting the data into training and testing sets
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Instantiating the model
#
ada = AdaBoostClassifier(learning_rate=3, n_estimators=10, random_state = 0)
ada.fit(X_train, y_train)

# Making predictions
#
y_pred = ada.predict(X_test)

# Measuring the accuracy of the model
#
print(f'The accuracy score is: {accuracy_score(y_test, y_pred)} and the f1 score is {f1_score(y_test, y_pred)}')
print('\n')
print(f'{classification_report(y_test, y_pred)}')
confusion_matrix(y_test, y_pred)

In [None]:
dot_data = StringIO()
# pick a specific tree from the forest
tree = ada.estimators_[3]

export_graphviz(tree, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,feature_names = X.columns)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

### Cross Validation to check for the stability of the model

In [None]:
# Performing cross validation of ten folds
#
scores = cross_val_score(ada, X, y, scoring = 'f1', cv = 10)

# Calculating the mean of the cross validation scores
#
print(f'Mean of cross validation scores is {scores.mean()}')

# Calculating the variance of the cross validation scores from the mean
#
print(f'Standard deviation of the cross validation scores is {scores.std()}')

*When comparing Ada and RandomForest, Ada has a general f1 score of 87% while RandomForest has a general f1 score of 83%. Therefore AdaBoostClassifier is a better predicter than RandomForestClassifier for this dataset*

### Comparing svc kernels to find the one with a maximum f1 score - using only two features

## Challenging the solution

*Using different models to check whether performance can be improved*

In [None]:
# Importing different classification models
#
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Creating a list of classifier algorithms to compare with
#
models = [GradientBoostingClassifier(), DecisionTreeClassifier(), XGBClassifier(), KNeighborsClassifier(),\
          GaussianNB(), LogisticRegression()]

# Creating lists of the algorithms, to store the accuracy scores of each fold
#
GradientBoosting, DecisionTree, XGB, KNeighbors, GaussianNB, LogisticRegression = ([] for x in range(6))

# Creating a list containig the list of each algorithm. Created for easy iteration
#
model_list = [GradientBoosting, DecisionTree, XGB, KNeighbors, GaussianNB, LogisticRegression]

# Selecting the independent variables and the target variable
#
y = df['donated']
X = df.drop('donated', axis = 1)

# Creating a cross validation of 10 folds
#
kfold  = KFold(n_splits=10, shuffle = True, random_state = 0)

# Iterating through each model and appending the scores of each fold to the appriopriate list
#
for i, j in zip(models, model_list):
  j.extend(list(cross_val_score(i, X, y, scoring = 'f1', cv = kfold)))
  
# Creating a dataframe of all the scores from the iterations for each model
#
scores = pd.DataFrame({'GradientBoosting': GradientBoosting, 'DecisionTree': DecisionTree, 'XGB': XGB,\
              'KNeighbors': KNeighbors, 'GaussianNB': GaussianNB, 'LogisticRegression': LogisticRegression}, index = range(1, 11))

# Calculating the mean and standard deviation score of each algorithm
#
scores.loc['mean'] = scores.mean()
scores.loc['std'] = scores.std()

# Previewing the scores dataframe
#
scores

In [None]:
# Finding the model with the highest accuracy
#
scores.loc['mean'].idxmax()

*An alternative model that can match or outperform the AdaBoostClassifier is the XGBClassifier. More data is needed to increase the predictive power of the model. As the data is highky imbalanced, The f1 score metric of success has been used*

## Conclusion

* *Out of all the models used to predict whether a person has hypothyroid, the AdaBoostClassifier performs well with an f1 score of approximately 86%*


*  *The best performing kernel in the SupportVectorClassifier is Linear with an accuracy score 98.5% of and an f1 score of 86.%*


