# Financial Inclusion in Africa

This notebook covers the following sections
- Loading all datasets provided
- Data preprocessing and wrangling
- Creating multiple models and picking the best solution
- Making improvements to chosen model
- Making submissions

## Importing libraries and modules and loading datasets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, plot_confusion_matrix

In [2]:
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')
variables = pd.read_csv('VariableDefinitions.csv')

In [3]:
# Another quick look at the dataset
train_df

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23519,Uganda,2018,uniqueid_2113,No,Rural,Yes,4,48,Female,Head of Household,Divorced/Seperated,No formal education,Other Income
23520,Uganda,2018,uniqueid_2114,No,Rural,Yes,2,27,Female,Head of Household,Single/Never Married,Secondary education,Other Income
23521,Uganda,2018,uniqueid_2115,No,Rural,Yes,5,27,Female,Parent,Widowed,Primary education,Other Income
23522,Uganda,2018,uniqueid_2116,No,Urban,Yes,7,30,Female,Parent,Divorced/Seperated,Secondary education,Self employed


In [4]:
train_df.isnull().sum()

# No null values in the dataset, so I can go ahead with data prepocessing and preparation

country                   0
year                      0
uniqueid                  0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

In [5]:
train_df.dtypes

country                   object
year                       int64
uniqueid                  object
bank_account              object
location_type             object
cellphone_access          object
household_size             int64
age_of_respondent          int64
gender_of_respondent      object
relationship_with_head    object
marital_status            object
education_level           object
job_type                  object
dtype: object

In [6]:
# encoding for the following columns relationship_with_head, marital_status, education_level, job_type

print(train_df['relationship_with_head'].unique())
print(train_df['marital_status'].unique())
print(train_df['education_level'].unique())
print(train_df['job_type'].unique())

['Spouse' 'Head of Household' 'Other relative' 'Child' 'Parent'
 'Other non-relatives']
['Married/Living together' 'Widowed' 'Single/Never Married'
 'Divorced/Seperated' 'Dont know']
['Secondary education' 'No formal education'
 'Vocational/Specialised training' 'Primary education'
 'Tertiary education' 'Other/Dont know/RTA']
['Self employed' 'Government Dependent' 'Formally employed Private'
 'Informally employed' 'Formally employed Government'
 'Farming and Fishing' 'Remittance Dependent' 'Other Income'
 'Dont Know/Refuse to answer' 'No Income']


In [7]:
# Develop a function that does the preprocessing

def prepare_data(data):

    # changing year, household_size and age of respondents to float
    data['year'] = data['year'].astype(float)
    data['household_size'] = data['household_size'].astype(float)
    data['age_of_respondent'] = data['age_of_respondent'].astype(float)
    
    # label encoding for the following columns location_type, cellphone_access, gender_of_respondent
    le = LabelEncoder()

    data['location_type'] = le.fit_transform(data['location_type'])
    data['cellphone_access'] = le.fit_transform(data['cellphone_access'])
    data['gender_of_respondent'] = le.fit_transform(data['gender_of_respondent'])
    
    # Labeling other categorical data using the map function
    data['relationship_with_head'] = data['relationship_with_head'].map(
        {'Spouse':0, 
         'Head of Household':1, 
         'Other relative':2, 
         'Child':3, 
         'Parent':4, 
         'Other non-relatives':5})
    data['marital_status'] = data['marital_status'].map(
        {'Married/Living together': 0, 
         'Widowed': 1, 
         'Single/Never Married' :2, 
         'Divorced/Seperated': 3, 
         'Dont know':4})
    data['education_level'] = data['education_level'].map(
        {'Secondary education': 0, 
         'No formal education': 1, 
         'Vocational/Specialised training': 2, 
         'Primary education':3, 
         'Tertiary education':4, 
         'Other/Dont know/RTA':5})
    data['job_type'] = data['job_type'].map(
        {'Self employed':0, 
         'Government Dependent':1, 
         'Formally employed Private':2, 
         'Informally employed':3,
         'Formally employed Government': 4, 
         'Farming and Fishing':5, 
         'Remittance Dependent':6, 
         'Other Income':7, 
         'Dont Know/Refuse to answer':8, 
         'No Income':9})
    
    # drop uniqueid column
    data = data.drop(['uniqueid'], axis=1)
    
    return data

In [8]:
processed_train_data = prepare_data(train_df)
processed_test_data = prepare_data(test_df)

In [9]:
processed_train_data

Unnamed: 0,country,year,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018.0,Yes,0,1,3.0,24.0,0,0,0,0,0
1,Kenya,2018.0,No,0,0,5.0,70.0,0,1,1,1,1
2,Kenya,2018.0,Yes,1,1,5.0,26.0,1,2,2,2,0
3,Kenya,2018.0,No,0,1,5.0,34.0,0,1,0,3,2
4,Kenya,2018.0,No,1,0,8.0,26.0,1,3,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...
23519,Uganda,2018.0,No,0,1,4.0,48.0,0,1,3,1,7
23520,Uganda,2018.0,No,0,1,2.0,27.0,0,1,2,0,7
23521,Uganda,2018.0,No,0,1,5.0,27.0,0,4,1,3,7
23522,Uganda,2018.0,No,1,1,7.0,30.0,0,4,3,0,0


In [10]:
le = LabelEncoder()

processed_train_data['bank_account'] = le.fit_transform(processed_train_data['bank_account'])

In [11]:
X_train = processed_train_data[['location_type', 'cellphone_access', 'household_size', 'age_of_respondent', 'gender_of_respondent',
                     'relationship_with_head', 'marital_status', 'education_level', 'job_type']]
y_train = processed_train_data['bank_account']

In [12]:
X_Train, X_val, y_Train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

In [13]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=500)

model.fit(X_Train, y_Train)

LogisticRegression(max_iter=500)

In [14]:
y_pred = model.predict(X_val)
log_accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy for logistic regression model: {log_accuracy:.3f}")

Validation Accuracy for logistic regression model: 0.860


In [15]:
print(confusion_matrix(y_val, y_pred))

[[6073    0]
 [ 985    0]]


In [16]:
# Using a random forrest model
from sklearn.ensemble import RandomForestClassifier

model2 = RandomForestClassifier()

model2.fit(X_Train, y_Train)

RandomForestClassifier()

In [17]:
y_pred2 = model2.predict(X_val)
Rand_accuracy = accuracy_score(y_val, y_pred2)
print(f"Validation Accuracy for random forrest model: {Rand_accuracy: .3f}")

Validation Accuracy for random forrest model:  0.859


In [18]:
print(confusion_matrix(y_val, y_pred2))

[[5710  363]
 [ 630  355]]


In [19]:
#Using a support vector machine (SVM) model
from sklearn.svm import SVC

model3 = SVC()

model3.fit(X_Train, y_Train)

SVC()

In [20]:
y_pred3 = model3.predict(X_val)
svc_accuracy = accuracy_score(y_val, y_pred3)
print(f"Validation Accuracy for random forrest model: {svc_accuracy: .3f}")

Validation Accuracy for random forrest model:  0.860


In [21]:
print(confusion_matrix(y_val, y_pred3))

[[6073    0]
 [ 985    0]]


So from this, the accuracy of the different models are close I can just pick one and work with it. Using the Random forrest model, I can make my submission

In [22]:
test_data = processed_test_data[['location_type', 'cellphone_access', 'household_size', 'age_of_respondent', 
                            'gender_of_respondent', 'relationship_with_head', 'marital_status', 'education_level', 'job_type']]
test_df['bank_account'] = model2.predict(test_data)

In [23]:
submission = pd.DataFrame({"uniqueid": test_df["uniqueid"] + " x " + test_df["country"],
                           "bank_account": test_df['bank_account']})

In [24]:
submission.head()

Unnamed: 0,uniqueid,bank_account
0,uniqueid_6056 x Kenya,1
1,uniqueid_6060 x Kenya,1
2,uniqueid_6065 x Kenya,0
3,uniqueid_6072 x Kenya,0
4,uniqueid_6073 x Kenya,0


In [25]:
submission.sample(5)

Unnamed: 0,uniqueid,bank_account
3906,uniqueid_10041 x Rwanda,0
4844,uniqueid_10979 x Rwanda,1
9407,uniqueid_2324 x Uganda,0
987,uniqueid_7056 x Kenya,0
3382,uniqueid_9517 x Rwanda,0


In [26]:
submission['bank_account'].value_counts()

0    9170
1     916
Name: bank_account, dtype: int64

In [27]:
submission.to_csv('first_submission.csv', index = False)

In [28]:
# Another submission
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [29]:
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("model", KNeighborsClassifier()),
])

In [30]:
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()), ('model', KNeighborsClassifier())],
 'verbose': False,
 'scale': StandardScaler(),
 'model': KNeighborsClassifier(),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__algorithm': 'auto',
 'model__leaf_size': 30,
 'model__metric': 'minkowski',
 'model__metric_params': None,
 'model__n_jobs': None,
 'model__n_neighbors': 5,
 'model__p': 2,
 'model__weights': 'uniform'}

In [31]:
mod = GridSearchCV(estimator=pipe,
                  param_grid={'model__n_neighbors' : [1,2,3,4,5,6,7,8,9,10]},
                  cv=5)

In [32]:
mod.fit(X_Train, y_Train)
pd.DataFrame(mod.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.048657,0.007897,0.120031,0.005123,1,{'model__n_neighbors': 1},0.830905,0.814151,0.832979,0.822047,0.828424,0.825701,0.006844,10
1,0.045995,0.001412,0.136002,0.00501,2,{'model__n_neighbors': 2},0.872192,0.859399,0.863043,0.864561,0.867294,0.865298,0.004288,8
2,0.046403,0.002503,0.151006,0.006555,3,{'model__n_neighbors': 3},0.867335,0.848163,0.856058,0.8512,0.850592,0.854669,0.006831,9
3,0.044401,0.002309,0.15559,0.005855,4,{'model__n_neighbors': 4},0.870067,0.865472,0.869724,0.862739,0.865776,0.866756,0.002775,5
4,0.052208,0.004445,0.194,0.013728,5,{'model__n_neighbors': 5},0.869156,0.863043,0.86942,0.860006,0.865776,0.86548,0.003606,7
5,0.049969,0.007303,0.180617,0.012297,6,{'model__n_neighbors': 6},0.86946,0.866383,0.869116,0.860917,0.871242,0.867424,0.003606,2
6,0.0464,0.00372,0.184599,0.007629,7,{'model__n_neighbors': 7},0.867638,0.867598,0.868509,0.862435,0.868813,0.866999,0.002331,4
7,0.046,0.001546,0.195808,0.004533,8,{'model__n_neighbors': 8},0.867942,0.866991,0.868813,0.864865,0.868813,0.867484,0.001473,1
8,0.052788,0.007658,0.210001,0.011189,9,{'model__n_neighbors': 9},0.866424,0.864258,0.867902,0.863954,0.867598,0.866027,0.001647,6
9,0.045807,0.002029,0.207994,0.007296,10,{'model__n_neighbors': 10},0.866727,0.86608,0.869116,0.868205,0.865472,0.86712,0.00135,3


In [33]:
y_pred4 = mod.predict(X_val)
knn_accuracy = accuracy_score(y_val, y_pred4)
print(f"Validation Accuracy for random forrest model: {knn_accuracy: .3f}")

Validation Accuracy for random forrest model:  0.868


In [34]:
print("Error rate of KNN classifier: ", 1 - accuracy_score(y_val, y_pred4))

Error rate of KNN classifier:  0.13162368943043357


In [35]:
from xgboost import XGBClassifier


pipe2 = Pipeline([
    ("scale", StandardScaler()),
    ("model", XGBClassifier())
])

In [52]:
pipe2.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model',
   XGBClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bynode=None,
                 colsample_bytree=None, device=None, early_stopping_rounds=None,
                 enable_categorical=False, eval_metric=None, feature_types=None,
                 gamma=None, grow_policy=None, importance_type=None,
                 interaction_constraints=None, learning_rate=None, max_bin=None,
                 max_cat_threshold=None, max_cat_to_onehot=None,
                 max_delta_step=None, max_depth=None, max_leaves=None,
                 min_child_weight=None, missing=nan, monotone_constraints=None,
                 multi_strategy=None, n_estimators=None, n_jobs=None,
                 num_parallel_tree=None, random_state=None, ...))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
               cols

In [40]:
xgb_mod = GridSearchCV(estimator=pipe2,
                       param_grid={
                           'model__min_child_weighth': [1, 5, 10],
                           'model__gamma': [0.5, 1],
                           'model__subsample': [0.6, 0.8, 1.0],
                           'model__max_depth': [3, 5]
                       },
                       n_jobs=3,
                       verbose=3,
                       cv=5)

In [41]:
xgb_mod.fit(X_Train, y_Train)
pd.DataFrame(xgb_mod.cv_results_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


Parameters: { "min_child_weighth" } are not used.



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__gamma,param_model__max_depth,param_model__min_child_weighth,param_model__subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.325399,0.078385,0.012004,0.002963,0.5,3,1,0.6,"{'model__gamma': 0.5, 'model__max_depth': 3, '...",0.886157,0.880656,0.886122,0.877619,0.889159,0.883943,0.004187,10
1,0.234001,0.030586,0.010401,0.001853,0.5,3,1,0.8,"{'model__gamma': 0.5, 'model__max_depth': 3, '...",0.8898,0.881567,0.883389,0.877923,0.885515,0.883639,0.003962,16
2,0.146399,0.030703,0.0138,0.010187,0.5,3,1,1.0,"{'model__gamma': 0.5, 'model__max_depth': 3, '...",0.888585,0.880352,0.887337,0.877316,0.885818,0.883882,0.004322,13
3,0.2436,0.008039,0.0098,0.00172,0.5,3,5,0.6,"{'model__gamma': 0.5, 'model__max_depth': 3, '...",0.886157,0.880656,0.886122,0.877619,0.889159,0.883943,0.004187,10
4,0.227598,0.018345,0.009601,0.001497,0.5,3,5,0.8,"{'model__gamma': 0.5, 'model__max_depth': 3, '...",0.8898,0.881567,0.883389,0.877923,0.885515,0.883639,0.003962,16
5,0.148602,0.007389,0.008797,0.002477,0.5,3,5,1.0,"{'model__gamma': 0.5, 'model__max_depth': 3, '...",0.888585,0.880352,0.887337,0.877316,0.885818,0.883882,0.004322,13
6,0.2442,0.012859,0.010001,0.001094,0.5,3,10,0.6,"{'model__gamma': 0.5, 'model__max_depth': 3, '...",0.886157,0.880656,0.886122,0.877619,0.889159,0.883943,0.004187,10
7,0.237,0.013311,0.009802,0.000399,0.5,3,10,0.8,"{'model__gamma': 0.5, 'model__max_depth': 3, '...",0.8898,0.881567,0.883389,0.877923,0.885515,0.883639,0.003962,16
8,0.1218,0.00725,0.007199,0.001166,0.5,3,10,1.0,"{'model__gamma': 0.5, 'model__max_depth': 3, '...",0.888585,0.880352,0.887337,0.877316,0.885818,0.883882,0.004322,13
9,0.2662,0.005075,0.0114,0.001745,0.5,5,1,0.6,"{'model__gamma': 0.5, 'model__max_depth': 5, '...",0.885549,0.877923,0.880352,0.877619,0.881263,0.880541,0.002865,28


In [43]:
print(xgb_mod.best_params_)

{'model__gamma': 1, 'model__max_depth': 3, 'model__min_child_weighth': 1, 'model__subsample': 0.8}


In [45]:
from sklearn.metrics import confusion_matrix, accuracy_score

# fit by setting best parameters and Evaluate model
best_xgb_model = XGBClassifier(min_child_weight=1, gamma=1, subsample=0.8, max_depth=5)

best_xgb_model.fit(X_Train, y_Train)
y_pred_xgb = best_xgb_model.predict(X_val)

# Get error rate
print("Error rate of the XGB classifier: ", 1 - accuracy_score(y_val, y_pred_xgb))

Error rate of the XGB classifier:  0.11547180504392174


In [47]:
# Get the predicted result for the test Data
xgb_test_data = processed_test_data[['location_type', 'cellphone_access', 'household_size', 'age_of_respondent', 
                            'gender_of_respondent', 'relationship_with_head', 'marital_status', 'education_level', 'job_type']]
test_df['bank_account'] = best_xgb_model.predict(xgb_test_data)

In [49]:
# Create submission DataFrame
submission2 = pd.DataFrame({"uniqueid": test_df["uniqueid"] + " x " + test_df["country"],
                           "bank_account": test_df.bank_account})

In [53]:
submission2.head()

Unnamed: 0,uniqueid,bank_account
0,uniqueid_6056 x Kenya,1
1,uniqueid_6060 x Kenya,1
2,uniqueid_6065 x Kenya,0
3,uniqueid_6072 x Kenya,0
4,uniqueid_6073 x Kenya,0


In [54]:
submission2.sample(5)

Unnamed: 0,uniqueid,bank_account
9593,uniqueid_2510 x Uganda,0
4005,uniqueid_10140 x Rwanda,0
3531,uniqueid_9666 x Rwanda,0
3748,uniqueid_9883 x Rwanda,0
7817,uniqueid_8092 x Tanzania,0


In [55]:
submission2['bank_account'].value_counts()

0    9381
1     705
Name: bank_account, dtype: int64

In [56]:
submission2.to_csv('second_submission.csv', index = False)