# Financial Inclusion in Africa

This notebook covers the following sections
- Loading all datasets provided
- Data preprocessing and wrangling
- Creating multiple models and picking the best solution
- Making improvements to chosen model
- Making submissions

## Importing libraries and modules and loading datasets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, plot_confusion_matrix

In [2]:
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')
variables = pd.read_csv('VariableDefinitions.csv')

In [3]:
# Another quick look at the dataset
train_df

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23519,Uganda,2018,uniqueid_2113,No,Rural,Yes,4,48,Female,Head of Household,Divorced/Seperated,No formal education,Other Income
23520,Uganda,2018,uniqueid_2114,No,Rural,Yes,2,27,Female,Head of Household,Single/Never Married,Secondary education,Other Income
23521,Uganda,2018,uniqueid_2115,No,Rural,Yes,5,27,Female,Parent,Widowed,Primary education,Other Income
23522,Uganda,2018,uniqueid_2116,No,Urban,Yes,7,30,Female,Parent,Divorced/Seperated,Secondary education,Self employed


In [4]:
train_df.isnull().sum()

# No null values in the dataset, so I can go ahead with data prepocessing and preparation

country                   0
year                      0
uniqueid                  0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

In [5]:
train_df.dtypes

country                   object
year                       int64
uniqueid                  object
bank_account              object
location_type             object
cellphone_access          object
household_size             int64
age_of_respondent          int64
gender_of_respondent      object
relationship_with_head    object
marital_status            object
education_level           object
job_type                  object
dtype: object

In [6]:
# encoding for the following columns relationship_with_head, marital_status, education_level, job_type

print(train_df['relationship_with_head'].unique())
print(train_df['marital_status'].unique())
print(train_df['education_level'].unique())
print(train_df['job_type'].unique())

['Spouse' 'Head of Household' 'Other relative' 'Child' 'Parent'
 'Other non-relatives']
['Married/Living together' 'Widowed' 'Single/Never Married'
 'Divorced/Seperated' 'Dont know']
['Secondary education' 'No formal education'
 'Vocational/Specialised training' 'Primary education'
 'Tertiary education' 'Other/Dont know/RTA']
['Self employed' 'Government Dependent' 'Formally employed Private'
 'Informally employed' 'Formally employed Government'
 'Farming and Fishing' 'Remittance Dependent' 'Other Income'
 'Dont Know/Refuse to answer' 'No Income']


In [7]:
# Develop a function that does the preprocessing

def prepare_data(data):

    # changing year, household_size and age of respondents to float
    data['year'] = data['year'].astype(float)
    data['household_size'] = data['household_size'].astype(float)
    data['age_of_respondent'] = data['age_of_respondent'].astype(float)
    
    # label encoding for the following columns location_type, cellphone_access, gender_of_respondent
    le = LabelEncoder()

    data['location_type'] = le.fit_transform(data['location_type'])
    data['cellphone_access'] = le.fit_transform(data['cellphone_access'])
    data['gender_of_respondent'] = le.fit_transform(data['gender_of_respondent'])
    
    # Labeling other categorical data using the map function
    data['relationship_with_head'] = data['relationship_with_head'].map(
        {'Spouse':0, 
         'Head of Household':1, 
         'Other relative':2, 
         'Child':3, 
         'Parent':4, 
         'Other non-relatives':5})
    data['marital_status'] = data['marital_status'].map(
        {'Married/Living together': 0, 
         'Widowed': 1, 
         'Single/Never Married' :2, 
         'Divorced/Seperated': 3, 
         'Dont know':4})
    data['education_level'] = data['education_level'].map(
        {'Secondary education': 0, 
         'No formal education': 1, 
         'Vocational/Specialised training': 2, 
         'Primary education':3, 
         'Tertiary education':4, 
         'Other/Dont know/RTA':5})
    data['job_type'] = data['job_type'].map(
        {'Self employed':0, 
         'Government Dependent':1, 
         'Formally employed Private':2, 
         'Informally employed':3,
         'Formally employed Government': 4, 
         'Farming and Fishing':5, 
         'Remittance Dependent':6, 
         'Other Income':7, 
         'Dont Know/Refuse to answer':8, 
         'No Income':9})
    
    # drop uniqueid column
    data = data.drop(['uniqueid'], axis=1)
    
    return data

In [8]:
train_data = prepare_data(train_df)

In [11]:
train_data

Unnamed: 0,country,year,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018.0,1,0,1,3.0,24.0,0,0,0,0,0
1,Kenya,2018.0,0,0,0,5.0,70.0,0,1,1,1,1
2,Kenya,2018.0,1,1,1,5.0,26.0,1,2,2,2,0
3,Kenya,2018.0,0,0,1,5.0,34.0,0,1,0,3,2
4,Kenya,2018.0,0,1,0,8.0,26.0,1,3,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...
23519,Uganda,2018.0,0,0,1,4.0,48.0,0,1,3,1,7
23520,Uganda,2018.0,0,0,1,2.0,27.0,0,1,2,0,7
23521,Uganda,2018.0,0,0,1,5.0,27.0,0,4,1,3,7
23522,Uganda,2018.0,0,1,1,7.0,30.0,0,4,3,0,0


In [10]:
le = LabelEncoder()

train_data['bank_account'] = le.fit_transform(train_data['bank_account'])

In [12]:
X_train = train_data[['location_type', 'cellphone_access', 'household_size', 'age_of_respondent', 'gender_of_respondent',
                     'relationship_with_head', 'marital_status', 'education_level', 'job_type']]
y_train = train_data['bank_account']

In [13]:
X_Train, X_val, y_Train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

In [14]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_Train, y_Train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [16]:
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

Validation Accuracy: 0.86


In [21]:
print(confusion_matrix(y_val, y_pred))

[[6073    0]
 [ 985    0]]
