In [16]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [17]:
# Read the dataset from the project-relative CSV, then preview its first rows.
data = pd.read_csv("data/Financial_inclusion_dataset.csv")
data.head(5)

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [18]:

# Columns whose share of missing values (in percent) reaches this threshold
# are dropped; columns below it are imputed.
MAX_MISSING_PCT = 30

for col in data.columns:
    missing_pct = data[col].isnull().sum() / len(data) * 100
    if missing_pct < MAX_MISSING_PCT:
        # Decide by "is this numeric?" rather than "is this not object?":
        # category / dedicated string dtypes are not object dtype, and calling
        # .median() on them raises TypeError.
        if pd.api.types.is_numeric_dtype(data[col]):
            data[col] = data[col].fillna(data[col].median())
        else:
            # Non-numeric columns are filled with their most frequent value.
            data[col] = data[col].fillna(data[col].mode()[0])
    else:
        # Too sparse to impute meaningfully: remove the column entirely.
        data.drop(col, axis=1, inplace=True)

# Per-column count of remaining nulls (displayed as the cell output).
data.isnull().sum()




country                   0
year                      0
uniqueid                  0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

In [19]:
# Remove the identifier column before modelling. Rebinding `data` (instead of
# an in-place drop) together with errors="ignore" keeps this cell re-runnable:
# executing it a second time no longer raises KeyError.
data = data.drop(columns=["uniqueid"], errors="ignore")

In [20]:
# Partition the frame by dtype: numeric columns vs. everything else.
num = data.select_dtypes(include="number")
cat = data.select_dtypes(exclude="number")

# Preview the non-numeric columns first ...
print("Categorical data\n\n")
display(cat.head())

# ... then two blank lines as a visual separator ...
print("")
print("")

# ... followed by the numeric columns.
print("Numerical Data \n\n")
display(num.head())


Categorical data




Unnamed: 0,country,bank_account,location_type,cellphone_access,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,Yes,Rural,Yes,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,No,Rural,No,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,Yes,Urban,Yes,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,No,Rural,Yes,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,No,Urban,No,Male,Child,Single/Never Married,Primary education,Informally employed




Numerical Data 




Unnamed: 0,year,household_size,age_of_respondent
0,2018,3,24
1,2018,5,70
2,2018,5,26
3,2018,5,34
4,2018,8,26


In [21]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

# Transform a copy so `data` keeps its original (un-encoded, un-scaled) values.
df = data.copy()
column_to_drop = ["bank_account"]  # excluded from the loop below, so left as-is

# joblib.dump does not create missing parent directories; make sure the
# output folders exist so the cell also runs on a fresh checkout.
Path("scalers").mkdir(parents=True, exist_ok=True)
Path("encoders").mkdir(parents=True, exist_ok=True)

# NOTE(review): every scaler/encoder here is fitted on the full frame, i.e.
# before any train/test split is made - consider fitting on training rows only.
for col in df.drop(column_to_drop, axis=1).columns:
    if df[col].dtype in ["int64", "float64"]:
        # Numeric column: standardise, then persist the fitted scaler.
        scaler = StandardScaler()
        df[col] = scaler.fit_transform(df[[col]])
        scaler_path = f"scalers/{col}_scaler.pkl"
        print(f"Saving scaler for '{col}' to: {scaler_path}")
        # Hand joblib the path (not an open file object) so the file handle
        # is opened and closed by joblib instead of being leaked.
        joblib.dump(scaler, scaler_path)
    else:
        # Any other column: map each distinct value to an integer code and
        # persist the fitted encoder.
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        encoder_path = f"encoders/{col}_encoder.pkl"
        print(f"Saving encoder for '{col}' to: {encoder_path}")
        joblib.dump(encoder, encoder_path)

df.head()

Saving encoder for 'country' to: encoders/country_encoder.pkl
Saving scaler for 'year' to: scalers/year_scaler.pkl
Saving encoder for 'location_type' to: encoders/location_type_encoder.pkl
Saving encoder for 'cellphone_access' to: encoders/cellphone_access_encoder.pkl
Saving scaler for 'household_size' to: scalers/household_size_scaler.pkl
Saving scaler for 'age_of_respondent' to: scalers/age_of_respondent_scaler.pkl
Saving encoder for 'gender_of_respondent' to: encoders/gender_of_respondent_encoder.pkl
Saving encoder for 'relationship_with_head' to: encoders/relationship_with_head_encoder.pkl
Saving encoder for 'marital_status' to: encoders/marital_status_encoder.pkl
Saving encoder for 'education_level' to: encoders/education_level_encoder.pkl
Saving encoder for 'job_type' to: encoders/job_type_encoder.pkl


Unnamed: 0,country,year,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,0,1.208541,Yes,0,1,-0.358007,-0.896188,0,5,2,3,9
1,0,1.208541,No,0,0,0.539834,1.888279,0,1,4,0,4
2,0,1.208541,Yes,1,1,0.539834,-0.775124,1,3,3,5,9
3,0,1.208541,No,0,1,0.539834,-0.290869,0,1,2,2,3
4,0,1.208541,No,1,0,1.886596,-0.775124,1,0,3,2,5


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Target labels and predictor matrix (every column except the target).
y = df["bank_account"]
X = df.drop(columns=["bank_account"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion (fit() returns the estimator).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows and report overall accuracy plus per-class metrics.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8652497343251859
              precision    recall  f1-score   support

          No       0.91      0.94      0.92      4063
         Yes       0.51      0.39      0.44       642

    accuracy                           0.87      4705
   macro avg       0.71      0.67      0.68      4705
weighted avg       0.85      0.87      0.86      4705



In [23]:
# Predict on the held-out split with the fitted model and report its accuracy.
test_pred = model.predict(X_test)
test_score = accuracy_score(y_test, test_pred)
print(f"Test Score is: {test_score}")

Test Score is: 0.8652497343251859


In [24]:
# Save `data` (not the encoded/scaled `df`) without the index column.
data.to_csv("data/Financial_inclusion_dataset_final.csv", index=False)

# Persist the fitted model. Create the target folder first, and pass joblib a
# path rather than an open file object so no file handle is left unclosed.
# The trailing ';' suppresses the list of filenames joblib.dump returns.
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(model, "models/Financial_inclusion_dataset.pkl");
