In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from ydata_profiling import ProfileReport


In [93]:
# Read the survey data from disk, then preview the first five rows to
# sanity-check that the file parsed as expected.
df = pd.read_csv(filepath_or_buffer="Financial_inclusion_dataset.csv")
df.head(n=5)

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [10]:
# Count missing values per column once and reuse the result for both outputs.
null_counts = df.isnull().sum()
print(f"There is {null_counts.sum()} nan values")

# Jupyter only renders the *last* bare expression of a cell; the per-column
# counts used to sit on the first line, where they were computed and discarded.
null_counts

There is 0 nan values


In [9]:
# Select fully duplicated rows once and reuse the selection for both outputs.
duplicate_rows = df[df.duplicated()]
print(f"There are {len(duplicate_rows)} duplicates in the dataset")

# Jupyter only renders the *last* bare expression of a cell; the duplicate rows
# used to sit on the first line, where they were computed and discarded.
duplicate_rows

There are 0 duplicates in the dataset


In [14]:
# Load the data dictionary that describes each survey variable and render it.
vardef = pd.read_csv(filepath_or_buffer="Definitions.csv")
vardef

Unnamed: 0,Variable Definitions,Unnamed: 1
0,country,Country interviewee is in.
1,year,Year survey was done in.
2,uniqueid,Unique identifier for each interviewee
3,location_type,"Type of location: Rural, Urban"
4,cellphone_access,"If interviewee has access to a cellphone: Yes, No"
5,household_size,Number of people living in one house
6,age_of_respondent,The age of the interviewee
7,gender_of_respondent,"Gender of interviewee: Male, Female"
8,relationship_with_head,The interviewee’s relationship with the head o...
9,marital_status,The martial status of the interviewee: Married...


In [12]:
# Show the dtype of every column to see which ones hold strings (object)
# and which are already numeric.
df.dtypes

country                   object
year                       int64
uniqueid                  object
bank_account              object
location_type             object
cellphone_access          object
household_size             int64
age_of_respondent          int64
gender_of_respondent      object
relationship_with_head    object
marital_status            object
education_level           object
job_type                  object
dtype: object

In [90]:
# Drop the per-interviewee identifier: it is unique per row and carries no
# predictive signal, so it must not be label-encoded or used as a feature.
# errors="ignore" makes the cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns="uniqueid", errors="ignore")

In [94]:
# Label-encode every string-typed (object) column of `df` in place.
#   encoders          : column name -> fitted LabelEncoder, kept so the same
#                       category -> code mapping can be reapplied later.
#   original_mappings : column name -> {integer code: original category}.
#
# NOTE: this cell overwrites `df`, so it is not idempotent. Re-running it
# without reloading the data finds no object columns and leaves both
# dictionaries empty.
encoders = {}
original_mappings = {}

for col in df.select_dtypes(include="object"):
    label = LabelEncoder()
    df[col] = label.fit_transform(df[col])
    encoders[col] = label

    # The code assigned to a class is its position in `classes_`, so
    # enumerate() rebuilds the code -> category mapping directly. This avoids
    # a second transform() pass and yields plain Python int keys rather than
    # NumPy integers.
    original_mappings[col] = dict(enumerate(label.classes_))

In [107]:
# Display the column name -> fitted LabelEncoder dictionary to confirm which
# columns were encoded.
encoders

{'country': LabelEncoder(),
 'uniqueid': LabelEncoder(),
 'bank_account': LabelEncoder(),
 'location_type': LabelEncoder(),
 'cellphone_access': LabelEncoder(),
 'gender_of_respondent': LabelEncoder(),
 'relationship_with_head': LabelEncoder(),
 'marital_status': LabelEncoder(),
 'education_level': LabelEncoder(),
 'job_type': LabelEncoder()}

In [96]:
# Render the frame after label encoding to check that the categorical columns
# now hold integer codes.
df

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,0,2018,0,1,0,1,3,24,0,5,2,3,9
1,0,2018,1111,0,0,0,5,70,0,1,4,0,4
2,0,2018,2222,1,1,1,5,26,1,3,3,5,9
3,0,2018,3333,0,0,1,5,34,0,1,2,2,3
4,0,2018,4444,0,1,0,8,26,1,0,3,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23519,3,2018,1239,0,0,1,4,48,0,1,0,0,7
23520,3,2018,1240,0,0,1,2,27,0,1,3,3,7
23521,3,2018,1241,0,0,1,5,27,0,4,4,2,7
23522,3,2018,1242,0,1,1,7,30,0,4,0,3,9


In [97]:
# Correlation of every column with the target, taken as the `bank_account`
# column of the full correlation matrix.
# NOTE(review): the label codes follow alphabetical class order, so for
# multi-category columns the coefficient depends on that arbitrary ordering —
# treat these values as a rough screen only.
df.corr().loc[:, "bank_account"]

country                  -0.161362
year                      0.112318
uniqueid                 -0.022321
bank_account              1.000000
location_type             0.087288
cellphone_access          0.209669
household_size           -0.028326
age_of_respondent         0.019429
gender_of_respondent      0.117234
relationship_with_head   -0.070853
marital_status           -0.038739
education_level           0.323768
job_type                 -0.064171
Name: bank_account, dtype: float64

In [98]:
# Predictor columns fed to the model (the target `bank_account` is excluded).
column = [
    "country",
    "year",
    "cellphone_access",
    "gender_of_respondent",
    "education_level",
    "relationship_with_head",
    "job_type",
]

In [99]:
# Feature matrix: only the selected predictor columns.
X = df.loc[:, column]
# Target vector: the encoded bank-account indicator.
y = df.loc[:, "bank_account"]

In [100]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the split is not stratified although the target classes are
# imbalanced — consider passing `stratify=y` (confirm before changing, since it
# alters the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [101]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [108]:
# Random forest of 100 trees, each limited to depth 4; the seed is fixed for
# reproducibility.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=4,
    random_state=42,
)

In [109]:
# Train the forest on the training split.
rf.fit(X=X_train, y=y_train)

In [110]:
# Predict the target class for each held-out row.
y_preds = rf.predict(X=X_test)

In [111]:
# Overall share of correct predictions on the held-out split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_preds)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1 breakdown.
report = classification_report(y_true=y_test, y_pred=y_preds)
print("Classification Report:")
print(report)

Accuracy: 0.8841657810839533
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94      4063
           1       0.81      0.20      0.32       642

    accuracy                           0.88      4705
   macro avg       0.85      0.60      0.63      4705
weighted avg       0.88      0.88      0.85      4705



In [112]:
import joblib

# Persist the fitted forest together with the per-column encoders as a single
# (model, encoders) tuple, so inference code can encode raw inputs the same way.
joblib.dump(value=(rf, encoders), filename='model_rf.pkl')


['model_rf.pkl']

In [61]:
# Inspect the column dtypes again; this cell sits after the label-encoding
# step, where the formerly string-typed columns are expected to be integers.
df.dtypes

country                   int64
year                      int64
bank_account              int64
location_type             int64
cellphone_access          int64
household_size            int64
age_of_respondent         int64
gender_of_respondent      int64
relationship_with_head    int64
marital_status            int64
education_level           int64
job_type                  int64
dtype: object

In [92]:
# Display the integer code -> original category lookup for each encoded column.
original_mappings

{'country': {0: 'Kenya', 1: 'Rwanda', 2: 'Tanzania', 3: 'Uganda'},
 'bank_account': {0: 'No', 1: 'Yes'},
 'location_type': {0: 'Rural', 1: 'Urban'},
 'cellphone_access': {0: 'No', 1: 'Yes'},
 'gender_of_respondent': {0: 'Female', 1: 'Male'},
 'relationship_with_head': {0: 'Child',
  1: 'Head of Household',
  2: 'Other non-relatives',
  3: 'Other relative',
  4: 'Parent',
  5: 'Spouse'},
 'marital_status': {0: 'Divorced/Seperated',
  1: 'Dont know',
  2: 'Married/Living together',
  3: 'Single/Never Married',
  4: 'Widowed'},
 'education_level': {0: 'No formal education',
  1: 'Other/Dont know/RTA',
  2: 'Primary education',
  3: 'Secondary education',
  4: 'Tertiary education',
  5: 'Vocational/Specialised training'},
 'job_type': {0: 'Dont Know/Refuse to answer',
  1: 'Farming and Fishing',
  2: 'Formally employed Government',
  3: 'Formally employed Private',
  4: 'Government Dependent',
  5: 'Informally employed',
  6: 'No Income',
  7: 'Other Income',
  8: 'Remittance Dependent',
 