In [11]:
# Load and explore the dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv('Train.csv')
print("Dataset shape:", df.shape)
print("\
First few rows:")
print(df.head())

Dataset shape: (23524, 13)
First few rows:
  country  year    uniqueid bank_account location_type cellphone_access  \
0   Kenya  2018  uniqueid_1          Yes         Rural              Yes   
1   Kenya  2018  uniqueid_2           No         Rural               No   
2   Kenya  2018  uniqueid_3          Yes         Urban              Yes   
3   Kenya  2018  uniqueid_4           No         Rural              Yes   
4   Kenya  2018  uniqueid_5           No         Urban               No   

   household_size  age_of_respondent gender_of_respondent  \
0               3                 24               Female   
1               5                 70               Female   
2               5                 26                 Male   
3               5                 34               Female   
4               8                 26                 Male   

  relationship_with_head           marital_status  \
0                 Spouse  Married/Living together   
1      Head of Household         

In [2]:
# Check data info and target variable distribution
print("Dataset info:")
print(df.info())
print("\
Target variable distribution:")
print(df['bank_account'].value_counts())
print("\
Missing values:")
print(df.isnull().sum())

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB
None
Target variable distribution:
bank_accoun

In [3]:
# Prepare the data for modeling
# Drop non-predictive columns
X = df.drop(['bank_account', 'uniqueid', 'country', 'year'], axis=1)
y = df['bank_account']

# Encode categorical variables
le_dict = {}
categorical_cols = ['location_type', 'cellphone_access', 'gender_of_respondent', 
                   'relationship_with_head', 'marital_status', 'education_level', 'job_type']

X_encoded = X.copy()
for col in categorical_cols:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X[col])
    le_dict[col] = le

# Encode target variable
y_encoded = LabelEncoder().fit_transform(y)

print("Features after encoding:")
print(X_encoded.head())

Features after encoding:
   location_type  cellphone_access  household_size  age_of_respondent  \
0              0                 1               3                 24   
1              0                 0               5                 70   
2              1                 1               5                 26   
3              0                 1               5                 34   
4              1                 0               8                 26   

   gender_of_respondent  relationship_with_head  marital_status  \
0                     0                       5               2   
1                     0                       1               4   
2                     1                       3               3   
3                     0                       1               2   
4                     1                       0               3   

   education_level  job_type  
0                3         9  
1                0         4  
2                5         9  
3        

In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test_scaled)
 
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy)
print("\
Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=['No', 'Yes']))


Random Forest Accuracy: 0.8643995749202975
Classification Report:
              precision    recall  f1-score   support

          No       0.90      0.94      0.92      4043
         Yes       0.52      0.39      0.45       662

    accuracy                           0.86      4705
   macro avg       0.71      0.67      0.68      4705
weighted avg       0.85      0.86      0.86      4705



In [12]:
mae = mean_absolute_error(y_test, y_pred_rf) 
print(mae)

0.13560042507970244
