# DS ML Project - Financial Inclusion in Africa

## Load and Inspect the Data

In [None]:
pip install imblearn

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score
from imblearn.over_sampling import SMOTE


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset
df = pd.read_csv('data/Train.csv')

In [369]:
df.shape

(23524, 13)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe(include='all')

In [None]:
df.isnull().sum()

In [None]:
# check for zero values
zero_count = (df == 0).sum()

print(zero_count)

In [None]:
# check for unique values in each column
unique_vals = {column: df[column].unique() for column in df.columns}
for column, unique_vals in unique_vals.items():
    print(f"Unique values in column {column}: {unique_vals}")

In [None]:
df.uniqueid.value_counts()

We have +23k rows but only 8759 unique IDs, which means some people have multiple accounts. Other than removing the duplicate rows, there shouldn't be need for extra cleaning.

## Trying to Deal with the Duplicates (uniqueid)

In [None]:
# removing duplicates 
df_no_dup = df.drop_duplicates()

In [None]:
print(df.uniqueid.value_counts().head(3000))

In [None]:
uniqueidval = 'uniqueid_4393'
rows_with_uniqueid = df[df['uniqueid'] == uniqueidval]

rows_with_uniqueid.head(100)

(bold guess but) I think re-indexing the data and ignoring the old uniqueid should work

In [None]:
# create new unique index 
df['global_id'] = range(1, len(df) + 1)

In [None]:
df.head()

In [None]:
# drop the old uniqueid column
df.drop('uniqueid', axis=1, inplace=True)

In [None]:
df.head(10)

In [None]:
df.global_id.value_counts()

Now I convert bank_account column to represent numerical values

1: Yes

0: No

In [None]:
df.bank_account = df.bank_account.apply(lambda x: 1 if x == 'Yes' else 0)

In [None]:
df_num = df[['global_id','age_of_respondent', 'household_size', 'bank_account']]

In [None]:
df_num.head()

### Some visualizations

Gender vs. Bank Account Ownership

In [None]:
sns.countplot(data= df, x='gender_of_respondent', hue='bank_account')
plt.title('Bank Account Ownership by Gender')
plt.show()

In [None]:
# Histogram of age with respect to bank account ownership
g = sns.FacetGrid(df, col='bank_account', height=5, aspect=1.5)
g.map(plt.hist, 'age_of_respondent', bins=30)
g.set_axis_labels('Age of Respondent', 'Frequency')
plt.show()


In [None]:
# Calculate mean household size by bank account ownership
household_size_stats = df.groupby('bank_account')['household_size'].mean().reset_index()
print(household_size_stats)


In [None]:
# Cross-tabulation of location type and bank account ownership
location_cross_tab = pd.crosstab(df['location_type'], df['bank_account'], normalize='index')
print(location_cross_tab)


In [None]:
# Pairplot of numerical variables
sns.pairplot(df, hue='bank_account')
plt.show()

Since we have a lot of categorical features, we see a lot of dots in lines, instead of a spread.

In [None]:
# overview of numerical feature distribution
for column in ['household_size', 'age_of_respondent']:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.show()

In [None]:
# overview of categorical feature distributions
for column in ['country', 'year', 'location_type', 'cellphone_access', 'gender_of_respondent',
               'relationship_with_head', 'marital_status', 'education_level', 'job_type']:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x=column)
    plt.title(f'Distribution of {column}')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Target Variable Analysis - Bank account 1/0 for yes/no
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='bank_account')
plt.title('Distribution of Bank Account Ownership')
plt.show()

In [None]:
# Feature Relationships with Target Variable
for column in ['country', 'year', 'location_type', 'cellphone_access', 'gender_of_respondent',
               'relationship_with_head', 'marital_status', 'education_level', 'job_type']:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x=column, hue='bank_account')
    plt.title(f'Bank Account Ownership by {column}')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# correlation matrix for numerical features 
numerical_features = ['household_size', 'age_of_respondent']
correlation_matrix = df[numerical_features].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

No feature seems to have collinearity. Now trying to encode categorical features.

In [None]:
categorical_features = ['country', 'year', 'location_type', 'cellphone_access',
                        'gender_of_respondent', 'relationship_with_head', 'marital_status',
                        'education_level', 'job_type']

In [None]:
# one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)


# reformat column names 
df_encoded.columns = df_encoded.columns.str.lower().str.replace(' ', '_')

In [None]:
df_encoded.head()

In [None]:
df_encoded.shape

In [None]:
df_encoded.info()


In [None]:
encoded_features = ["country_rwanda",
                    "country_tanzania",
                    "country_uganda",
                    "year_2017",
                    "year_2018",
                    "location_type_urban",
                    "cellphone_access_yes",
                    "gender_of_respondent_male",
                    "relationship_with_head_head_of_household",
                    "relationship_with_head_other_non-relatives",
                    "relationship_with_head_other_relative",
                    "relationship_with_head_parent",
                    "relationship_with_head_spouse",
                    "marital_status_dont_know",
                    "marital_status_married/living_together",
                    "marital_status_single/never_married",
                    "marital_status_widowed",
                    "education_level_other/dont_know/rta",
                    "education_level_primary_education",
                    "education_level_secondary_education",
                    "education_level_tertiary_education",
                    "education_level_vocational/specialised_training",
                    "job_type_farming_and_fishing",
                    "job_type_formally_employed_government",
                    "job_type_formally_employed_private",
                    "job_type_government_dependent",
                    "job_type_informally_employed",
                    "job_type_no_income",
                    "job_type_other_income",
                    "job_type_remittance_dependent",
                    "job_type_self_employed"
                    ]

In [None]:
# Convert the boolean columns in encoded_features to integers
df_encoded = df_encoded.astype(int)

In [None]:
df_encoded.head(10)

In [None]:
df.head(10)

In [None]:
# compute correlation matrix
correlation_matrix_enc = df_encoded.corr()

In [None]:
# plot correlation matrix
plt.figure(figsize=(20, 15))
sns.heatmap(correlation_matrix_enc, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Scale numerical features using MinMaxScaler
scaler = MinMaxScaler()
scaled_numerical_features = scaler.fit_transform(df[numerical_features])

In [None]:
# Extract the target variable
y = df['bank_account']

In [None]:
# Verify that there are no NaNs in y
assert not np.any(pd.isna(y)), "Target variable y contains NaN values"


In [None]:
y.value_counts(normalize=True)
X = df_encoded

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
y_train.value_counts(normalize=True)

In [None]:
feature_names = X_train.columns.tolist()
print(feature_names)

In [None]:
y_test.value_counts(normalize=True)

In [None]:
# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
y_train_resampled.value_counts(normalize=True)

In [None]:
# Model selection and training
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# maybe try hyperparameters 

In [None]:
# Evaluate models using cross-validation with additional metrics
for name, model in models.items():
    mae_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='neg_mean_absolute_error')
    accuracy_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
    precision_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='precision')
    recall_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='recall')
    f1_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='f1')
    
    print(f'{name} MAE: {-mae_scores.mean()}')
    print(f'{name} Accuracy: {accuracy_scores.mean()}')
    print(f'{name} Precision: {precision_scores.mean()}')
    print(f'{name} Recall: {recall_scores.mean()}')
    print(f'{name} F1 Score: {f1_scores.mean()}')


In [None]:
# Fit the best model (as an example, using RandomForest here)
best_model = RandomForestClassifier()
best_model.fit(X_train, y_train)

In [None]:
# Predict and evaluate the final model
predictions = best_model.predict(X_test)


In [None]:
# Check if the model is making constant predictions
unique_predictions = np.unique(predictions)

In [368]:
# If the model is making constant predictions, we need to re-evaluate the approach
if len(unique_predictions) == 1:
    print("Model is making constant predictions. Re-evaluate the approach.")
else:
    mae = mean_absolute_error(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    print(f'Random Forest MAE on Test Set: {mae}')
    print(f'Random Forest Accuracy on Test Set: {accuracy}')


Random Forest MAE on Test Set: 0.0
Random Forest Accuracy on Test Set: 1.0


In [None]:
# Evaluate models using cross-validation
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
    print(f'{name} MAE: {-cv_scores.mean()}')