In [187]:
#importing necessary libraries
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")

In [188]:
#importing datasets 
data_path = ['C:/Users/Munshi Khan']
filepath1 = os.sep.join(data_path + ["shrug_rbi.dta"])
filepath2 = os.sep.join(data_path + ["shrug_secc.dta"])
filepath3 = os.sep.join(data_path + ["shrug_pc11.dta"])
rbi = pd.read_stata(filepath1)
secc = pd.read_stata(filepath2)
pc11 = pd.read_stata(filepath3)

In [191]:
#Calculating an average access indicator for financial services in a village or town. To calculate this indicator
#we calculate number of branches and total population using rbi and population datasets 
branches = len(rbi)
total_population = pc11["pc11_pca_tot_p"].sum()
avg_access_indicator = (branches/total_population)*10000

In [192]:
#The average access indicator is turned out to be around 1.2777
avg_access_indicator

1.2769590473361352

In [193]:
rbi.replace(r'^\s*$', np.NaN, regex=True) #replacing empty strings with NaN in rbi dataset
rbi = rbi.dropna(axis=0)                  #removing NaN values from rbi dataset

In [194]:
#calculating no of branches using shrid as shrid is unique to each village or town
rbi2 = rbi.groupby('shrid').size().reset_index(name='no_of_branches')

Unnamed: 0,shrid,no_of_branches
0,,2332
1,11-01-000020,1
2,11-01-000024,1
3,11-01-000032,1
4,11-01-000037,1
...,...,...
37423,11-35-645529,1
37424,11-35-645550,2
37425,11-35-645551,9
37426,11-35-645558,2


In [195]:
#removing duplicate values using shrid, this will reduce the size of our rbi dataset
rbi.drop_duplicates(subset= "shrid", inplace = True)

In [196]:
#Now we combine rbi dataset with pc11, secc & the dataset we created for no of branches in last step
rbi = rbi.merge(rbi2, on = 'shrid')
rbi = rbi.merge(pc11, on = 'shrid')
rbi = rbi.merge(secc, on = 'shrid')

In [198]:
# calculating average access score for each shrid
rbi['avg_access_score'] = (rbi['no_of_branches']/rbi['pc11_pca_tot_p'])*10000 

# as we have removed many observations, we will use weights to get a more accurate access score
rbi['weight'] = rbi['pc11_pca_tot_p']/rbi['pc11_pca_tot_p'].sum() 
rbi['final_score'] = rbi['avg_access_score']*rbi['weight']
# we calculate a benchmark average access score for entire dataset
benchmark_avg = rbi['final_score'].sum()
benchmark_avg

2.5105114

In [199]:
# classifying each village or town as below or above average access using a binary indicator where 1 represents above avg access
rbi['binary_indicator'] = np.where(rbi['avg_access_score'] > benchmark_avg, 1, 0)

In [200]:
#selecting important features to train our model from the dataset
X_df = ['pc11_state_id',
 'rbi_region',
 'rbi_population_group',
 'no_of_branches',
 'pc11_pca_tot_p',
 'pc11_pca_no_hh',
 'pc11_pca_p_sc',
 'pc11_pca_p_st',
 'pc11_pca_p_lit',
 'pc11_vd_p_sch',
 'pc11_vd_m_sch',
 'pc11_vd_s_sch',
 'pc11_vd_s_s_sch',
 'pc11_vd_college',
 'pc11_vd_tar_road',
 'pc11_vd_area',
 'pc11_vd_power_all_sum',
 'secc_inc_cultiv_share',
 'nco2d_cultiv_share',
 'secc_cons_pc_rural',
 'secc_pov_rate_rural',
 'secc_pov_rate_tend_rural',
 'num_members_mean_rural']

In [201]:
# defining dependent and independent variables
rbi_X = rbi[X_df]
rbi_y = rbi['binary_indicator']

In [202]:
# many variables in our dataset are object and we would like to convert them into dummy variables
dummy_cols = rbi_X.dtypes[rbi_X.dtypes == np.object]  # filtering by string categoricals
dummy_cols = dummy_cols.index.tolist()
rbi_X = pd.get_dummies(rbi_X, columns = dummy_cols, drop_first = True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dummy_cols = rbi_X.dtypes[rbi_X.dtypes == np.object]  # filtering by string categoricals


In [203]:
#many features contains missing values and we fill those missing values using mean
rbi_X = rbi_X.apply(lambda x: x.fillna(x.mean()),axis=0)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36159 entries, 0 to 36158
Data columns (total 62 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   no_of_branches                   36159 non-null  int64  
 1   pc11_pca_tot_p                   36159 non-null  float32
 2   pc11_pca_no_hh                   36159 non-null  float32
 3   pc11_pca_p_sc                    36159 non-null  float32
 4   pc11_pca_p_st                    36159 non-null  float32
 5   pc11_pca_p_lit                   36159 non-null  float32
 6   pc11_vd_p_sch                    36159 non-null  float64
 7   pc11_vd_m_sch                    36159 non-null  float64
 8   pc11_vd_s_sch                    36159 non-null  float64
 9   pc11_vd_s_s_sch                  36159 non-null  float64
 10  pc11_vd_college                  36159 non-null  float64
 11  pc11_vd_tar_road                 36159 non-null  float64
 12  pc11_vd_area      

In [204]:
# Splitting the dataset into train and test set using a stratified shuffling
from sklearn.model_selection import StratifiedShuffleSplit

# Get the split indexes
strat_shuf_split = StratifiedShuffleSplit(n_splits=1, 
                                          test_size=0.2, 
                                          random_state=42)

train_idx, test_idx = next(strat_shuf_split.split(rbi_X, rbi_y))

# Create the dataframes
X_train = rbi_X.loc[train_idx]
y_train = rbi_y.loc[train_idx]

X_test  = rbi_X.loc[test_idx]
y_test  = rbi_y.loc[test_idx]

In [222]:
# Training a logistic regression on training dataset and predicting using our test dataset
# we also print all error metrics to assess the performance of the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report,
from sklearn.metrics import f1_score, precision_score, recall_score
lr = LogisticRegression(penalty = 'l1',
                       C = 100.0,
                       solver = 'liblinear')
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Preciision, recall, f-score from the multi-class support function
print(classification_report(y_test, y_pred_lr))
print('Accuracy score: ', round(accuracy_score(y_test, y_pred_lr), 3))
print('F1 Score: ', round(f1_score(y_test, y_pred_lr), 3))
print('Precision score:', round(precision_score(y_test, y_pred_lr), 3))
print('recall score:', round(recall_score(y_test, y_pred_lr), 3))

              precision    recall  f1-score   support

           0       0.73      0.49      0.59      2899
           1       0.72      0.88      0.79      4333

    accuracy                           0.72      7232
   macro avg       0.73      0.69      0.69      7232
weighted avg       0.73      0.72      0.71      7232

Accuracy score:  0.724
F1 Score:  0.792
Precision score: 0.721
recall score: 0.879


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [223]:
# calculating confusion matrix for our logistic regression model
conf_matrix = pd.DataFrame(confusion_matrix(y_test, y_pred_lr))
conf_matrix

Unnamed: 0,0,1
0,1425,1474
1,523,3810


In [220]:
# training a K-Nearest neighbors classifier on training dataset
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=20)
knn = knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Preciision, recall, f-score from the multi-class support function
print(classification_report(y_test, y_pred_knn))
print('Accuracy score: ', round(accuracy_score(y_test, y_pred_knn), 3))
print('F1 Score: ', round(f1_score(y_test, y_pred_knn), 3))
print('Precision score:', round(precision_score(y_test, y_pred_knn), 3))
print('recall score:', round(recall_score(y_test, y_pred_knn), 3))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86      2899
           1       0.95      0.84      0.89      4333

    accuracy                           0.88      7232
   macro avg       0.87      0.89      0.87      7232
weighted avg       0.89      0.88      0.88      7232

Accuracy score:  0.877
F1 Score:  0.891
Precision score: 0.947
recall score: 0.841


In [215]:
#confusion matrix for K-Nearest neighbor classification model
conf_matrix = pd.DataFrame(confusion_matrix(y_test, y_pred_knn))
conf_matrix

Unnamed: 0,0,1
0,2455,444
1,609,3724
