In [2]:
import pyodbc
import pandas as pd
import config as cfg

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
cnxn = pyodbc.connect( 'DRIVER={ODBC Driver 13 for SQL Server};SERVER=' + cfg.mssql['server'] + ';DATABASE=' 
                      + cfg.mssql['database'] + ';UID=' + cfg.mssql['username'] + ';PWD=' + cfg.mssql['password'] )

In [5]:
query = "SELECT * FROM BankView WHERE [State]='TX';"
data = pd.read_sql(query, cnxn, index_col='BankID')
data.head()

Unnamed: 0_level_0,UniqueNum,Name,Address1,Address2,City,State,Zip,Deposit,Lat,Lng,...,MeanPSDistance,PSCount,Take,PDistance,Officers1000,FFLCount,AvgRating,Target,Population,CrimeRate1000
BankID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,6371,"Austin Bank, Texas National Association",169 S. Frankston Hwy,,Frankston,TX,75763,89823000,32.159764,-95.455885,...,8.073403,2,44911,0.0,0.0,3,,,1163.0,42.13
2,221632,"Austin Bank, Texas National Association",2745 South Loop 256,,Palestine,TX,75801,74868000,31.733813,-95.623431,...,2.158156,1,37434,0.484184,1.772727,6,4.8,,18194.0,39.13
3,16168,"Capital One, National Association",2121 South Loop 256,,Palestine,TX,75801,105194000,31.735576,-95.612655,...,2.316097,1,52597,0.46839,1.772727,6,,,18194.0,39.13
4,2196,Citizens National Bank,207 West Spring Street,,Palestine,TX,75801,92886000,31.761967,-95.633062,...,0.134378,1,46443,0.686562,1.772727,6,2.7,,18194.0,39.13
5,569848,"Commercial Bank of Texas, National Association",109 West Parker St,,Elkhart,TX,75839,33338000,31.625053,-95.579769,...,10.5,0,16669,0.0,0.0,1,5.0,,,


In [6]:
data['CrimeRate1000'].mean()

36.84859592711564

In [7]:
data['Population'].mean()

598919.9247945695

In [8]:
data.isnull().sum()

UniqueNum               0
Name                    0
Address1                0
Address2                0
City                    0
State                   0
Zip                     0
Deposit                 0
Lat                     8
Lng                     8
ClosestStationID      419
ClosestPSDistance       0
MeanPSDistance          0
PSCount                 0
Take                    0
PDistance               0
Officers1000            0
FFLCount                0
AvgRating            2404
Target               6507
Population            909
CrimeRate1000         909
dtype: int64

In [9]:
values = {'CrimeRate1000': data['CrimeRate1000'].mean(), 'Population': data['Population'].mean(), 'AvgRating' : data['AvgRating'].mean()}
data.fillna(value=values, inplace=True)
data.shape

(6507, 22)

## Use Model to Predict banks to Target

In [10]:
try:
    import cPickle as pickle
except ImportError:
    import pickle
    
## Load Model
model_filepath = 'targetbanks_randomforestclassifier.pkl'
in_logreg = open(model_filepath, 'rb')
rfc = pickle.load(in_logreg)
in_logreg.close()

In [11]:
feature_cols = ['ClosestPSDistance', 'Take', 'PDistance', 'Officers1000',
       'FFLCount', 'AvgRating', 'Population', 'CrimeRate1000']
X = data[feature_cols]
data['y_pred'] = rfc.predict(X)

In [12]:
# Total TARGETABLE banks
tgtsum = data[data.y_pred == 1].Name.count();
total = data.Name.count()
print(tgtsum, 'out of', total, ' pct:', round((tgtsum/total * 100)), '%')

1061 out of 6507  pct: 16.0 %


# Show Result

In [13]:
# Drop columns
dftarget = data[data.y_pred == 1].copy()
dftarget.drop(['UniqueNum', 
       'Deposit', 'Lat', 'Lng', 'ClosestStationID',
       'MeanPSDistance', 'PSCount', 'Target', 'Population'], axis=1, inplace=True)

## Top 5 banks with highest Take

In [14]:
# Top 5 banks with highest Take
dftop5 = dftarget.sort_values(by=['Take'], ascending=[False]).head(5)
dftop5

Unnamed: 0_level_0,Name,Address1,Address2,City,State,Zip,ClosestPSDistance,Take,PDistance,Officers1000,FFLCount,AvgRating,CrimeRate1000,y_pred
BankID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
429,USAA Federal Savings Bank,10750 Mcdermott Freeway,,San Antonio,TX,78288,3.287828,36000101,0.371217,4.054054,29,3.0,59.25,1
1037,LegacyTexas Bank,5851 Legacy Circle,,Plano,TX,75024,3.452618,1638172,0.354738,1.39677,33,3.0,20.44,1
601,"Wells Fargo Bank, National Association",9821 Broadway,,Pearland,TX,77584,4.944473,1099152,0.205553,2.318182,16,3.708603,19.93,1
3379,"Texas Capital Bank, National Association","1 Riverway, Suite 150",,Houston,TX,77056,3.244498,1097901,0.37555,2.512702,76,1.0,53.76,1
1743,"TIB The Independent Bankersbank, National Asso...",11701 Luna Road,,Dallas,TX,75234,4.074525,742480,0.292548,2.84,27,1.8,41.9,1


## Top 5 of banks with longest distance to Police Station

In [15]:
# Top 5 of banks with highest distance to Police Station
dftop5 = dftarget.sort_values(by=['ClosestPSDistance'], ascending=False).head(5)
dftop5

Unnamed: 0_level_0,Name,Address1,Address2,City,State,Zip,ClosestPSDistance,Take,PDistance,Officers1000,FFLCount,AvgRating,CrimeRate1000,y_pred
BankID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6505,Zapata National Bank,703 Hidalgo Blvd,,Zapata,TX,78076,10.5,37399,0.0,0.0,4,4.5,36.848596,1
860,City Bank,107 West Taylor Street,,Morton,TX,79346,10.5,29222,0.0,0.0,1,3.708603,36.848596,1
3412,Trustmark National Bank,6809 Fm 1960 West,,Houston,TX,77069,10.5,47442,0.0,0.0,42,3.708603,53.76,1
3357,Regions Bank,12100 Huffmeister,,Cypress,TX,77429,10.5,17599,0.0,0.0,34,3.9,36.848596,1
3291,"MERCANTIL BANK, NATIONAL ASSOCIATION",12145 Fm 1960 Road West,,Houston,TX,77065,10.5,42481,0.0,0.0,30,3.708603,53.76,1


# Use K-Means with RandomForest

In [16]:
feature_cols = [ 'Take', 'PDistance', 'Officers1000', 'FFLCount', 'AvgRating', 'CrimeRate1000']
X = data[feature_cols]

In [18]:
# K-means 10
km = KMeans(n_clusters=10, random_state=1)
km.fit(X)
data['cluster'] = km.labels_

In [19]:
data_X = X.copy()
data_X['cluster'] = km.labels_
centers = data_X.groupby('cluster').mean()
centers

Unnamed: 0_level_0,Take,PDistance,Officers1000,FFLCount,AvgRating,CrimeRate1000
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,21510.11,0.420639,1.810202,24.766883,3.724451,36.495588
1,36000100.0,0.371217,4.054054,29.0,3.0,59.25
2,5804019.0,0.535881,2.756409,39.0,4.7,41.9
3,13385490.0,0.60056,2.756409,35.0,3.708603,41.9
4,28485120.0,0.600889,2.756409,35.0,2.6,41.9
5,1290031.0,0.533699,2.391057,44.428571,3.387558,42.658571
6,460179.1,0.501659,2.382405,41.109589,3.644823,41.351391
7,2475113.0,0.512535,2.268996,41.25,3.327151,49.185
8,38696360.0,0.669694,2.828173,49.0,4.0,53.76
9,110619.4,0.447556,1.98975,33.028769,3.622269,38.450155


In [20]:
data[data.y_pred == 1].groupby('cluster').mean()

Unnamed: 0_level_0,Deposit,Lat,Lng,ClosestStationID,ClosestPSDistance,MeanPSDistance,PSCount,Take,PDistance,Officers1000,FFLCount,AvgRating,Population,CrimeRate1000,y_pred
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,79624890.0,30.87759,-97.330138,346.007092,6.182931,7.203386,2.54,39812.18,0.172079,0.772874,19.282857,3.627656,672323.2,38.102409,1.0
1,72000200000.0,29.541059,-98.575308,580.0,3.287828,5.779003,8.0,36000100.0,0.371217,4.054054,29.0,3.0,1498642.0,59.25,1.0
5,2556818000.0,30.799651,-95.886048,129.0,3.88053,5.531378,4.666667,1278408.0,0.311947,2.075884,41.666667,2.569534,911801.3,31.376667,1.0
6,912220900.0,30.822679,-98.567043,409.0,3.970859,5.824221,4.882353,456110.2,0.323502,1.965392,30.941176,3.136306,1022855.0,37.656471,1.0
9,215509900.0,31.065933,-97.263557,357.139394,4.212353,5.712814,3.608824,107754.7,0.299391,1.606042,26.955882,3.547084,815767.0,40.434478,1.0


In [21]:
# Drop columns
dftarget2 = data[data.y_pred == 1].copy()
dftarget2.drop(['UniqueNum', 
       'Deposit', 'Lat', 'Lng', 'ClosestStationID',
       'MeanPSDistance', 'PSCount', 'Target', 'Population'], axis=1, inplace=True)

## Top 5 banks with highest Take

In [22]:
# Top 5 banks with highest Take and lower pDistance values from K-Means centers
dftop5 = dftarget2[dftarget2.cluster.isin([0, 9])].sort_values(by=['Take'], ascending=[False]).head(5)
dftop5

Unnamed: 0_level_0,Name,Address1,Address2,City,State,Zip,ClosestPSDistance,Take,PDistance,Officers1000,FFLCount,AvgRating,CrimeRate1000,y_pred,cluster
BankID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1106,"Wells Fargo Bank, National Association",4975 Preston Park Boulevard,,Plano,TX,75093,4.475632,285174,0.252437,4.308281,35,2.6,20.44,1,9
2703,Allegiance Bank,8727 W. Sam Houston Parkway North,,Houston,TX,77040,7.948971,270999,0.0,0.0,32,4.6,53.76,1,9
3639,"ZB, National Association",13430 Northwest Highway,,Houston,TX,77040,4.454557,263763,0.254544,4.075323,57,3.708603,53.76,1,9
297,Frost Bank,1250 Northeast Loop 410,,San Antonio,TX,78209,2.802044,258927,0.419796,3.990196,42,3.5,59.25,1,9
351,Jefferson Bank,"1777 Ne Loop 410, Ste 100",,San Antonio,TX,78217,3.194971,249876,0.380503,2.767318,40,4.7,59.25,1,9
