In [34]:
import pyodbc
import pandas as pd
import config as cfg

In [35]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

In [36]:
# Open the SQL Server connection. Server, database and credentials are read
# from cfg.mssql; the pieces are joined into one ';'-separated ODBC string.
cnxn = pyodbc.connect(';'.join([
    'DRIVER={ODBC Driver 13 for SQL Server}',
    'SERVER=' + cfg.mssql['server'],
    'DATABASE=' + cfg.mssql['database'],
    'UID=' + cfg.mssql['username'],
    'PWD=' + cfg.mssql['password'],
]))

In [37]:
# Load every BankView row whose State is 'NY' into a DataFrame indexed by BankID,
# then preview the first rows.
query = "SELECT * FROM BankView WHERE [State]='NY';"
data = pd.read_sql(sql=query, con=cnxn, index_col='BankID')
data.head()

Unnamed: 0_level_0,UniqueNum,Name,Address1,Address2,City,State,Zip,Deposit,Lat,Lng,...,MeanPSDistance,PSCount,Take,PDistance,Officers1000,FFLCount,AvgRating,Target,Population,CrimeRate1000
BankID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6509,590864,Ballston Spa National Bank,1973 Western Ave,,Albany,NY,12203,1917000,42.695,-73.8794,...,7.330062,11,958,0.373815,1.09375,8,,,98617.0,43.744993
6510,580447,Ballston Spa National Bank,1207 Troy Schenectady Rd,,Latham,NY,12110,3875000,42.7725,-73.823,...,6.917659,14,1937,0.405223,1.425641,8,,,,
6511,4660,"Bank of America, National Association",69 State Street,,Albany,NY,12201,435463000,42.6499,-73.7522,...,5.144061,9,217731,0.601389,2.820146,5,4.1,,98617.0,43.744993
6512,213160,"Bank of America, National Association",1450 Western Avenue,,Albany,NY,12203,97728000,42.6808,-73.8373,...,7.046373,12,48864,0.290482,2.341346,10,3.0,,98617.0,43.744993
6513,214981,"Bank of America, National Association",1791 Western Avenue,,Albany,NY,12203,62203000,42.6905,-73.8662,...,7.621675,13,31101,0.300045,1.318182,10,5.0,,98617.0,43.744993


In [38]:
# Mean of the CrimeRate1000 column (missing values are skipped by pandas' default).
data.loc[:, 'CrimeRate1000'].mean()

29.42786864578506

In [39]:
# Mean of the Population column (missing values are skipped by pandas' default).
data.loc[:, 'Population'].mean()

3629207.8229166665

In [40]:
# Number of missing values in each column.
data.isnull().sum(axis=0)

UniqueNum               0
Name                    0
Address1                0
Address2                0
City                    0
State                   0
Zip                     0
Deposit                 0
Lat                     9
Lng                     9
ClosestStationID      233
ClosestPSDistance       0
MeanPSDistance          0
PSCount                 0
Take                    0
PDistance               0
Officers1000            0
FFLCount                0
AvgRating            2113
Target               5033
Population           3401
CrimeRate1000        3401
dtype: int64

In [41]:
# Impute missing values with each column's own mean. Only the three columns
# listed here are filled; other columns keep their nulls.
values = {
    column: data[column].mean()
    for column in ('CrimeRate1000', 'Population', 'AvgRating')
}
data.fillna(value=values, inplace=True)
data.shape

(5033, 22)

## Use the model to predict which banks to target

In [42]:
try:
    import cPickle as pickle
except ImportError:
    import pickle

## Load Model
# SECURITY: pickle.load can execute arbitrary code while deserialising, so only
# load a model file that comes from a trusted source.
model_filepath = 'targetbanks_randomforestclassifier.pkl'
# The context manager closes the file even when unpickling raises; a separate
# open()/close() pair would leave the handle open in that case.
with open(model_filepath, 'rb') as in_logreg:
    rfc = pickle.load(in_logreg)

In [43]:
# Columns passed to the loaded classifier.
# NOTE(review): presumably this order must match the one used at training time - confirm.
feature_cols = [
    'ClosestPSDistance',
    'Take',
    'PDistance',
    'Officers1000',
    'FFLCount',
    'AvgRating',
    'Population',
    'CrimeRate1000',
]
X = data.loc[:, feature_cols]
# Store the predicted label of every bank alongside its features.
data['y_pred'] = rfc.predict(X)

In [44]:
# Total TARGETABLE banks: rows predicted as 1 versus all rows, both counted
# through the non-null entries of the Name column.
tgtsum = data.loc[data['y_pred'] == 1, 'Name'].count()
total = data['Name'].count()
print(tgtsum, 'out of', total, ' pct:', round(tgtsum / total * 100), '%')

1697 out of 5033  pct: 34.0 %


## Show Result

In [45]:
# Keep only the banks predicted as targets, without the columns listed below.
# drop() returns a new frame, so `data` itself is left unchanged.
dftarget = data[data['y_pred'] == 1].drop(
    ['UniqueNum', 'Deposit', 'Lat', 'Lng', 'ClosestStationID',
     'MeanPSDistance', 'PSCount', 'Target', 'Population'],
    axis=1,
)

## Top 5 banks with highest Take

In [46]:
# Five targeted banks with the largest Take, in descending order.
dftop5 = dftarget.sort_values('Take', ascending=False).iloc[:5]
dftop5

Unnamed: 0_level_0,Name,Address1,Address2,City,State,Zip,ClosestPSDistance,Take,PDistance,Officers1000,FFLCount,AvgRating,CrimeRate1000,y_pred
BankID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
8936,"JPMorgan Chase Bank, National Association",401 Madison Avenue,,New York,NY,10017,3.320694,174260736,0.367931,5.0,10,1.0,20.357615,1
8850,"HSBC Bank USA, National Association",452 Fifth Avenue,,New York,NY,10018,2.940012,43650604,0.405999,5.0,10,5.0,20.357615,1
8934,"JPMorgan Chase Bank, National Association",401 Madison Avenue,,New York,NY,10017,3.320694,33282569,0.367931,5.0,10,1.0,20.357615,1
8639,"Bank of America, National Association",115 West 42nd Street,,New York,NY,10036,3.113936,22024089,0.388606,5.0,9,3.7,20.357615,1
8768,"Citibank, National Association",399 Park Avenue,,New York,NY,10022,3.602696,12862500,0.33973,5.0,10,3.665959,20.357615,1


## Top 5 banks with the longest distance to the closest police station

In [13]:
# Five targeted banks with the largest ClosestPSDistance, in descending order.
# NOTE(review): every row in the recorded output has ClosestPSDistance == 10.5,
# so which tied rows are shown depends on the sort's tie order. Confirm whether
# 10.5 is a cap or placeholder value.
dftop5 = dftarget.sort_values('ClosestPSDistance', ascending=False).iloc[:5]
dftop5

Unnamed: 0_level_0,Name,Address1,Address2,City,State,Zip,ClosestPSDistance,Take,PDistance,Officers1000,FFLCount,AvgRating,CrimeRate1000,y_pred
BankID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
7436,Berkshire Bank,West Main Street,,West Winfield,NY,13491,10.5,18044,0.0,0.0,2,3.665959,29.427869,1
9484,Greater Hudson Bank,485 Schutt Road Ext,,Middletown,NY,10940,10.5,27535,0.0,0.0,0,3.665959,29.43932,1
9417,"NBT Bank, National Association",Lyndon Corners Branch,,Syracuse,NY,13214,10.5,27081,0.0,0.0,0,3.665959,40.104221,1
10198,New York Community Bank,5770 Hylan Boulevard,,Staten Island,NY,10309,10.5,18347,0.0,0.0,1,3.665959,29.427869,1
10965,The First National Bank of Long Island,42 Deer Park Avenue,,Babylon,NY,11702,10.5,17092,0.0,0.0,9,1.0,29.427869,1


# Use K-Means with RandomForest

In [14]:
# Feature subset used for clustering. This rebinds feature_cols and X.
feature_cols = [
    'Take',
    'PDistance',
    'Officers1000',
    'FFLCount',
    'AvgRating',
    'CrimeRate1000',
]
X = data.loc[:, feature_cols]

In [15]:
# K-means: fit 10 clusters with a fixed random_state and record each bank's label.
# NOTE(review): the features are clustered unscaled. Take is orders of magnitude
# larger than the other columns in the recorded centers, so it likely dominates
# the distance computation - confirm this is intended.
km = KMeans(n_clusters=10, random_state=1).fit(X)
data['cluster'] = km.labels_

In [16]:
# Per-cluster mean of every clustering feature, computed on a labelled copy of X.
data_X = X.assign(cluster=km.labels_)
centers = data_X.groupby('cluster').mean()
centers

Unnamed: 0_level_0,Take,PDistance,Officers1000,FFLCount,AvgRating,CrimeRate1000
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,53831.88,0.390447,2.22521,7.904029,3.66888,29.532274
1,174260700.0,0.367931,5.0,10.0,1.0,20.357615
2,44801050.0,0.520409,5.0,9.0,4.3,20.357615
3,13542670.0,0.464485,5.0,8.666667,3.65532,20.357615
4,23403040.0,0.470427,3.960335,8.0,4.05,24.892742
5,3209889.0,0.360355,4.577656,9.705882,3.687985,22.266888
6,33282570.0,0.367931,5.0,10.0,1.0,20.357615
7,53643000.0,0.632956,5.0,8.0,3.3,20.357615
8,8330004.0,0.490924,4.335788,7.333333,4.521986,31.289375
9,905989.7,0.420778,3.921633,9.71875,3.452295,24.461084


In [17]:
# Per-cluster means for the banks predicted as targets.
# numeric_only=True keeps the text columns (Name, Address1, City, ...) out of the
# aggregation. Older pandas dropped them silently; pandas >= 2.0 raises
# TypeError when they are included.
data[data.y_pred == 1].groupby('cluster').mean(numeric_only=True)

Unnamed: 0_level_0,Deposit,Lat,Lng,ClosestStationID,ClosestPSDistance,MeanPSDistance,PSCount,Take,PDistance,Officers1000,FFLCount,AvgRating,Population,CrimeRate1000,y_pred
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,173899900.0,41.185981,-74.343233,828.96556,4.162956,6.04713,5.122161,86949.73,0.29233,2.157876,7.778392,3.54646,4268936.0,28.926899,1.0
1,348521500000.0,40.7562,-73.9764,896.0,3.320694,3.320694,1.0,174260700.0,0.367931,5.0,10.0,1.0,8566917.0,20.357615,1.0
2,87301210000.0,40.7521,-73.9823,896.0,2.940012,2.940012,1.0,43650600.0,0.405999,5.0,10.0,5.0,8566917.0,20.357615,1.0
3,24384510000.0,40.75625,-73.9787,896.0,3.28612,3.28612,1.0,12192260.0,0.371388,5.0,9.5,3.282979,8566917.0,20.357615,1.0
4,44048180000.0,40.7554,-73.9847,896.0,3.113936,3.113936,1.0,22024090.0,0.388606,5.0,9.0,3.7,8566917.0,20.357615,1.0
5,6207942000.0,40.756413,-73.977187,896.0,3.321574,3.321574,1.0,3103971.0,0.367843,5.0,9.933333,3.726384,8566917.0,20.357615,1.0
6,66565140000.0,40.7562,-73.9764,896.0,3.320694,3.320694,1.0,33282570.0,0.367931,5.0,10.0,1.0,8566917.0,20.357615,1.0
8,14949690000.0,40.7529,-73.9799,896.0,3.040212,3.040212,1.0,7474846.0,0.395979,5.0,10.0,4.282979,8566917.0,20.357615,1.0
9,1811366000.0,40.819338,-73.920953,881.644444,3.268391,4.017399,3.155556,905682.8,0.373161,4.22307,9.733333,3.541892,7281464.0,22.892947,1.0


In [18]:
# Drop columns
dftarget2 = data[data.y_pred == 1].copy()
dftarget2.drop(['UniqueNum', 
       'Deposit', 'Lat', 'Lng', 'ClosestStationID',
       'MeanPSDistance', 'PSCount', 'Target', 'Population'], axis=1, inplace=True)

## Top 5 banks with highest Take among the low-PDistance clusters

In [20]:
# Top 5 banks with highest Take, restricted to the clusters whose K-Means
# centers have the lowest PDistance.
# The six lowest-PDistance clusters are looked up from `centers` instead of
# being hard-coded: K-Means label numbers are arbitrary and can change with the
# data, the seed or the library version. On the recorded run this lookup yields
# the label set [0, 1, 3, 5, 6, 9].
low_pdistance_clusters = centers['PDistance'].nsmallest(6).index
dftop5 = (
    dftarget2[dftarget2.cluster.isin(low_pdistance_clusters)]
    .sort_values(by=['Take'], ascending=[False])
    .head(5)
)
dftop5

Unnamed: 0_level_0,Name,Address1,Address2,City,State,Zip,ClosestPSDistance,Take,PDistance,Officers1000,FFLCount,AvgRating,CrimeRate1000,y_pred,cluster
BankID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
8936,"JPMorgan Chase Bank, National Association",401 Madison Avenue,,New York,NY,10017,3.320694,174260736,0.367931,5.0,10,1.0,20.357615,1,1
8934,"JPMorgan Chase Bank, National Association",401 Madison Avenue,,New York,NY,10017,3.320694,33282569,0.367931,5.0,10,1.0,20.357615,1,6
8768,"Citibank, National Association",399 Park Avenue,,New York,NY,10022,3.602696,12862500,0.33973,5.0,10,3.665959,20.357615,1,3
8652,Bank of China,1045 Avenue Of The Americas,,New York,NY,10018,2.969544,11522011,0.403046,5.0,9,2.9,20.357615,1,3
8817,First Republic Bank,1230 Avenue Of The Americas,,New York,NY,10020,3.401485,4617530,0.359852,5.0,10,4.2,20.357615,1,5
