In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [123]:
data = pd.read_csv(r"../data/Bank_Personal_Loan_Modelling.csv")
data.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In this case, our products that we are targeting are
   - Mortgage
   - Personal Loan
   - Securities Account
   - CD account
   - Online
   - Credit Card

In [124]:
y_columns = [
    "Mortgage",
    "Personal Loan",
    "Securities Account",
    "CD Account",
    "Online",
    "Credit Card",
]

# Data Analytics

## Checking For Missing and duplicated Values

In [125]:
print("========== Missing Values ==========")
print(data.isnull().sum())
print("========== Duplicate Values ==========")
print(data.duplicated().sum())

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64
0


## Discover Hidden relation in Zip Code

Somethings I can think about Zip Code relation
   - Average GDP of the given Zip code
   - Population Density
   - Education Level

While all this values, if found, can somewhat be correlate to other rows (hopefully not), such as "Education", "CCAvg" etc.. It can give us hidden insights in to the users external factors, such as pressure by society and stuff. Hence, given a certain ZIP code maybe somewhat useless, we can replace it with the 3 mentioned values

In [126]:
import zipcodes
zipcode = data["ZIP Code"].unique()

def get_county(zipcode):
    finder = zipcodes.matching(zipcode)
    if len(finder) > 0:
        val = finder[0]["county"], finder[0]["lat"], finder[0]["long"]
    else:
        val = None, None, None
    return val

data["County"], data["Latitude"], data["Longitude"] = zip(*data["ZIP Code"].astype(str).apply(lambda x : get_county(x)))

In [164]:
col_to_drop = ['Rank within US (of 3142 counties)',
               'Rank within US (of 3143 counties)',
               'People (Education: Less Than 9th Grade)',
               "People (Education: At Least Bachelors Degree)",
               ' FIPS']

In [165]:
# Income (Median Family Income)
california_income = pd.read_csv(r"..\data\california_income.csv", delimiter=",")[2:].reset_index(drop=True).drop(columns=col_to_drop, errors = "ignore")
# Education (Percentage of people with atleast a bachelors)
california_education = pd.read_csv(r"..\data\california_education.csv", delimiter=",")[2:].reset_index(drop=True).drop(columns=col_to_drop, errors = "ignore")
california_education.rename(columns={"Value (Percent)": "County Education"}, inplace=True)
# Total Population
california_population = pd.read_csv(r"..\data\california_population.csv", delimiter=",")[2:].reset_index(drop=True).drop(columns=col_to_drop, errors = "ignore")
california_population["Total Population"] = california_population["People (Age 18-39)"]*100/california_population["Value (Percent)"]
california_population.drop(columns = ["People (Age 18-39)","Value (Percent)"], inplace = True, errors = "ignore")

In [166]:
data.shape

(5000, 17)

In [167]:
data_merge = data.merge(california_income, on="County", how="inner")
data_merge = data_merge.merge(california_education, on="County", how="inner")
data_merge = data_merge.merge(california_population, on="County", how="inner")

In [168]:
col_to_rename = {
    "Personal Loan": "Personal_Loan",
    "Securities Account": "Securities_Account",
    "CD Account": "CD_Account",
    "ZIP Code": "ZIP_Code",
    "Value (Dollars)": "Median_Income",
    "Country_Eudcation": "Country_Education",
    "Total Population": "Total_Population",
}

In [170]:
data_merge.rename(
    columns = col_to_rename, inplace = True, errors = "ignore"
)

In [171]:
data_merge

Unnamed: 0,ID,Age,Experience,Income,ZIP_Code,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard,County,Latitude,Longitude,Median_Income,County Education,Total_Population
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0,Los Angeles County,34.1620,-118.0894,83411,34.6,9.949538e+06
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0,Los Angeles County,34.0218,-118.2883,83411,34.6,9.949538e+06
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0,Alameda County,37.8746,-122.2547,122488,50.9,1.663446e+06
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0,San Francisco County,37.7217,-122.4446,136689,59.8,8.517527e+05
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1,Los Angeles County,34.2429,-118.5273,83411,34.6,9.949538e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4961,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0,Orange County,33.6473,-117.8409,109361,42.8,3.175215e+06
4962,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0,San Diego County,32.8668,-117.2482,96974,41.0,3.286497e+06
4963,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0,Ventura County,34.5210,-119.2477,102141,34.7,8.416505e+05
4964,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0,Los Angeles County,34.0293,-118.3994,83411,34.6,9.949538e+06
