In [43]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as pl

from sklearn.model_selection import train_test_split

The CSV data file provided contains synthetic data representing a stratified random sample of customers who have a savings account, and who satisfy several other criteria: they are *all aged 18 or over*, *none of them have retired yet*, and all of them have a current account which (based on separate analysis) appears to be their main current account. The data are for a single year only. All monetary amounts are in GBP and are full-year amounts. 

In [44]:
# Load the customer savings extract (CSV) into a DataFrame.
csv_path = 'data/savings_customers_data_v1.csv'
df = pd.read_csv(csv_path)

```
cust_unique_id	A unique ID generated for each customer for the purposes of this analysis, in order to preserve customer anonymity.
age	The age of each customer in years at the mid-point of the year for which data was queried. 
gender	The self-identified gender recorded by each customer.
ethnicity_group	The self-identified ethnicity group recorded by each customer. 
geo_region	The geographical region of the UK to which the customer's recorded address and postcode correspond. 
net_salary	The total inflows into the customer's current account identified as consisting of salary payments, based on a separate analysis. 
other_income	The total inflows into the customer's current account identified as consisting of non-salary forms of income, based on a separate analysis. 
observed_income	The sum of net_salary and other_income. 
housing_spend	The total outflows from the customer's current account identified as consisting of mortgage, rent and other housing costs, based on a separate analysis. 
childcare_spend	The total outflows from the customer's current account identified as consisting of nursery, childminder and other childcare costs, based on a separate analysis. 
gambling_spend	The total outflows from the customer's current account identified as consisting of spending on gambling websites, based on a separate analysis. 
observed_surplus	The total of observed income minus the total of observed costs. 
credit_score	The credit score assigned to the customer by an internal bank credit scoring process. 
savings_bal_lbg	The total balance of savings held by the customer with Lloyds Banking Group entities as of the start of the year for which data was queried. 
annual_net_savings_lbg	The total of savings payments made by the customer into savings accounts with Lloyds Banking Group entities, minus withdrawals from those accounts, during the year for which data was queried.
```

### 1.	Cleanse the data and prepare it to be suitable for statistical modelling and machine learning. Comment on the steps performed and explain the rationale for them.

In [45]:
# Inspect each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30305 entries, 0 to 30304
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   cust_unique_id          30305 non-null  int64  
 1   age                     30305 non-null  int64  
 2   gender                  30305 non-null  object 
 3   ethnicity_group         30305 non-null  object 
 4   geo_region              30305 non-null  object 
 5   net_salary              30305 non-null  float64
 6   other_income            26067 non-null  float64
 7   observed_income         30305 non-null  float64
 8   housing_spend           22949 non-null  float64
 9   childcare_spend         30305 non-null  float64
 10  gambling_spend          30305 non-null  float64
 11  observed_surplus        30305 non-null  float64
 12  credit_score            30305 non-null  object 
 13  savings_bal_lbg         30305 non-null  float64
 14  annual_net_savings_lbg  30305 non-null

In [46]:
# Report how many rows are missing a value in each partially-populated column.
# The counts are derived from the data rather than hard-coded from the
# df.info() output, so they stay correct if the input file changes.
for column in ('other_income', 'housing_spend'):
    print(f'{column} has {df[column].isna().sum()} null rows')

other_income has 4238 null rows
housing_spend has 7356 null rows


In [47]:
# Duplicate check: compare the row count with the number of distinct customer IDs.
n_rows = len(df)
n_distinct_ids = df.cust_unique_id.nunique()
print('unique customers', n_rows, n_distinct_ids)

unique customers 30305 30305


In [48]:
# List the distinct values recorded in the gender column.
gender_values = df.gender.unique()
print('unique genders', gender_values)

unique genders ['M' 'Other' 'F' '9999']


In [49]:
# Count the customers whose gender is recorded as '9999'.
is_gender_9999 = df.gender == '9999'
df.loc[is_gender_9999, 'cust_unique_id'].count()

360

In [50]:
# List the distinct values recorded in the ethnicity_group column.
ethnicity_values = df.ethnicity_group.unique()
print('ethnicity_group', ethnicity_values)

ethnicity_group ['White British' 'Afro-Carribbean' 'White Other' 'South Asian' 'Other']


In [51]:
# List the distinct values recorded in the geo_region column.
region_values = df.geo_region.unique()
print('geo_region', region_values)

geo_region ['Southeast' 'Midlands' 'Scotland' 'Southwest' 'Wales' 'North']


In [52]:
# check why credit_score is object-type. Trying to cast it to float-type
# string-types will show up as an exception 
def isfloat(num):
    """Flag values that cannot be converted to ``float``.

    Parameters
    ----------
    num : object
        The value to test.

    Returns
    -------
    object
        ``0`` when ``float(num)`` succeeds; otherwise ``num`` itself, so that
        the offending values stand out when the results are de-duplicated.
    """
    try:
        float(num)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "not a float"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate instead of being
        # silently swallowed by a bare ``except``.
        return num
    return 0
    
# Values that cast to float collapse to 0, so only the uncastable entries remain distinct.
castability_flags = df.credit_score.apply(isfloat)
castability_flags.unique()

array([0, 'PROMO34'], dtype=object)

In [53]:
# How many customers have 'PROMO34' recorded as their credit score?
is_promo34 = df.credit_score == 'PROMO34'
df[is_promo34].cust_unique_id.count()

1050

In [54]:
# Check whether any customer already has a credit score of 0.
# credit_score is still object-typed at this point (see the df.info() output),
# so its entries may be strings that never compare equal to the integer 0.
# Compare numerically instead; non-numeric entries become NaN and are excluded.
numeric_credit_score = pd.to_numeric(df.credit_score, errors='coerce')
df[numeric_credit_score == 0].cust_unique_id.count()

0

Reasonable assumptions:

1. Keep gender == 9999 as its own group, on the assumption that the gender of these customers is unknown.
2. In the absence of any background knowledge about 'PROMO34', assume these customers don't have a credit score, i.e. set the credit score to 0 for these customers. 

In [59]:
# Map the 'PROMO34' placeholder to '0' (treated as "no credit score").
# The result is assigned back to the column explicitly: calling
# replace(..., inplace=True) on the df.credit_score accessor is a chained
# in-place update, which warns in recent pandas and does not modify df under
# Copy-on-Write.
df['credit_score'] = df['credit_score'].replace(to_replace='PROMO34', value='0')

In [62]:
# Cast the credit_score column to float.
df['credit_score'] = df['credit_score'].astype(float)

In [63]:
# Summary statistics of the numerical columns.
df.describe()

Unnamed: 0,cust_unique_id,age,net_salary,other_income,observed_income,housing_spend,childcare_spend,gambling_spend,observed_surplus,credit_score,savings_bal_lbg,annual_net_savings_lbg
count,30305.0,30305.0,30305.0,26067.0,30305.0,22949.0,30305.0,30305.0,30305.0,30305.0,30305.0,30305.0
mean,15153.0,41.654182,56585.867472,13580.886176,69246.199349,24719.267675,593.375517,98.40816,51270.967701,1748.052575,840550.9,21299.618218
std,8748.44429,15.444812,11794.832398,5862.046413,14708.576183,4325.260951,2528.447257,2881.362877,16050.305701,399.942384,732337.5,10018.424977
min,1.0,4.0,-1000.0,0.0,32518.45,10573.57,0.0,0.0,-408635.51,0.0,0.0,0.0
25%,7577.0,29.0,50407.53,9962.795,59196.38,21605.14,0.0,0.0,40242.04,1643.36,151421.7,16891.13
50%,15153.0,42.0,56643.52,12716.98,67338.61,24603.17,0.0,0.0,50093.01,1792.25,711381.9,22870.7
75%,22729.0,55.0,64805.76,18749.15,82136.11,28155.7,0.0,0.0,60232.45,1986.02,1397923.0,28352.66
max,30305.0,68.0,104374.08,29752.89,131050.07,42189.02,17778.93,448664.26,128700.41,2719.65,3615059.0,42951.37
