In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as pl

from sklearn.model_selection import train_test_split

The csv data file provided contains synthetic data representing a stratified random sample of customers who have a Lloyds Banking Group savings account, and who satisfy several other criteria: they are all aged 18 or over, none of them have retired yet, and all of them have a current account with Lloyds Banking Group that (based on separate analysis) appears to be their main current account. The data are for a single year only. A data dictionary is provided containing definitions of all columns. All monetary amounts are in GBP and are full-year amounts. 

In [10]:
# Load the synthetic savings-customer sample from CSV into a DataFrame.
# The path is relative, so the notebook must be run from the directory that contains `data/`.
df = pd.read_csv('data/savings_customers_data_v1.csv')

```
cust_unique_id	A unique ID generated for each customer for the purposes of this analysis, in order to preserve customer anonymity.
age	The age of each customer in years at the mid-point of the year for which data was queried. 
gender	The self-identified gender recorded by each customer.
ethnicity_group	The self-identified ethnicity group recorded by each customer. 
geo_region	The geographical region of the UK to which the customer's recorded address and postcode correspond. 
net_salary	The total inflows into the customer's current account identified as consisting of salary payments, based on a separate analysis. 
other_income	The total inflows into the customer's current account identified as consisting of non-salary forms of income, based on a separate analysis. 
observed_income	The sum of net_salary and other_income. 
housing_spend	The total outflows from the customer's current account identified as consisting of mortgage, rent and other housing costs, based on a separate analysis. 
childcare_spend	The total outflows from the customer's current account identified as consisting of nursery, childminder and other childcare costs, based on a separate analysis. 
gambling_spend	The total outflows from the customer's current account identified as consisting of spending on gambling websites, based on a separate analysis. 
observed_surplus	The total of observed income minus the total of observed costs. 
credit_score	The credit score assigned to the customer by an internal bank credit scoring process. 
savings_bal_lbg	The total balance of savings held by the customer with Lloyds Banking Group entities as of the start of the year for which data was queried. 
annual_net_savings_lbg	The total of savings payments made by the customer into savings accounts with Lloyds Banking Group entities, minus withdrawals from those accounts, during the year for which data was queried.
```

In [12]:
# Preview the loaded frame; pandas elides the middle rows in the rendered output.
df

Unnamed: 0,cust_unique_id,age,gender,ethnicity_group,geo_region,net_salary,other_income,observed_income,housing_spend,childcare_spend,gambling_spend,observed_surplus,credit_score,savings_bal_lbg,annual_net_savings_lbg
0,1,53,M,White British,Southeast,71602.70,19375.82,90978.52,28093.32,0.0,0.0,62885.20,2140.56,1713363.25,37712.97
1,2,52,Other,White British,Midlands,56111.36,11229.89,67341.25,22975.17,0.0,0.0,44366.07,1805.22,1148312.44,27008.50
2,3,62,M,Afro-Carribbean,Scotland,52804.51,9784.73,62589.24,22308.34,0.0,0.0,40280.90,1692.36,1420711.15,31364.28
3,4,44,Other,White British,Southeast,63360.88,16913.89,80274.76,27281.78,0.0,0.0,52992.99,1943.48,970783.31,27684.71
4,5,65,M,White British,Southeast,73642.58,,73642.58,28985.51,0.0,0.0,68341.32,2233.66,2660671.01,29503.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30300,30301,24,F,White British,Southeast,60827.36,,60827.36,,0.0,0.0,60827.36,1894.09,0.00,17565.48
30301,30302,29,F,White British,Southwest,47874.93,8952.87,56827.80,20716.41,0.0,0.0,36111.39,1604.48,140620.02,16131.26
30302,30303,25,M,White British,Southwest,53489.49,12644.74,66134.22,23979.80,0.0,0.0,42154.42,1719.58,6220.04,16356.25
30303,30304,52,F,White British,Southeast,71290.40,21247.32,92537.72,31927.67,0.0,0.0,60610.05,2119.9,1588091.41,31436.81


In [20]:
# Inspect column dtypes and non-null counts.
# In the output below, other_income and housing_spend have fewer non-null
# entries than the 30305 rows, and credit_score is stored as object rather
# than a numeric dtype — both need handling during cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30305 entries, 0 to 30304
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   cust_unique_id          30305 non-null  int64  
 1   age                     30305 non-null  int64  
 2   gender                  30305 non-null  object 
 3   ethnicity_group         30305 non-null  object 
 4   geo_region              30305 non-null  object 
 5   net_salary              30305 non-null  float64
 6   other_income            26067 non-null  float64
 7   observed_income         30305 non-null  float64
 8   housing_spend           22949 non-null  float64
 9   childcare_spend         30305 non-null  float64
 10  gambling_spend          30305 non-null  float64
 11  observed_surplus        30305 non-null  float64
 12  credit_score            30305 non-null  object 
 13  savings_bal_lbg         30305 non-null  float64
 14  annual_net_savings_lbg  30305 non-null

In [21]:
# Summary statistics for the numeric columns.
# Values in the output below that look implausible and should be reviewed
# during cleaning: minimum age of 4 (the sample is described as aged 18 or
# over), minimum net_salary of -1000, and extreme gambling_spend maximum /
# observed_surplus minimum relative to their quartiles.
df.describe()

Unnamed: 0,cust_unique_id,age,net_salary,other_income,observed_income,housing_spend,childcare_spend,gambling_spend,observed_surplus,savings_bal_lbg,annual_net_savings_lbg
count,30305.0,30305.0,30305.0,26067.0,30305.0,22949.0,30305.0,30305.0,30305.0,30305.0,30305.0
mean,15153.0,41.654182,56585.867472,13580.886176,69246.199349,24719.267675,593.375517,98.40816,51270.967701,840550.9,21299.618218
std,8748.44429,15.444812,11794.832398,5862.046413,14708.576183,4325.260951,2528.447257,2881.362877,16050.305701,732337.5,10018.424977
min,1.0,4.0,-1000.0,0.0,32518.45,10573.57,0.0,0.0,-408635.51,0.0,0.0
25%,7577.0,29.0,50407.53,9962.795,59196.38,21605.14,0.0,0.0,40242.04,151421.7,16891.13
50%,15153.0,42.0,56643.52,12716.98,67338.61,24603.17,0.0,0.0,50093.01,711381.9,22870.7
75%,22729.0,55.0,64805.76,18749.15,82136.11,28155.7,0.0,0.0,60232.45,1397923.0,28352.66
max,30305.0,68.0,104374.08,29752.89,131050.07,42189.02,17778.93,448664.26,128700.41,3615059.0,42951.37


1.	Cleanse the data and prepare it to be suitable for statistical modelling and machine learning. Comment on the steps performed and explain the rationale for them.