# Predictive Analysis: ANZ Synthesized 3-month Transactional Dataset

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

## Background

Source: https://www.theforage.com/modules/ZLJCsrpkHo9pZBJNY/BiJPfqmGY2QwgN6gA

This task is based on a synthesised transaction dataset containing 3 months’ worth of transactions for 100 hypothetical customers. It contains purchases, recurring transactions, and salary transactions. The dataset is designed to simulate realistic transaction behaviours that are observed in ANZ’s real transaction data, so many of the insights gathered from the activities will be genuine.

*NOTE*: Data preprocessing for this dataset was already done in the notebook for Task 1. For details of the preprocessing, refer to Steps 0 and 1 in the notebook file 'Task 1. Exploratory Data Analysis.ipynb'. The dataset loaded here is the cleaned file named 'DATA/ANZ-synthesized-transactions-cleaned.csv'.

In [4]:
# Location of the cleaned transactions file
file = 'DATA/ANZ-synthesized-transactions-cleaned.csv'
# Load the CSV, using its first column as the index and parsing that index as datetimes
df = pd.read_csv(
    file,
    parse_dates=True,
    index_col=0,
)

In [6]:
# Preview the first five transactions
df.head(5)

Unnamed: 0_level_0,status,card_present_flag,first_name,age,gender,txn_description,balance,amount,geometry,X,Y,merch_suburb,merch_state,merch_geometry,merch_X,merch_Y,distance,country,card_present_bool
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2018-08-01 01:01:15+00:00,authorized,1,Diana,26,F,POS,35.39,16.25,153.41 -27.95,-27.95,153.41,Ashmore,QLD,153.38 -27.99,-27.99,153.38,5.34,Australia,True
2018-08-01 01:13:45+00:00,authorized,0,Diana,26,F,SALES-POS,21.2,14.19,153.41 -27.95,-27.95,153.41,Sydney,NSW,151.21 -33.87,-33.87,151.21,691.63,Australia,False
2018-08-01 01:26:15+00:00,authorized,1,Michael,38,M,POS,5.71,6.42,151.23 -33.94,-33.94,151.23,Sydney,NSW,151.21 -33.87,-33.87,151.21,8.01,Australia,True
2018-08-01 01:38:45+00:00,authorized,1,Rhonda,40,F,SALES-POS,2117.22,40.9,153.10 -27.66,-27.66,153.1,Buderim,QLD,153.05 -26.68,-26.68,153.05,109.2,Australia,True
2018-08-01 01:51:15+00:00,authorized,1,Diana,26,F,SALES-POS,17.95,3.25,153.41 -27.95,-27.95,153.41,Mermaid Beach,QLD,153.44 -28.06,-28.06,153.44,12.6,Australia,True


In [7]:
# Check the shape of the DataFrame as (number of rows, number of columns);
# the index is not counted as a column
df.shape

(12043, 19)

* Rows in dataset: 12043
* Columns in dataset: 19 (the `timestamp` is used as the index and is not counted)

In [11]:
# Confirm the data type of every column prior to any conversion
df.dtypes

status                object
card_present_flag      int64
first_name            object
age                    int64
gender                object
txn_description       object
balance              float64
amount               float64
geometry              object
X                    float64
Y                    float64
merch_suburb          object
merch_state           object
merch_geometry        object
merch_X              float64
merch_Y              float64
distance             float64
country               object
card_present_bool       bool
dtype: object

In [10]:
# Count the number of distinct values in each column
# (pandas excludes missing values from this count by default)
df.nunique()

status                   2
card_present_flag        2
first_name              80
age                     33
gender                   2
txn_description          6
balance              12006
amount                4457
geometry               100
X                       85
Y                       87
merch_suburb          1610
merch_state              9
merch_geometry        2704
merch_X                670
merch_Y                719
distance              4252
country                  1
card_present_bool        2
dtype: int64

In [13]:
# Number of distinct calendar days present in the index:
# format each index timestamp as day/month/year, then count the unique labels
df.index.strftime('%d/%m/%Y').nunique()

91

In [12]:
# Count the missing values in each column
df.isna().sum()

status               0
card_present_flag    0
first_name           0
age                  0
gender               0
txn_description      0
balance              0
amount               0
geometry             0
X                    0
Y                    0
merch_suburb         0
merch_state          0
merch_geometry       0
merch_X              0
merch_Y              0
distance             0
country              0
card_present_bool    0
dtype: int64

In [27]:
# Frequency of each transaction type
df['txn_description'].value_counts()

SALES-POS     3934
POS           3783
PAYMENT       2600
PAY/SALARY     883
INTER BANK     742
PHONE BANK     101
Name: txn_description, dtype: int64

In [36]:
# Keep only the salary transactions, i.e. rows whose transaction type is 'PAY/SALARY'
annual_salary = df.loc[df['txn_description'] == 'PAY/SALARY']

Unnamed: 0_level_0,card_present_flag,age,balance,amount,X,Y,merch_X,merch_Y,distance,card_present_bool
first_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Abigail,0,520,151094.12,25305.41,-454.48,1801.54,0.0,0.0,0.0,0
Alexander,0,228,92901.74,21897.06,-225.96,863.28,0.0,0.0,0.0,0
Amy,0,258,26484.20,9782.16,-222.36,856.62,0.0,0.0,0.0,0
Andrew,0,936,657477.50,9389.04,-453.84,1741.08,0.0,0.0,0.0,0
Antonio,0,364,78678.67,15384.74,-446.60,1620.92,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...
Tim,0,160,295834.60,35343.92,-127.60,463.32,0.0,0.0,0.0,0
Timothy,0,120,42000.30,9844.98,-227.16,871.38,0.0,0.0,0.0,0
Tonya,0,189,50023.10,19881.05,-224.00,812.42,0.0,0.0,0.0,0
Tyler,0,1230,144217.86,31927.72,-755.02,2898.18,0.0,0.0,0.0,0
