# Imports and loading the dataset

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [3]:
# Read the transaction workbook (relative path, so it is resolved against the
# current working directory) and preview the first five rows.
Anz = pd.read_excel("ANZ synthesised transaction dataset.xlsx")
Anz.head()

Unnamed: 0,status,card_present_flag,bpay_biller_code,account,currency,long_lat,txn_description,merchant_id,merchant_code,first_name,...,age,merchant_suburb,merchant_state,extraction,amount,transaction_id,country,customer_id,merchant_long_lat,movement
0,authorized,1.0,,ACC-1598451071,AUD,153.41 -27.95,POS,81c48296-73be-44a7-befa-d053f48ce7cd,,Diana,...,26,Ashmore,QLD,2018-08-01T01:01:15.000+0000,16.25,a623070bfead4541a6b0fff8a09e706c,Australia,CUS-2487424745,153.38 -27.99,debit
1,authorized,0.0,,ACC-1598451071,AUD,153.41 -27.95,SALES-POS,830a451c-316e-4a6a-bf25-e37caedca49e,,Diana,...,26,Sydney,NSW,2018-08-01T01:13:45.000+0000,14.19,13270a2a902145da9db4c951e04b51b9,Australia,CUS-2487424745,151.21 -33.87,debit
2,authorized,1.0,,ACC-1222300524,AUD,151.23 -33.94,POS,835c231d-8cdf-4e96-859d-e9d571760cf0,,Michael,...,38,Sydney,NSW,2018-08-01T01:26:15.000+0000,6.42,feb79e7ecd7048a5a36ec889d1a94270,Australia,CUS-2142601169,151.21 -33.87,debit
3,authorized,1.0,,ACC-1037050564,AUD,153.10 -27.66,SALES-POS,48514682-c78a-4a88-b0da-2d6302e64673,,Rhonda,...,40,Buderim,QLD,2018-08-01T01:38:45.000+0000,40.9,2698170da3704fd981b15e64a006079e,Australia,CUS-1614226872,153.05 -26.68,debit
4,authorized,1.0,,ACC-1598451071,AUD,153.41 -27.95,SALES-POS,b4e02c10-0852-4273-b8fd-7b3395e32eb0,,Diana,...,26,Mermaid Beach,QLD,2018-08-01T01:51:15.000+0000,3.25,329adf79878c4cf0aeb4188b4691c266,Australia,CUS-2487424745,153.44 -28.06,debit


In [4]:
# List every column of the raw transaction table (head() above elides some).
Anz.columns

Index(['status', 'card_present_flag', 'bpay_biller_code', 'account',
       'currency', 'long_lat', 'txn_description', 'merchant_id',
       'merchant_code', 'first_name', 'balance', 'date', 'gender', 'age',
       'merchant_suburb', 'merchant_state', 'extraction', 'amount',
       'transaction_id', 'country', 'customer_id', 'merchant_long_lat',
       'movement'],
      dtype='object')

### Deriving each customer's mean salary payment (stored as `annual_salary`)

In [5]:
# Average every numeric column over each customer's "PAY/SALARY" transactions;
# the resulting "amount" column is the customer's mean salary payment.
# numeric_only=True is needed on pandas >= 2.0, where averaging the text
# columns (status, account, ...) raises TypeError instead of silently
# dropping them. Older versions produce the same frame either way.
Anz_salaries = (
    Anz[Anz["txn_description"] == "PAY/SALARY"]
    .groupby("customer_id")
    .mean(numeric_only=True)
)
Anz_salaries.head()

Unnamed: 0_level_0,card_present_flag,merchant_code,balance,age,amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CUS-1005756958,,0.0,4718.665385,53,970.47
CUS-1117979751,,0.0,11957.202857,21,3578.65
CUS-1140341822,,0.0,5841.72,28,1916.51
CUS-1147642491,,0.0,8813.467692,34,1711.39
CUS-1196156254,,0.0,23845.717143,34,3903.73


In [6]:
# Attach each customer's mean salary payment to every one of their
# transactions. A vectorised lookup keyed on customer_id replaces the per-row
# Python loop; the int64 cast truncates toward zero exactly as int() did.
# A customer with no PAY/SALARY row still fails loudly (the cast rejects NaN).
# NOTE: despite the column name, this is the average amount of a single
# salary payment, not an annualised figure.
Anz["annual_salary"] = (
    Anz["customer_id"].map(Anz_salaries["amount"]).astype("int64")
)

In [7]:
# Collapse the transaction table to one row per customer by averaging the
# numeric columns. annual_salary is constant within a customer, so its mean
# is that same value.
# numeric_only=True is needed on pandas >= 2.0, where averaging the text
# columns raises TypeError instead of silently dropping them.
Anz_cus = Anz.groupby("customer_id").mean(numeric_only=True)
Anz_cus.head()

Unnamed: 0_level_0,card_present_flag,merchant_code,balance,age,amount,annual_salary
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CUS-1005756958,0.8125,0.0,2275.852055,53,222.862603,970
CUS-1117979751,0.826923,0.0,9829.929,21,339.8437,3578
CUS-1140341822,0.815385,0.0,5699.21225,28,212.6325,1916
CUS-1147642491,0.75,0.0,9032.841186,34,245.600169,1711
CUS-1196156254,0.785276,0.0,22272.433755,34,147.145796,3903


# Predictive Analytics

### Linear Regression

In [9]:
# Ordered hold-out split over customers: the first 80% of rows (in index
# order) are used for fitting, the remaining 20% for evaluation. The target is
# annual_salary; every other column of Anz_cus is a feature.
N_train = int(0.8 * len(Anz_cus))
cus_features, cus_target = Anz_cus.drop(columns="annual_salary"), Anz_cus["annual_salary"]
X_train, X_test = cus_features.iloc[:N_train], cus_features.iloc[N_train:]
Y_train, Y_test = cus_target.iloc[:N_train], cus_target.iloc[N_train:]

In [10]:
# Ordinary least-squares regressor with scikit-learn's default settings.
linear_reg = LinearRegression()

In [11]:
# Fit on the training customers and show the in-sample R^2 in one expression
# (fit() returns the estimator itself, so the two calls can be chained).
linear_reg.fit(X_train, Y_train).score(X_train, Y_train)

0.23295376366257825

In [12]:
# Predicted annual_salary for each held-out customer.
linear_reg.predict(X_test)

array([1993.98473311, 2867.39066481, 1944.95959591, 1806.85984885,
       2226.35045442, 2075.34697175, 1813.02987337, 5388.67435983,
       1902.35351608, 2191.90445145, 1713.48134178, 2854.40519949,
       2094.77781158, 3815.34342881, 2249.92922822, 1768.80816189,
       2095.02988288, 1515.18425875, 1782.72752537, 2481.2898546 ])

In [13]:
# R^2 on the held-out customers. A negative value means the model predicts
# worse than a constant equal to the mean of Y_test would.
linear_reg.score(X_test, Y_test)

-0.31694234980747327

### Decision Tree - Classification and Regression

In [14]:
# Transaction-level inputs for the tree models: four text columns (one-hot
# encoded in the cells below) plus the numeric age.
Anz_cat = Anz[["txn_description", "gender", "age", "merchant_state", "movement"]]

In [15]:
# Preview the one-hot encoding: each text column expands into one indicator
# column per category, while the numeric age column passes through unchanged.
pd.get_dummies(Anz_cat).head()

Unnamed: 0,age,txn_description_INTER BANK,txn_description_PAY/SALARY,txn_description_PAYMENT,txn_description_PHONE BANK,txn_description_POS,txn_description_SALES-POS,gender_F,gender_M,merchant_state_ACT,merchant_state_NSW,merchant_state_NT,merchant_state_QLD,merchant_state_SA,merchant_state_TAS,merchant_state_VIC,merchant_state_WA,movement_credit,movement_debit
0,26,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1
1,26,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1
2,38,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1
3,40,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1
4,26,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1


In [17]:
# Row-ordered 80/20 split of the one-hot encoded transactions; the encoding is
# computed once and then sliced.
# NOTE(review): rows are transactions and annual_salary is constant per
# customer, so the same customer can land on both sides of this split. The
# test scores of the tree models may therefore be optimistic; consider
# splitting by customer_id instead.
N_train = int(0.8 * len(Anz))
txn_features, txn_target = pd.get_dummies(Anz_cat), Anz["annual_salary"]
X_train, X_test = txn_features.iloc[:N_train], txn_features.iloc[N_train:]
Y_train, Y_test = txn_target.iloc[:N_train], txn_target.iloc[N_train:]

#### Classification

In [18]:
# Classification tree; when fitted on annual_salary it treats every distinct
# salary value as its own class label.
# random_state is pinned so that ties between equally good splits are broken
# the same way on every run, keeping the reported scores reproducible.
decision_tree_class = DecisionTreeClassifier(random_state=0)

In [19]:
# Fit on the training transactions and show the training accuracy in one
# expression (fit() returns the estimator itself, so the calls can be chained).
decision_tree_class.fit(X_train, Y_train).score(X_train, Y_train)

0.7882499481004774

In [20]:
# Predicted salary class for each held-out transaction.
decision_tree_class.predict(X_test)

array([1013, 1043, 4132, ..., 4054, 1043,  996], dtype=int64)

In [21]:
# Mean accuracy on the held-out transactions, i.e. the share of rows whose
# predicted salary value matches annual_salary exactly.
decision_tree_class.score(X_test, Y_test)

0.755085097550851

#### Regression

In [22]:
# Regression tree with default depth and leaf settings.
# random_state is pinned so that ties between equally good splits are broken
# the same way on every run, keeping the reported scores reproducible.
decision_tree_reg = DecisionTreeRegressor(random_state=0)

In [23]:
# Fit on the training transactions and show the in-sample R^2 in one
# expression (fit() returns the estimator itself, so the calls can be chained).
decision_tree_reg.fit(X_train, Y_train).score(X_train, Y_train)

0.7468978726536879

In [24]:
# Predicted annual_salary for each held-out transaction.
decision_tree_reg.predict(X_test)

array([1226.42857143, 1043.        , 4132.        , ..., 3345.04761905,
       1043.        , 1626.        ])

In [25]:
# R^2 on the held-out transactions.
decision_tree_reg.score(X_test, Y_test)

0.6822816148137859