### Packages

In [1]:
import pandas as pd
import sklearn
import datetime
import numpy as np


import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import DBSCAN

import pandas as pd

### Data

In [2]:
df = pd.read_csv('gs://shakingshamrocks_eu/test_data_3_sec.csv')

In [3]:
df['datetime'] = df['dates'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

### Datetime Features

#### Functions

In [4]:
def convert_to_mins(val):
    try:
        return(int((val.total_seconds()/60)))
    except ValueError:
        return np.nan

#### Features

In [6]:
df['time_since_last_transaction'] = df['datetime'] - df['datetime'].shift()
df['time_since_last_transaction']= df['time_since_last_transaction'].apply(lambda x: convert_to_mins(x))

In [11]:
df = df.reset_index()

In [14]:
#time since last transaction in 'other party'
main_dict = {}
for party in list(df['other_account_name'].value_counts().index):
    temp_df = df[df['other_account_name'] == party].copy()
    temp_df['time_since_last_trans_party'] = temp_df['datetime'] - temp_df['datetime'].shift()
    pos_dict = pd.Series(temp_df['time_since_last_trans_party'].values, index = temp_df['index'])
    main_dict.update(pos_dict)
df['time_since_last_transaction_party'] = df['index'].map(main_dict)

In [37]:
df.columns

Index(['index', 'Unnamed: 0', 'dates', 'account_name', 'account_IBAN',
       'other_account_name', 'Type', 'amount', 'balance', 'datetime',
       'time_since_last_transaction', 'time_since_last_trans_party',
       'time_since_last_transaction_party', 'time_only'],
      dtype='object')

In [38]:
np.random.seed(0)


df_test = df[df['account_name'] == 'Sarah Brown']
df_test['day'] = pd.to_datetime(df_test.dates).dt.day
df_test.head()

numeric_features = ['amount', 'day']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
# account name used as dummy feature
categorical_features = ['account_name']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('cluster', DBSCAN(0.5))])

prediction = clf.fit_predict(df_test)

df_test['prediction'] = prediction

ValueError: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required.

In [187]:
df_test['prediction'] 

0      0
2      1
3      2
6      2
7      2
      ..
443    2
445    2
446    2
448    2
449    2
Name: prediction, Length: 225, dtype: int64

In [140]:
cols = ['type','date','time','amount','balance','other_party']

In [149]:
mock_data = [
    ['regular','2020-01-05','14:25','-200.00','5470.00','ESB'],
    ['regular','2020-01-07','10:36','-30.00','5270.00','Pharmacy'],
    ['irregular','2020-01-08','12:17','-98.46','5240.00','Tesco'],
    ['regular','2020-01-12','10:23','-1200.00','4040.00','Rent'],
    ['irregular','2020-01-13','11:30','-67.42','3972.58','Tesco'],
    ['regular','2020-01-14','09:47','-30.00','3942.58','Pharmacy'],
    ['discretionary','2020-01-15','09:43','-3.45','3939.13','Starbucks'],
    ['regular','2020-01-16','07:43','-49.99','3889.14','Vodafone'],
    ['discretionary','2020-01-18','10:43','-5.44','3883.70','Starbucks'],
    ['irregular','2020-01-20','18:47','-78.65','3805.05','Tesco'],
    ['regular','2020-01-21','09:36','-30.00','3775.05','Pharmacy'],
    ['regular','2020-01-22','12:00','-100.00','3675.05','Dr Zeuss'],
    ['discretionary','2020-01-23','02:54','-250.00','3425.05','Amazon'],
    ['discretionary','2020-01-23','03:01','-175.00','3250.05','Amazon',],
    ['irregular','2020-01-27','13:58','-102.47','3147.58','Tesco'],
    ['regular','2020-01-28','09:00','+2500.00','5647.58','ACME Corp.'],
    ['regular','2020-01-28','10:14','-30.00','5617.58','Pharmacy'],
    ['discretionary','2020-02-02','09:14','-3.57','5614.01','Starbucks'],
    ['regular','2020-02-05','14:25','-200.00','5414.01','ESB'],
    ['irregular','2020-02-06','15:47','-74.68','5339.33','Tesco'],
    ['discretionary','2020-02-07','10:20','-4.34','5334.99','Starbucks'],
    ['regular','2020-02-07','10:34','-30.00','5304.99','Pharmacy'],
    ['regular','2020-02-12','10:14','-1200.00','4104.99','Rent'],
    ['discretionary','2020-02-12','11:55','-3.75','4101.24','Starbucks'],
    ['irregular','2020-02-12','12:14','-100.27','4000.97','Tesco'],
    ['regular','2020-02-14','09:45','-30.00','3970.97','Pharmacy'],
    ['regular','2020-02-16','07:43','-49.99','3920.98','Vodafone'],
    ['irregular','2020-02-20','11:41','-88.34','3832.64','Tesco'],
    ['discretionary','2020-02-20','20:15','-45.36','3787.28','Amazon'],
    ['regular','2020-02-21','08:57','-30.00','3757.28','Pharmacy'],
    ['discretionary','2020-02-21','09:15','-3.75','3753.53','Starbucks'],
    ['irregular','2020-02-27','18:24','-95.45','3658.08','Tesco'],
    ['regular','2020-02-28','10:46','-30.00','3628.08','Pharmacy'],
    ['regular','2020-02-28','09:00','+2500.00','6128.08','ACME Corp.'],
    ['irregular','2020-03-01','19:01','-120.78','6007.03','Tesco'],
    ['discretionary','2020-03-03','09:14','-3.75','6003.55','Starbucks'],
    ['regular','2020-03-05','14:25','-200.00','5803.55','ESB'],
    ['regular','2020-03-07','10:12','-30.00','5773.55','Pharmacy'],
    ['discretionary','2020-03-07','10:25','-3.75','5769.80','Starbucks'],
    ['irregular','2020-03-08','18:31','-87.32','5682.48','Tesco'],
    ['regular','2020-03-12','10:10','-1200.00','4482.48','Rent'],
    ['regular','2020-03-14','09:45','-30.00','4452.48','Pharmacy'],
    ['regular','2020-03-16','07:43','-49.99','4402.49','Vodafone'],
    ['irregular','2020-03-21','09:31','-58.69','4343.80','Tesco'],
    ['discretionary','2020-03-21','09:57','-4.99','4338.81','Starbucks'],
    ['regular','2020-03-21','10:58','-30.00','4308.81','Pharmacy'],
    ['irregular','2020-03-25','18:42','-96.12','4212.69','Tesco'],
    ['regular','2020-03-28','10:13','-30.00','4182.69','Pharmacy'],
    ['regular','2020-03-28','09:00','+2500.00','6682.69','ACME Corp.'],
    ['discretionary','2020-03-28','10:15','-1534.47','5148.22','Amazon'],
    ['discretionary','2020-03-28','10:57','-799.99','4348.23','Asos']   
]


In [150]:
mock_df = pd.DataFrame(mock_data, columns = cols)

In [151]:
mock_df.head()

Unnamed: 0,type,date,time,amount,balance,other_party
0,regular,2020-01-05,14:25,-200.0,5470.0,ESB
1,regular,2020-01-07,10:36,-30.0,5270.0,Pharmacy
2,irregular,2020-01-08,12:17,-98.46,5240.0,Tesco
3,regular,2020-01-12,10:23,-1200.0,4040.0,Rent
4,irregular,2020-01-13,11:30,-67.42,3972.58,Tesco


### Datetime

#### Functions 

In [152]:
def convert_to_mins(val):
    try:
        return(int((val.total_seconds()/60)))
    except ValueError:
        return np.nan

#### Apply

In [153]:
mock_df['datetime'] =mock_df['date'] + " "+ mock_df['time']

##### time_since_last_transaction

In [154]:
#time since last transaction in mins
mock_df['datetime'] = mock_df['datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M'))
mock_df['time_since_last_transaction'] = mock_df['datetime'] - mock_df['datetime'].shift()
mock_df['time_since_last_transaction']= mock_df['time_since_last_transaction'].apply(lambda x: convert_to_mins(x))

#### time since last transaction on party basis

In [163]:
mock_df = mock_df.reset_index()

In [165]:
#time since last transaction in 'other party'
main_dict = {}
for party in list(mock_df['other_party'].value_counts().index):
    temp_df = mock_df[mock_df['other_party'] == party].copy()
    temp_df['time_since_last_trans_party'] = temp_df['datetime'] - temp_df['datetime'].shift()
    pos_dict = pd.Series(temp_df['time_since_last_trans_party'].values, index = temp_df['index'])
    main_dict.update(pos_dict)
mock_df['time_since_last_trans_party'] = mock_df['index'].map(main_dict)

In [168]:
mock_df['time_since_last_trans_party']= mock_df['time_since_last_trans_party'].apply(lambda x: convert_to_mins(x))

In [15]:
# Extract day of month
mock_df['day_of_month'] = mock_df['date'].apply(lambda x: x[-2:])

In [65]:
mock_df.head()

Unnamed: 0,type,date,time,amount,balance,other_party,day_of_month
0,regular,2020-01-05,14:25,-200.0,5470.0,ESB,5
1,regular,2020-01-07,10:36,-30.0,5270.0,Pharmacy,7
2,irregular,2020-01-08,12:17,-98.46,5240.0,Tesco,8
3,regular,2020-01-12,10:23,-1200.0,4040.0,Rent,12
4,irregular,2020-01-13,11:30,-67.42,3972.58,Tesco,13


In [66]:
party_onehot = pd.get_dummies(mock_df['other_party'])

In [67]:
party_onehot.shape

(51, 10)

In [68]:
party_onehot.head()

Unnamed: 0,ACME Corp.,Amazon,Asos,Dr Zeuss,ESB,Pharmacy,Rent,Starbucks,Tesco,Vodafone
0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,1,0


In [71]:
subset_df = mock_df[['day_of_month','amount']]

In [72]:
subset_df.head()

Unnamed: 0,day_of_month,amount
0,5,-200.0
1,7,-30.0
2,8,-98.46
3,12,-1200.0
4,13,-67.42


In [73]:
target = mock_df['type']

In [74]:
target.head()

0      regular
1      regular
2    irregular
3      regular
4    irregular
Name: type, dtype: object

In [76]:
features_df = pd.concat([subset_df,party_onehot],axis = 1)

In [77]:
features_df.shape

(51, 12)

In [78]:
features_df.head()

Unnamed: 0,day_of_month,amount,ACME Corp.,Amazon,Asos,Dr Zeuss,ESB,Pharmacy,Rent,Starbucks,Tesco,Vodafone
0,5,-200.0,0,0,0,0,1,0,0,0,0,0
1,7,-30.0,0,0,0,0,0,1,0,0,0,0
2,8,-98.46,0,0,0,0,0,0,0,0,1,0
3,12,-1200.0,0,0,0,0,0,0,1,0,0,0
4,13,-67.42,0,0,0,0,0,0,0,0,1,0


In [79]:
X_train,X_test,y_train,y_test = train_test_split(features_df,target)

In [83]:
y_train.shape

(38,)

In [96]:
clf = svm.SVC()

In [97]:
fitted = clf.fit(X_train,y_train)

In [98]:
y_pred = fitted.predict(X_test)

In [87]:
from sklearn.metrics import accuracy_score

In [99]:
accuracy_score(y_test,y_pred)

0.5384615384615384

array([0. , 0. , 0.7])

In [100]:
from sklearn.tree import DecisionTreeClassifier

In [101]:
clf2 = DecisionTreeClassifier()

In [102]:
fitted2 = clf2.fit(X_train,y_train)

In [103]:
y_pred2 = fitted2.predict(X_test)

In [104]:
accuracy_score(y_test,y_pred2)

1.0