In [1]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    confusion_matrix, roc_curve, 
    precision_recall_fscore_support, plot_roc_curve, roc_auc_score
)

import plotly.express as px
import plotly.subplots as sp

from svm_utils import *
from feature_extraction import *

In [2]:
data_dir = '../../data'
pxm = 'p6m'
start_date = '2021/01/01'
end_date = '2021/06/30'

In [3]:
df_reg = pd.read_csv(os.path.join(data_dir, 'model/{}/full.csv'.format(pxm)))
train = pd.read_csv(os.path.join(data_dir, 'model/{}/train.csv'.format(pxm)))
val = pd.read_csv(os.path.join(data_dir, 'model/{}/validate.csv'.format(pxm)))
test = pd.read_csv(os.path.join(data_dir, 'model/{}/test.csv'.format(pxm)))
test = test.append(val)

train_idxs = [np.where(df_reg['id'] == cid)[0][0] for cid in train['id']]
test_idxs = [np.where(df_reg['id'] == cid)[0][0] for cid in test['id']]

In [4]:
df_main = {}

fname = os.path.join(data_dir, 'main/currentacct_txn.csv')
df_trans = read_current(fname, df_reg['id'])
df_trans = summarise_id_day(df_trans, df_reg['id'], start_date, end_date)
df_main['current'] = df_trans

fname = os.path.join(data_dir, 'main/creditcard_txn.csv')
df_trans = read_credit(fname, df_reg['id'])
df_trans = summarise_id_day(df_trans, df_reg['id'], start_date, end_date)
df_main['credit'] = df_trans

fname = os.path.join(data_dir, 'main/savingacct_txn.csv')
df_trans = read_saving(fname, df_reg['id'])
df_trans = summarise_id_day(df_trans, df_reg['id'], start_date, end_date)
df_main['saving'] = df_trans

df_main['saving'] = df_main['saving'].reset_index(drop=False)
df_main['current'] = df_main['current'].reset_index(drop=False)
df_main['credit'] = df_main['credit'].reset_index(drop=False)

for acc_type in df_main:
    df_main[acc_type] = df_main[acc_type][df_main[acc_type]['day'] >= 0]

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


In [39]:
acc_type='credit'

### Original line

In [40]:
df_big = df_main[acc_type]
df_big = df_big.melt(id_vars=['id', 'day'], value_vars=['transact_c', 'transact_d'], var_name='type', value_name='value')
df_big['type'] = df_big['type'].replace({'transact_c': 'Credit', 'transact_d': 'Debit'})

dates = pd.date_range(start='2021/01/01', end='2021/06/30')
df_big['day'] = df_big.day.replace(dict(zip(df_big.day, dates)))

In [41]:
cid = 20858441

In [42]:
df = df_big[df_big['id'] == cid]

fig1 = px.line(
        df.sort_values('day'), x="day", y="value",
       title='Transaction history', color='type',
    )
fig1.update_layout(showlegend=False)
fig1.show()

### Smoothed line

In [43]:
dict_rolled = roll_df(df_main[acc_type].set_index('id'), df_big['id'].unique(), 14)

  0%|          | 0/6049 [00:00<?, ?it/s]

In [44]:
df_rolled = []
for cid in dict_rolled:
    dict_rolled[cid]['id'] = cid
    dict_rolled[cid]['date'] = pd.date_range(start='2021/01/14', end='2021/06/30')
    df_rolled.append(dict_rolled[cid])
df_rolled = pd.concat(df_rolled)  

tmps = []
for cid in dict_rolled:
    tmp = pd.DataFrame({
        'transact_c': 0,
        'transact_d': 0,
        'id': cid,
        'date': pd.date_range(start='2021/01/01', end='2021/01/13')
    })
    tmps.append(tmp)
df_rolled = df_rolled.append(pd.concat(tmps)).sort_values(['id', 'date'])

df_rolled = df_rolled.melt(id_vars=['id', 'date'], value_vars=['transact_c', 'transact_d'], var_name='type', value_name='value')

df_rolled['type'] = df_rolled['type'].replace({'transact_c': 'Credit', 'transact_d': 'Debit'})

In [45]:
cid = 20858441

df = df_rolled[df_rolled['id'] == cid]

fig2 = px.line(
        df.sort_values('date'), x="date", y="value",
       title='Transaction history (Smoothed)', color='type',
    )
fig2.update_layout(showlegend=False)

fig2.show()

### Histogram

In [46]:
cid = 20858441

df = df_rolled[df_rolled['id'] == cid]
df.loc[:, 'value'] = np.log(df['value'][df['value'] > 0])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [47]:
fig3 = px.histogram(df, x="value", color='type',  nbins=60, opacity=0.75)
fig3.show()

### Combine

In [48]:
cid = 20858441

df1 = df_big[df_big['id'] == cid]
df2 = df_rolled[df_rolled['id'] == cid]

fig1 = px.line(
        df1.sort_values('day'), x="day", y="value", color='type',
    )

fig2 = px.line(
        df2.sort_values('date'), x="date", y="value", color='type',
    )

df2.loc[:, 'value'] = np.log(df2['value'][df2['value'] > 0])
fig3 = px.histogram(df2, x="value", color='type',  nbins=30, opacity=0.75)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [49]:
titles = [
    'Transaction Amount Per Day',
    'Transaction Amount Per Day (Smoothed)',
    'Frequencies of Transaction Amount'
]

fig1_traces = []
fig2_traces = []
fig3_traces = []
for trace in range(len(fig1["data"])):
    fig1_traces.append(fig1["data"][trace])
for trace in range(len(fig2["data"])):
    fig2_traces.append(fig2["data"][trace])
for trace in range(len(fig3["data"])):
    fig3_traces.append(fig3["data"][trace])


fig = sp.make_subplots(rows=3, cols=1, subplot_titles=titles) 


for traces in fig1_traces:
    fig.append_trace(traces, row=1, col=1)
for traces in fig2_traces:
    fig.append_trace(traces, row=2, col=1)
for traces in fig3_traces:
    fig.append_trace(traces, row=3, col=1)

    
for i in range(len(fig.data) - 2):
    fig.data[i]['showlegend'] = False

In [50]:
fig.update_layout(
    margin=dict(l=0, r=0, t=0, b=0),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="left",
        x=0
    ),
    height=700
)

In [52]:
path = os.path.join(data_dir, 'demo', 'algo', 'transaction', acc_type, 'transaction.csv')
df_big.to_csv(path, index=False)

In [53]:
path = os.path.join(data_dir, 'demo', 'algo', 'transaction', acc_type, 'transaction_rolled.csv')
df_rolled.to_csv(path, index=False)