In [1]:
import nbimporter

In [2]:
import numpy as np
import pandas as pd
import re

from datetime import datetime

from kevin_data_exploration import clean_text

In [3]:
outflows = pd.read_parquet('../data/ucsd-outflows.pqt')
outflows

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,LOAN,900.60,2022-07-05,LOAN
1,0,acc_0,ATM_CASH,80.00,2022-03-25,ATM_CASH
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
3,0,acc_0,LOAN,634.00,2023-01-10,LOAN
4,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
...,...,...,...,...,...,...
2597483,5941,acc_9524,ATM_CASH,8.42,2023-01-25,ATM_CASH
2597484,5941,acc_9524,ATM_CASH,2.06,2023-01-25,ATM_CASH
2597485,5941,acc_9524,ATM_CASH,262.88,2023-01-25,ATM_CASH
2597486,5941,acc_9524,ATM_CASH,10.00,2023-01-25,ATM_CASH


In [4]:
cleaned_data = clean_text(outflows)

In [5]:
cleaned_data.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,loan,900.6,2022-07-05,LOAN
1,0,acc_0,atmcash,80.0,2022-03-25,ATM_CASH
2,0,acc_0,tst casa del rio exp fairlawn,18.42,2022-09-26,FOOD_AND_BEVERAGES
3,0,acc_0,loan,634.0,2023-01-10,LOAN
4,0,acc_0,buffalo wild wings,26.47,2022-09-12,FOOD_AND_BEVERAGES


## add features

In [6]:
def day_of_week(date):
    return date.strftime('%A')

def day_of_month(date):
    return date.day

def dollar_amount(amount):
    return int(amount == int(amount))

def find_emails(memo):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    match = re.search(email_pattern, memo)
    return match.group(0) if match else None

def contains_com_link(memo):
    com_pattern = r'\b\w+\.com\b'
    return bool(re.search(com_pattern, memo))

def com_link(memo):
    com_pattern = r'\b\w+\.com\b'
    match = re.search(com_pattern, memo)
    return match.group(0) if match else None

In [7]:
def add_features(data):
    
    df = data.copy()
    
    # day of week
    df['posted_day'] = df['posted_date'].apply(day_of_week)

    # day of month
    df['posted_day'] = df['posted_date'].apply(day_of_month)

    # whole dollar amount
    df['whole_number'] = df['amount'].apply(dollar_amount)

    # emails
    df['email'] = df['memo'].apply(find_emails)

    # includes link
    df['contains_link'] = df['memo'].apply(contains_com_link)

    # link
    df['link'] = df['memo'].apply(com_link)
    
    return df

In [8]:
df_feat = add_features(cleaned_data)
df_feat.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,posted_day,whole_number,email,contains_link,link
0,0,acc_0,loan,900.6,2022-07-05,LOAN,5,0,,False,
1,0,acc_0,atmcash,80.0,2022-03-25,ATM_CASH,25,1,,False,
2,0,acc_0,tst casa del rio exp fairlawn,18.42,2022-09-26,FOOD_AND_BEVERAGES,26,0,,False,
3,0,acc_0,loan,634.0,2023-01-10,LOAN,10,1,,False,
4,0,acc_0,buffalo wild wings,26.47,2022-09-12,FOOD_AND_BEVERAGES,12,0,,False,


In [9]:
df_feat[df_feat['link'].notna()].groupby('category')['link'].count()

category
EDUCATION                 449
FOOD_AND_BEVERAGES       5369
GENERAL_MERCHANDISE    103664
GROCERIES                4279
OVERDRAFT                   6
PETS                     1435
RENT                       10
TRAVEL                   7974
Name: link, dtype: int64

In [10]:
df_feat[df_feat['email'].notna()].groupby('category')['email'].count()

category
FOOD_AND_BEVERAGES      18
GENERAL_MERCHANDISE    216
TRAVEL                   2
Name: email, dtype: int64

In [11]:
df_feat[df_feat['email'].notna()]['email'].unique()

array(['cc@google.comca', 'cc@google.com', 'help@bumble.com',
       'info@woot.cotx', 'support@pantaya.co', 'info@woot.com',
       'cc@vgw.co', 'info@woot.comt', 'mike@latnp.coca',
       'support@usemotion.com', 'gp@vgw.co', 'ar@guard.com',
       'abarnes@uptv.ga', 'cc@google.comcaus', 'bostev@aol.co',
       'info@parks.ca', 'help@hint.app', 'bermeajrjose@gmail.com',
       'warnett@gci.cakus', 'sul@triaco.nemi', 'bilikin1@att.cous'],
      dtype=object)