In [1]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.io as pio
import kaleido 
print('kaleido version:', kaleido.__version__)
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow
import regex

# Render figures inline via the iframe renderer
pio.renderers.default = 'iframe'

# Theming
# mcolors = px.colors.qualitative.Dark24

# Tick/grid settings shared by every axis in the custom template
_axis_base = dict(ticks='outside', tickcolor='lightgray', showgrid=False)

_custom_layout = dict(
    xaxis=dict(**_axis_base, showline=True),
    yaxis=dict(**_axis_base, showline=True, mirror=True),
    yaxis2=dict(**_axis_base),
    # colorway=mcolors,
)
pio.templates['custom'] = go.layout.Template(layout=_custom_layout)

# Layer the custom axis styling on top of the built-in white theme
pio.templates.default = 'plotly_white+custom'

kaleido version: 0.2.1


In [108]:
#TODO 
# Jesse refactor to run in order
# Alex to change his code so that each line has a sender and receiver
#

# Transaction Preprocessing

## E-transfer Transaction Features
- `e_at_risk`
    - General indicator
    - Probability that a message references a species at risk, or parts thereof
    - *could also be a binary flag*
    - *can look at CITES for this list*
- `e_cad_at_risk`
    - Export indicator
    - Probability that message references a species at risk in Canada, or parts thereof (bear, geese, etc.)
    - *could also be a binary flag*
    - *get animal list from fintrac doc and papers*
    - Maaz: I don't think e_at_risk as defined above is useful. CITES list is very long. I will just focus on the Canadian at-risk. And I will call it e_at_risk.
        - VVCP12477508, BMIY44394974, ZGAN21196261, MXBU42708741, VYOC17525287, XGAL12621643 -- for sure
        - IBCA52577297 -- 'moose meat'. David Wilkerson sent 978 to Steven Cruz. Wilkerson is also the guy who bought the bear fangs and skin. idk what to write for this.
- `e_role`
    - General indicator
    - Probability that a message references an IWT related role (e.g. poacher, coordinator, supplier, breeder, trader)
    - *could also be a binary flag*
    - Maaz: regex search for these words did not yield anything, nor did semantic search for poacher, nor semantic search for breeder
- `e_trad_med`
    - Import indicator
    - Probability that a message references traditional medicine
    - *could also be a binary flag*
    - Maaz: semantic search for traditional medicine only brought up one possible hit: lion's mane. But, this is a plant. So idk.
 

*Do we want to separate these into TO and FROM?*

In [2]:
# Locate and load the processed e-transfer table
DATAPATH = Path('../data/')
EMTPATH = DATAPATH / 'processed' / 'emt.parquet'
emt_df = pd.read_parquet(EMTPATH)

# Replace missing messages with empty strings so the text classifiers below
# only ever see str values
emt_df = emt_df.assign(trxn_message=emt_df['trxn_message'].fillna(''))

In [6]:
# Takes an iterable of messages and returns a boolean Series flagging the messages that appear to be about animal trafficking
def classify_message_animal_trafficking(messages):
    """Flag messages that appear to reference at-risk wildlife or wildlife parts.

    A message is flagged when it contains at least one wildlife keyword as a
    whole word AND does not contain any known benign phrase (e.g. "tiger balm",
    "shark tank"). Matching is case-insensitive; missing messages are never
    flagged.

    Parameters
    ----------
    messages : iterable of str
        Transaction messages. When a pandas Series is passed, its index is
        preserved in the result.

    Returns
    -------
    pandas.Series of bool
        True where the message is flagged, aligned with ``messages``.
    """
    messages = pd.Series(messages)  # gives access to the vectorised .str methods
    if messages.empty:
        # An empty non-object Series has no .str accessor; nothing to flag anyway.
        return pd.Series(dtype=bool, index=messages.index)

    # Each entry is a regex fragment; a trailing "s?" accepts the plural.
    keywords = [
        "bears?",
        "cougars?",
        "goose",
        "geese",
        "lynx",
        "moose",
        "crabs?",  # was "crab?", which matched "cra"/"crab" but never "crabs"
        "eel",
        "elver",
        "lobsters?",
        "turtles?",
        "sharks?",
        "wolf",
        "wolves",
        "ivory",
        "tusks?",
        "scales?",
        "fins?",
        "tortoise",
        "gecko",
        "salamander",
        "pangolin",
        "musk deer",
        "macaque",
        "gall",
        "bile",
        "gall ?stones?",
        "tiger",
        "leopard",
        "bones?",
        "rhinoceros",
        "rhino",
        "shell",
        "seahorses?",
        "alligator",
        "crocodile",
        "horns?",
        "skin",
        "fangs?"
    ]
    # Whole-word match so that e.g. "eel" does not fire on "feel".
    pattern = '|'.join(r'\b(?:{})\b'.format(k) for k in keywords)

    # Benign phrases that contain (or co-occur with) a keyword.
    negatives = [
        'shark week',
        'shark ?skin',
        'shark tank',
        'lone wolf',
        'bear grylls',
        'fortnite',
        'fornite',
        'snake ?skin',
        'drank ur bear',
        'wolf-brand',
        'bear market inves',
        'wolf of wall street',
        'electric eel',
        'goose island',
        'shark vacuum',
        'lobster dinner',
        'lobster fest',
        'crab shack',
        'bulldog skin care',
        'turtle wax',
        'goose down',
        'phone skin',
        'fitness club',
        'oil change',
        'duck duck goose',
        'kayak',
        'king crab',
        'soap',
        'phone case',
        'pizzeria',
        'kettle',
        'steakhouse',
        'leopard print',
        'art',
        'tiger balm',
        'leopard spot',
        'tiger woods',
        'tiger king',
        'screen protector',
        'donation',
        'seahorse earrings',
        'tickets',
        'power bank',
        'beach towel',
        'diving service',
        'bedliner',
        'camera'
    ]
    # Anchor each benign phrase at a word start. Without the anchor, short
    # entries such as 'art' matched inside "party", "quarter", "start", ... and
    # silently suppressed genuine hits. The end is left open on purpose so that
    # prefixes ('bear market inves') and plurals ('phone cases') still match.
    negative_pattern = '|'.join(r'\b(?:{})'.format(k) for k in negatives)

    has_keyword = messages.str.contains(pattern, case=False, na=False)
    is_benign = messages.str.contains(negative_pattern, case=False, na=False)
    return has_keyword & ~is_benign

In [9]:
# fuzzy matching with regex example
# regexpr = regex.compile(r"(\bbear\b){e<=1}|(\bfox\b){e<=1}")
# emt_df['e_at_risk'] = emt_df['trxn_message'].map(lambda m : bool(regexpr.search(m)))

# if you want to implement this in the function I wrote, just change how `pattern` is constructed
# pattern = regex.compile('|'.join(['(\\b{}\\b){{e<=1}}'.format(k) for k in keywords]))

# one challenge is e.g. fox would match to for, which gets you lots of false positives
# read https://pypi.org/project/regex/ and you can see that you can specify which character sets to allow for the fuzzy match
# e.g., {s<=2:[a-z]} at most 2 substitutions, which must be in the character set [a-z]
# so to fuzzy match fox but not for, use something like that to exclude matching the `r` character?

# and of course would have to change the str.contains to a map and lambda function

In [7]:
# Flag each e-transfer whose message matches the wildlife keyword classifier
emt_df['e_at_risk'] = classify_message_animal_trafficking(emt_df['trxn_message'])

In [8]:
# Display only the flagged e-transfers for manual review
emt_df[emt_df['e_at_risk']]

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_message,emt_value,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender,e_at_risk,e_role,e_trad_med
48979,CUST75429953,CUST29156003,VINCENT PEREZ,JAMES HILL,"salamander,thnks!",3331.0,VVCP12477508,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,True,0,0
75791,CUST77650473,EXTERNAL907939,DAISY SMITH,LAGAN DESAI,"1 plr br rug, 2 wlf skin",4576.0,BMIY44394974,,,,,0.0,0.0,0.0,0.0,True,0,0
134658,CUST85769551,CUST33059790,DAVID WILKERSON,DR.JOHN WADE,"For bear skin, as dicussed",4260.0,ZGAN21196261,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,True,0,0
167585,CUST28295610,CUST45674350,HAZEL DATTA,DAVID COOLEY,Ivory,1640.0,MXBU42708741,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,True,0,0
224011,EXTERNAL531873,CUST45674350,ROBERT CARROLL,DAVID COOLEY,4 pieces of ivory,1921.0,VYOC17525287,1.0,0.0,1.0,0.0,,,,,True,0,0
422121,CUST73707963,CUST45674350,ÉLISABETH LACROIX,DAVID COOLEY,Rihno horn collection,946.0,AZEY15264226,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,True,0,0
446082,CUST85769551,CUST33059790,DAVID WILKERSON,DR.JOHN WADE,bear fangs,2200.0,XGAL12621643,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,True,0,0
480263,CUST85769551,CUST80174234,DAVID WILKERSON,STEVEN CRUZ,moose meat,978.0,IBCA52577297,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0,0


In [9]:
# Takes an iterable of messages and returns a boolean Series flagging the messages that mention a traditional-medicine plant
# I have left this here but I am not sure if this is useful. the animals are already covered in the previous function.
# based on: https://www.canada.ca/en/environment-climate-change/services/convention-international-trade-endangered-species/publications/animal-plant-ingredients-traditional-medicine.html#_03
def classify_message_trad_medicine_plant(messages):
    """Flag messages that mention a plant used in traditional medicine.

    A message is flagged when it contains one of the plant keywords as a whole
    word (case-insensitive) and none of the benign phrases in ``negatives``.
    Missing messages are never flagged.

    Parameters
    ----------
    messages : iterable of str
        Transaction messages. When a pandas Series is passed, its index is
        preserved in the result.

    Returns
    -------
    pandas.Series of bool
        True where the message is flagged, aligned with ``messages``.
    """
    messages = pd.Series(messages)  # gives access to the vectorised .str methods

    # NOTE(review): "root" and "fern" are common words and may produce many
    # false positives (e.g. "root beer") -- consider adding negatives.
    keywords = [
        "aucklandia",
        "costus",
        "root",
        "eagle ?wood",
        "fern",
        "golden haired dog",
        "golden hair dog",
        "nard"
    ]
    pattern = '|'.join(r'\b(?:{})\b'.format(k) for k in keywords)

    negatives = []

    classification = messages.str.contains(pattern, case=False, na=False)

    # Only apply the exclusion when there is something to exclude: joining an
    # empty list gives '', and an empty regex matches every string, which
    # previously turned every result into False.
    if negatives:
        negative_pattern = '|'.join(negatives)
        is_benign = messages.str.contains(negative_pattern, case=False, na=False)
        classification = classification & ~is_benign

    return classification

In [58]:
# dummy
def classify_messsage_iwt_role(messages):
    """Placeholder IWT-role classifier: returns one 0 per message."""
    n_messages = len(pd.Series(messages))
    return [0] * n_messages

In [59]:
# Placeholder feature: the classifier above currently returns 0 for every message
emt_df['e_role'] = classify_messsage_iwt_role(emt_df['trxn_message'])

In [60]:
# dummy
def classify_messsage_trad_med(messages):
    """Placeholder traditional-medicine classifier: returns one 0 per message."""
    n_messages = len(pd.Series(messages))
    return [0] * n_messages

In [61]:
# Placeholder feature: the classifier above currently returns 0 for every message
emt_df['e_trad_med'] = classify_messsage_trad_med(emt_df['trxn_message'])

In [63]:
# Write the frame, including the new e_* columns, back to EMTPATH (overwrites the file)
emt_df.to_parquet(EMTPATH, index=False)

## Wire Transfer Transaction Features
- `w_to_country`
    - Import indicator
    - Binary flag
    - 1 if the wire transfer is to a jurisdiction of concern:
        - China
        - Hong Kong
        - South Africa
        - Australia
        - ...
- `w_from_country`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if from a jurisdiction of concern, 0 otherwise
- `w_external_to_animal`
    - Binary flag
    - 1 if from a jurisdiction of concern to an animal related business, 0 otherwise

In [28]:
# Locations of the processed tables used in this section (built on DATAPATH)
WIREPATH = DATAPATH / 'processed' / 'wire.parquet'
KYCPATH = DATAPATH / 'processed' / 'kyc.parquet'
EMTPATH = DATAPATH / 'processed' / 'emt.parquet'


# Country codes treated as jurisdictions of concern by the w_* flags.
# NOTE(review): the feature description above also lists Hong Kong and South
# Africa, but only CN and AU are included here -- confirm the intended list and
# the country codes used in the wire data.
SUSCOUNTRIES = ['CN', 'AU']

In [33]:
#Load Data
# NOTE(review): kyc_df is loaded here but not used by the flags below --
# confirm whether it is still needed in this section.
kyc_df = pd.read_parquet(KYCPATH)
kyc_df = kyc_df[['cust_id', 'occ_animal']]

wire_df = pd.read_parquet(WIREPATH)

#w_to_country: 1 when the receiving country is in SUSCOUNTRIES, else 0
wire_df['w_to_country'] = wire_df['country_receiver'].isin(SUSCOUNTRIES).astype('int64')

#w_from_country: 1 when the sending country is in SUSCOUNTRIES, else 0
wire_df['w_from_country'] = wire_df['country_sender'].isin(SUSCOUNTRIES).astype('int64')

#w_external_to_animal: 1 when the sender is outside Canada and the receiver
# has occ_animal_receiver == 1, else 0.
# NOTE(review): the feature description says "from a jurisdiction of concern",
# but this tests for any non-'CA' sender -- confirm which is intended.
from_abroad = wire_df['country_sender'] != 'CA'
to_animal_business = wire_df['occ_animal_receiver'] == 1
wire_df['w_external_to_animal'] = (from_abroad & to_animal_business).astype('int64')

In [34]:
# Write the frame with the new w_* columns back to WIREPATH (overwrites the file)
wire_df.to_parquet(WIREPATH, index=False)
# Show three random rows as a spot check
wire_df.sample(3)

Unnamed: 0,cust_id_sender,cust_id_receiver,name_sender,name_receiver,trxn_value,country_sender,country_receiver,trxn_id,occ_wealth_receiver,occ_animal_receiver,occ_int_receiver,label_receiver,occ_wealth_sender,occ_animal_sender,occ_int_sender,label_sender,w_to_country,w_from_country,w_external_to_animal
2593,CUST46488790,EXTERNAL648252,CHRISTOPHER THOMPSON,MRS. OLWETHU DINABANTU,1536.0,CA,SA,WEVE63600140,,,,,0.0,0.0,0.0,0.0,0,0,0
54648,CUST74454544,CUST39351940,PATRICIA JOHNSON,THOMAS WILSON DDS,4275.0,CA,CA,PWJU65538543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0
60265,CUST56867358,CUST41543659,DR.BRIAN GONZALES,LUCY-AURÉLIE BRISSON,2634.0,CA,CA,PHTV16985227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


## Cash Transaction Features
- `c_large`
    - General indicator
    - Binary flag
    - 1 if a deposit or withdrawal is an outlier (above Q3+1.5*IQR)
    - To determine outliers, data is grouped by occupation and by transaction type (deposit/withdrawal)

In [23]:
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow

# Paths for this section (DATAPATH and KYCPATH are re-declared with the same values as earlier cells)
DATAPATH = Path('../data/')
CASHPATH = DATAPATH / 'processed' / 'cash.parquet'
KYCPATH = DATAPATH / 'processed' / 'kyc.parquet'

UPPERBOUND = 1.5 #Q3 + UPPERBOUND*IQR gives the transaction amount beyond which transactions are classified as outliers

In [28]:
#Load Data
kyc_df = pd.read_parquet(KYCPATH)
kyc_df = kyc_df[['cust_id', 'occupation']]

cash_df = pd.read_parquet(CASHPATH)
# CASHPATH is overwritten with the enriched frame after this cell, so on a
# re-run the file already carries these columns. Drop them first so the merge
# yields a single 'occupation' column and the flag is recomputed.
cash_df = cash_df.drop(columns=['occupation', 'c_large'], errors='ignore')
cash_df = cash_df.merge(kyc_df, on='cust_id', how='left')

#Get IQR for Outlier Test (one fence per transaction type and occupation)
q3 = cash_df.groupby(['type', 'occupation'])['trxn_amount'].quantile(0.75)
q1 = cash_df.groupby(['type', 'occupation'])['trxn_amount'].quantile(0.25)
iqr = q3-q1
fence = q3+UPPERBOUND*iqr #upper bound for outlier tests
fence_dict = fence.to_dict() # {(type, occupation): upper fence}

#Test outliers
def outlier_test(row, fence):
    """Return 1 if the row's amount is above the fence of its group, else 0.

    Parameters
    ----------
    row : row of cash_df exposing ``type``, ``occupation`` and ``trxn_amount``
    fence : mapping from ``(type, occupation)`` to the upper outlier bound

    Rows whose ``(type, occupation)`` pair has no fence -- e.g. the occupation
    is missing after the left merge, and groupby skips missing keys by
    default -- return 0 instead of raising KeyError.
    """
    limit = fence.get((row.type, row.occupation))
    if limit is None:
        return 0

    return 1 if row.trxn_amount > limit else 0

cash_df['c_large'] = cash_df.apply(lambda row: outlier_test(row, fence_dict), axis=1)
print(f'Classified {cash_df.c_large.sum()} of {cash_df.c_large.count()} as outliers')

Classified 9774 of 212532 as outliers


In [30]:
# Write the frame with occupation and c_large back to CASHPATH (overwrites the file)
cash_df.to_parquet(CASHPATH, index=False)
# Show three random rows as a spot check
cash_df.sample(3)

Unnamed: 0,cust_id,trxn_amount,type,trxn_id,occ_wealth,occ_animal,occ_int,label,occupation,c_large
103847,CUST25178401,7490,deposit,AMUV20063371,0,0,0,0,Construction Contractor,0
206645,CUST91586622,4090,withdrawal,YMYJ74086394,0,0,0,0,"Freelancer (e.g., Graphic Designer, Writer)",0
188818,CUST23412850,5895,deposit,YQZC11904658,1,0,0,0,Loan or Finance Company Owner,0


# General Transaction Features
- `t_to_animal`
    - Import indicator
    - Binary flag
    - 1 if the transaction is to an animal related business
- `t_from_animal`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if the transaction is from an animal related business
- `t_to_animal_large`
    - Import indicator
    - Binary flag
    - 1 if the transaction is to an animal related business and is above a certain dollar amount, 0 otherwise
    - *tbd the dollar amount*
- `t_from_animal_large`
    - *not listed in fintrac doc, but could be a useful flag as well*
    - Binary flag
    - 1 if the transaction is from an animal related business and is above a certain dollar amount, 0 otherwise
    - *tbd the dollar amount*
- `t_to_shipping`
    - Export indicator
    - Binary flag
    - 1 if the transaction is to someone in shipping/postal/cargo
    - *no label for this type of occupation yet*


In [2]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.express as px

In [3]:
# Raw transaction tables are CSV exports
emt_data = pd.read_csv('../data/raw/emt.csv')
wire_data = pd.read_csv('../data/raw/wire.csv')
cash_data = pd.read_csv('../data/raw/cash.csv')
# The KYC table is a parquet file (the other sections load this same path with
# read_parquet); read_csv cannot parse it.
kyc_data = pd.read_parquet('../data/processed/kyc.parquet')

In [3]:
# First, rename columns to have a consistent schema, and add a column to each
# dataframe to indicate the type of transaction
cash_data = cash_data.rename(columns={
    'amount': 'trxn_amount',
    }).assign(trxn_type='cash')
emt_data = emt_data.rename(columns={
    'emt value': 'trxn_amount',
    'id sender': 'cust_id_sender',
    'id receiver': 'cust_id_receiver',
    'name sender': 'name_sender',
    'name receiver': 'name_receiver',
    'emt message': 'trxn_message',
    }).assign(trxn_type='emt')
wire_data = wire_data.rename(columns={
    'wire value': 'trxn_amount',
    'id sender': 'cust_id_sender',
    'id receiver': 'cust_id_receiver',
    'name sender': 'name_sender',
    'name receiver': 'name_receiver',
    'country sender': 'country_sender',
    'country receiver': 'country_receiver',
    }).assign(trxn_type='wire')


def _one_row_per_party(trxn_df, country_of):
    """Duplicate each transaction into a sender row and a receiver row.

    Every input row yields up to two output rows, one keyed on the sender and
    one on the receiver, with the party's id in ``cust_id`` and its role in
    ``direction`` ('sender' or 'receiver'). Rows whose ``cust_id`` does not
    start with 'CUST' (including missing ids) are dropped, and the ``name``
    column is removed at the end.

    Parameters
    ----------
    trxn_df : DataFrame with cust_id_sender/cust_id_receiver and
        name_sender/name_receiver columns.
    country_of : {'own', 'counterparty'}
        'own' renames the party's own ``country_<role>`` column to ``country``
        (a no-op when the column is absent, since rename ignores unknown
        labels). 'counterparty' renames the other side's country column to
        ``country`` and drops the party's own country column.

    Returns
    -------
    DataFrame with sender rows first, then receiver rows, on a fresh index.
    """
    if country_of not in ('own', 'counterparty'):
        raise ValueError("country_of must be 'own' or 'counterparty'")

    halves = []
    for role, other in (('sender', 'receiver'), ('receiver', 'sender')):
        country_col = f'country_{role}' if country_of == 'own' else f'country_{other}'
        # columns that describe the other party and are not needed on this row
        unused = [f'cust_id_{other}', f'name_{other}']
        if country_of == 'counterparty':
            unused.append(f'country_{role}')

        half = (
            trxn_df
            .rename(columns={
                f'cust_id_{role}': 'cust_id',
                f'name_{role}': 'name',
                country_col: 'country',
                })
            .assign(direction=role)
            .drop(columns=unused)
        )
        # drop rows where cust_id doesn't begin with 'CUST'; na=False treats a
        # missing id as "not a customer" instead of producing an invalid mask
        is_customer = half['cust_id'].str.startswith('CUST', na=False)
        halves.append(half[is_customer])

    # remerge the two halves and drop the name column
    return pd.concat(halves, ignore_index=True).drop(columns=['name'])


# duplicate all transactions in emt_data and wire_data to have a sender and receiver as the cust_id
emt_data = _one_row_per_party(emt_data, country_of='own')
# wire rows keep the country of the other side of the transfer
wire_data = _one_row_per_party(wire_data, country_of='counterparty')

In [6]:
# Attach each customer's KYC attributes to every transaction table.
# Left join: transactions without a KYC match are kept, with missing KYC fields.
emt_data, wire_data, cash_data = (
    trxn_table.merge(kyc_data, on='cust_id', how='left')
    for trxn_table in (emt_data, wire_data, cash_data)
)

In [7]:
# Inspect the merged e-transfer table
emt_data

Unnamed: 0,cust_id,tx_message,tx_amount,trxn_id,tx_type,direction,Name,Gender,Occupation,Age,Tenure,occ_wealth,occ_animal,occ_int,label
0,CUST26232205,for the bike u lent me,154.0,WFEZ76031047,emt,sender,JASON GARRISON,male,Luthier,32.0,13.0,0,0,0,0
1,CUST35533148,,518.0,XQJS86205330,emt,sender,ANTHONY ROBERSON,male,Hotelier,48.0,18.0,0,0,0,0
2,CUST59096559,,46.0,WPXP45854083,emt,sender,KEVIN PARK,male,Import/Export Business Owner,34.0,8.0,1,0,1,0
3,CUST69049633,,570.0,OIRZ70883325,emt,sender,ZHU FENG LAN,male,Retail Salesperson,35.0,0.0,0,0,0,0
4,CUST27403977,,480.0,TRNT55099512,emt,sender,IND.DAVID DUNLAP JR.,other,Private Security Company Owner,69.0,14.0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
704566,CUST74979363,,119.0,USHN74907347,emt,receiver,WANDA HAYDEN,female,Cashier,27.0,1.0,0,0,0,0
704567,CUST68693554,,208.0,VXES44436032,emt,receiver,MICHAEL CRUZ,male,Unknown,35.0,0.0,0,0,0,0
704568,CUST90504001,Fox racing motocross gear,150.0,LTUK21435620,emt,receiver,MR. ERIC WALTERS,male,Real Estate Broker,29.0,5.0,1,0,1,0
704569,CUST99824006,,270.0,OIRO35201076,emt,receiver,JOHN HUNT JR.,male,Jewelry Dealer,33.0,2.0,1,0,1,0


In [14]:
# Dollar amount above which an animal-related transaction counts as "large"
threshold = 1000

# EMT and wire rows carry a `direction`, so each flag is split into to/from
for trxn in (emt_data, wire_data):
    incoming = trxn['direction'] == 'receiver'
    outgoing = trxn['direction'] == 'sender'
    is_animal = trxn['occ_animal'] == 1
    is_int = trxn['occ_int'] == 1
    is_large = trxn['trxn_amount'] > threshold

    trxn['t_to_animal'] = np.where(is_animal & incoming, 1, 0)
    trxn['t_from_animal'] = np.where(is_animal & outgoing, 1, 0)
    trxn['t_to_animal_large'] = np.where(is_animal & incoming & is_large, 1, 0)
    trxn['t_from_animal_large'] = np.where(is_animal & outgoing & is_large, 1, 0)
    trxn['t_to_int'] = np.where(is_int & incoming, 1, 0)
    trxn['t_from_int'] = np.where(is_int & outgoing, 1, 0)

# Cash rows have no direction: the "to" flags depend only on the customer's
# occupation flag (and amount), and the "from" flags are always 0
cash_is_animal = cash_data['occ_animal'] == 1
cash_is_large = cash_data['trxn_amount'] > threshold

cash_data['t_to_animal'] = np.where(cash_is_animal, 1, 0)
cash_data['t_from_animal'] = 0
cash_data['t_to_animal_large'] = np.where(cash_is_animal & cash_is_large, 1, 0)
cash_data['t_from_animal_large'] = 0
cash_data['t_to_int'] = np.where(cash_data['occ_int'] == 1, 1, 0)
cash_data['t_from_int'] = 0

In [15]:
# Inspect the e-transfer table with the new t_* flag columns
emt_data

Unnamed: 0,cust_id,tx_message,tx_amount,trxn_id,tx_type,direction,Name,Gender,Occupation,Age,...,occ_wealth,occ_animal,occ_int,label,t_to_animal,t_from_animal,t_to_animal_large,t_from_animal_large,t_to_int,t_from_int
0,CUST26232205,for the bike u lent me,154.0,WFEZ76031047,emt,sender,JASON GARRISON,male,Luthier,32.0,...,0,0,0,0,0,0,0,0,0,0
1,CUST35533148,,518.0,XQJS86205330,emt,sender,ANTHONY ROBERSON,male,Hotelier,48.0,...,0,0,0,0,0,0,0,0,0,0
2,CUST59096559,,46.0,WPXP45854083,emt,sender,KEVIN PARK,male,Import/Export Business Owner,34.0,...,1,0,1,0,0,0,0,0,0,1
3,CUST69049633,,570.0,OIRZ70883325,emt,sender,ZHU FENG LAN,male,Retail Salesperson,35.0,...,0,0,0,0,0,0,0,0,0,0
4,CUST27403977,,480.0,TRNT55099512,emt,sender,IND.DAVID DUNLAP JR.,other,Private Security Company Owner,69.0,...,1,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
704566,CUST74979363,,119.0,USHN74907347,emt,receiver,WANDA HAYDEN,female,Cashier,27.0,...,0,0,0,0,0,0,0,0,0,0
704567,CUST68693554,,208.0,VXES44436032,emt,receiver,MICHAEL CRUZ,male,Unknown,35.0,...,0,0,0,0,0,0,0,0,0,0
704568,CUST90504001,Fox racing motocross gear,150.0,LTUK21435620,emt,receiver,MR. ERIC WALTERS,male,Real Estate Broker,29.0,...,1,0,1,0,0,0,0,0,1,0
704569,CUST99824006,,270.0,OIRO35201076,emt,receiver,JOHN HUNT JR.,male,Jewelry Dealer,33.0,...,1,0,1,0,0,0,0,0,1,0


In [None]:
# Write the three transaction tables, including the t_* flags, to the processed folder.
# NOTE(review): these are the same files the e-transfer, wire and cash sections
# above read from and write their e_*/w_*/c_* columns to, so this export
# replaces whatever those sections saved -- confirm the intended run order.
emt_data.to_parquet('../data/processed/emt.parquet')
wire_data.to_parquet('../data/processed/wire.parquet')
cash_data.to_parquet('../data/processed/cash.parquet')

In [36]:
# Alex to clean all data before final export (drop useless columns)