# GOAL

Anonymize data from AR app to allow public sharing.
- AR Comments (OK)
- Countries (N/A)
- DSO (OK)
- ExchangeRates (N/A)
- Invoice Item Detail
- Invoices (OK)
- Items (OK)
- Link Table (OK)
- Product Lines (OK)
- Subsidiaries (OK)

# PACKAGES

In [191]:
import pandas as pd
from anonympy.pandas import dfAnonymizer
from anonympy.pandas.utils_pandas import available_methods
from anonympy.pandas.utils_pandas import fake_methods
import os
import gcsfs
import pickle
from random import shuffle

# PARAMETERS

In [174]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../secrets/gcp_qlik_key.json'
source_path='gs://qlik-demos-data/finance/in/'
destination_path='gs://qlik-demos-data/finance/out/'
pd_options = {"token": os.environ['GOOGLE_APPLICATION_CREDENTIALS']}
fs = gcsfs.GCSFileSystem(token=os.environ['GOOGLE_APPLICATION_CREDENTIALS'])

# seeds and keys for anonymization
key = 'qlikrulesaboveallothers'
seed = 1001

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# files
    
    dict_files = {
    'Countries': 'AR_Countries V1',
    "Invoice Item Detail":"AR_Invoice Item Detail V1",
    "Product Lines": "AR_Product Lines V1",
    "Invoices": "AR_Invoices V1",
    "Items":"AR_Items V1",
    "Comments":"AR_Comments V1",
    "DSO":"AR_DSO V1",
    "Link Table":"AR_Link Table V1",
    "Subsidiaries":"AR_Subsidiaries V1",
    "ExchangeRates":"AR_ExchangeRates V1"
    }

# FUNCTIONS

## noise_amount_column

In [175]:
def noise_amount_column(original_column):
    noise_column=original_column.replace(".-","-0.",regex=True).astype('float')
    return noise_column.apply(lambda x: round(x*2/3+50000,1) if x>=0 else round(x*2/3-50000,1))

## scramble_column

In [176]:
def scramble_column(original_column):
    scrambled_column=original_column.copy()
    def scramble_str(original_str):
        def return_number(number=0.3):
            return number
        
        scrambled_str=list(original_str)
        shuffle(scrambled_str,return_number)

        return "".join([str(item) for item in scrambled_str])
    return scrambled_column.apply(scramble_str)

## sequential_values_for_column

In [177]:
def sequencial_values_for_column(original_column):
    columns_names={'index':'new',0:'original'}
    sequencial_values_for_column=pd.DataFrame(set(original_column)).reset_index().rename(columns=columns_names)
    sequencial_values_for_column=pd.merge(
        original_column,
        sequencial_values_for_column,
        left_on=original_column.name,
        right_on='original',
        how='left').drop(
            columns=['original',original_column.name])
    return sequencial_values_for_column.rename(columns={'new':original_column.name})

## fake_data_for_column

In [178]:
def fake_data_for_column(original_column,fake_method):

    # create a 2 columns dataframe with the unique values from original_column twice
    df_exclusive_values=original_column.drop_duplicates()
    df_exclusive_values=pd.merge(df_exclusive_values,df_exclusive_values,how='inner',left_index=True,right_index=True,suffixes=('','_fake'))

    anon_exclusive_values=dfAnonymizer(df_exclusive_values)
    anon_exclusive_values.categorical_fake({original_column.name+'_fake':fake_method},seed=seed)
    
    fake_data_for_column=pd.merge(original_column,
        anon_exclusive_values.to_df(),
        how='left',
        on=original_column.name
        ).drop(columns=[original_column.name])
    return fake_data_for_column.rename(columns={original_column.name+'_fake':original_column.name})

# DATA ANONYMIZATION

## Subsidiaries

In [179]:
# read original file from gcs
df_subsidiaries=pd.read_csv(source_path+dict_files['Subsidiaries']+'.csv',storage_options=pd_options)
df_subsidiaries['NetSuite Subsidiary ID']=df_subsidiaries['NetSuite Subsidiary ID'].astype('str')

print('original dataframe')
display(df_subsidiaries.head())

# anonymize dataframe
anon_subsidiaries = dfAnonymizer(df_subsidiaries)

anon_subsidiaries.categorical_tokenization('%SubsidiaryCode',max_token_len=3,key=key)
anon_subsidiaries.categorical_fake({'Subsidiary':'company'},seed=seed)
anon_subsidiaries.column_suppression(['Is Attunity Subsidiary','VAT Registration Number'])
anon_subsidiaries.categorical_resampling(
    ['Subsidiary Currency Code','Subsidiary Region'],seed=seed)

print(anon_subsidiaries.info())

df_subsidiaries_anon=anon_subsidiaries.to_df()
df_subsidiaries_anon['NetSuite Subsidiary ID']=df_subsidiaries_anon['%SubsidiaryCode']
df_subsidiaries_anon['Subsidiary Legal Name']=df_subsidiaries_anon['Subsidiary']
df_subsidiaries_anon['Workday Subsidiary Name']=df_subsidiaries_anon['Subsidiary']
df_subsidiaries_anon['Subsidiary Region']=df_subsidiaries_anon['Subsidiary Region'].replace({'Technologies':'World'},inplace=False)

# merge original and anonymized dataframes
df_subsidiaries=df_subsidiaries.join(df_subsidiaries_anon,how='inner',lsuffix='_orig')
print('full dataframe')
display(df_subsidiaries.head())

# persist anonymized df to GCS
df_subsidiaries_anon.to_csv(destination_path+dict_files['Subsidiaries']+'.csv',index=False)

# persist mapping tables to GCS
map_subsidiary_code = dict(zip(df_subsidiaries['%SubsidiaryCode_orig'], df_subsidiaries['%SubsidiaryCode']))
map_subsidiary_currency_code = dict(zip(df_subsidiaries['Subsidiary Currency Code_orig'], df_subsidiaries['Subsidiary Currency Code']))

with fs.open(destination_path+'map_subsidiary_code.pickle', 'wb') as handle:
    pickle.dump(map_subsidiary_code, handle, protocol=pickle.HIGHEST_PROTOCOL)
with fs.open(destination_path+'map_subsidiary_currency_code.pickle', 'wb') as handle:
    pickle.dump(map_subsidiary_currency_code, handle, protocol=pickle.HIGHEST_PROTOCOL)

df_subsidiaries = anon_subsidiaries=df_subsidiaries_anon=map_subsidiary_code=map_subsidiary_currency_code=[]



original dataframe


Unnamed: 0,Subsidiary,%SubsidiaryCode,Subsidiary Legal Name,Subsidiary Currency Code,Subsidiary Is Active,NetSuite Subsidiary ID,VAT Registration Number,Is Active,Is Elimination,Subsidiary Region,Workday Subsidiary Name,Is Attunity Subsidiary
0,Qlik Foreign Parent AB - Do Not Use,NA1,Qlik Foreign Parent AB - Do Not Use,SEK,No,54,,No,No,,,
1,Expressor Software Corporation,EXP,Expressor Software Corporation,USD,No,46,,No,No,Americas,Expressor Software Corporation,No
2,Purchase Price Adjustments,PPA,Purchase Price Adjustments,USD,Yes,56,,Yes,No,Technologies,,No
3,QlikTech Holdings Inc.,HOI,QlikTech Holdings Inc.,USD,Yes,13,,Yes,No,Technologies,,Yes
4,QlikTech Belgium,BEL,QlikTech Belgium,EUR,No,5,BE0848691897,No,No,EMEA,,No


+--------------------------+--------+-------------+--------------------+
|          Column          | Status |    Type     |       Method       |
| Subsidiary               | 1      | categorical | Synthetic Data     |
+--------------------------+--------+-------------+--------------------+
| %SubsidiaryCode          | 1      | categorical | Tokenization       |
+--------------------------+--------+-------------+--------------------+
| Subsidiary Legal Name    | 0      | categorical |                    |
+--------------------------+--------+-------------+--------------------+
| Subsidiary Currency Code | 1      | categorical | Resampling         |
+--------------------------+--------+-------------+--------------------+
| Subsidiary Is Active     | 0      | categorical |                    |
+--------------------------+--------+-------------+--------------------+
| NetSuite Subsidiary ID   | 0      | categorical |                    |
+--------------------------+--------+-------------+

Unnamed: 0,Subsidiary_orig,%SubsidiaryCode_orig,Subsidiary Legal Name_orig,Subsidiary Currency Code_orig,Subsidiary Is Active_orig,NetSuite Subsidiary ID_orig,VAT Registration Number,Is Active_orig,Is Elimination_orig,Subsidiary Region_orig,Workday Subsidiary Name_orig,Is Attunity Subsidiary,Subsidiary,%SubsidiaryCode,Subsidiary Legal Name,Subsidiary Currency Code,Subsidiary Is Active,NetSuite Subsidiary ID,Is Active,Is Elimination,Subsidiary Region,Workday Subsidiary Name
0,Qlik Foreign Parent AB - Do Not Use,NA1,Qlik Foreign Parent AB - Do Not Use,SEK,No,54,,No,No,,,,Hodges and Sons,7a1,Hodges and Sons,EUR,No,7a1,No,No,APAC,Hodges and Sons
1,Expressor Software Corporation,EXP,Expressor Software Corporation,USD,No,46,,No,No,Americas,Expressor Software Corporation,No,Peters Group,110,Peters Group,EUR,No,110,No,No,APAC,Peters Group
2,Purchase Price Adjustments,PPA,Purchase Price Adjustments,USD,Yes,56,,Yes,No,Technologies,,No,Russell LLC,c4f,Russell LLC,USD,Yes,c4f,Yes,No,APAC,Russell LLC
3,QlikTech Holdings Inc.,HOI,QlikTech Holdings Inc.,USD,Yes,13,,Yes,No,Technologies,,Yes,"Banks, Morales and Armstrong",e62,"Banks, Morales and Armstrong",SEK,Yes,e62,Yes,No,APAC,"Banks, Morales and Armstrong"
4,QlikTech Belgium,BEL,QlikTech Belgium,EUR,No,5,BE0848691897,No,No,EMEA,,No,"Suarez, Johnson and Avery",b0a,"Suarez, Johnson and Avery",USD,No,b0a,No,No,World,"Suarez, Johnson and Avery"


## AR Comments

In [180]:
# read original file from gcs
df_comments = pd.read_csv(source_path+dict_files['Comments']+'.csv',storage_options=pd_options)
#df_comments[['comment_date','comment_text']]=df_comments['%ARCommentKey'].str.split('|',expand=True,n=1)
#df_comments['comment_date']=pd.to_datetime(df_comments['comment_date']).dt.date
print('original dataframe')
display(df_comments.head())

# anonymize dataframe
anon_comments=dfAnonymizer(df_comments)
#anon_comments.column_suppression(['comment_text'])
anon_comments.categorical_tokenization(['%ARCommentKey'],max_token_len=10,key=key)
#anon_comments.datetime_noise('comment_date',seed=seed)

df_comments_anon=anon_comments.to_df()
df_comments_anon['AR Comments']=df_comments_anon['AR Comments'].apply(lambda x:0 if pd.isna(x) else 1)

anon_comments.info()

df_comments=df_comments.join(df_comments_anon,how='inner',lsuffix='_orig')
print('full dataframe')
display(df_comments.head())

#persist anonymized df to GCS
df_comments_anon.to_csv(destination_path+dict_files['Comments']+'.csv',index=False)

# persist mapping tables to GCS
map_comment_key = dict(zip(df_comments['%ARCommentKey_orig'], df_comments['%ARCommentKey']))
with fs.open(destination_path+'map_comment_key.pickle', 'wb') as handle:
    pickle.dump(map_comment_key, handle, protocol=pickle.HIGHEST_PROTOCOL)

df_comments=df_comments_anon=map_comment_key=[]


original dataframe


Unnamed: 0,%ARCommentKey,AR Comments,comment_date,comment_text
0,1/1/2019|230569 PT. Evotech Distribusi,,2019-01-01,230569 PT. Evotech Distribusi
1,"1/1/2019|230572 IDW2- IntegraÃ§Ã£o e Desenvolvimento, Lda.",12/31/18-VÃ­ctor- (renewal). End user informed contract cancellation out of date.,2019-01-01,"230572 IDW2- IntegraÃ§Ã£o e Desenvolvimento, Lda."
2,1/1/2019|230578 BUSINESS & DECISION FRANCE,13/12 FM O/S in should be paid by EOM,2019-01-01,230578 BUSINESS & DECISION FRANCE
3,1/1/2019|230582 PT Mitra Integrasi Informatika,"20/Aug/18 Vivien: Indomarco MA $1.8K - Invoice has been processed for payment, Iwan will confirm on 20/8 the payment date",2019-01-01,230582 PT Mitra Integrasi Informatika
4,1/1/2019|230583 SSL Software Systems LLC,"21/12/2018 JIE//, renewal. Partner conf. will be provided PO by end user within a week (inv14684)",2019-01-01,230583 SSL Software Systems LLC


+---------------+--------+-------------+-----------------------+
|    Column     | Status |    Type     |        Method         |
| %ARCommentKey | 1      | categorical | Tokenization          |
+---------------+--------+-------------+-----------------------+
| AR Comments   | 0      | categorical |                       |
+---------------+--------+-------------+-----------------------+
| comment_date  | 1      | categorical | Datetime Perturbation |
+---------------+--------+-------------+-----------------------+
| comment_text  | 1      | categorical | Column Suppression    |
+---------------+--------+-------------+-----------------------+
full dataframe


Unnamed: 0,%ARCommentKey_orig,AR Comments_orig,comment_date_orig,comment_text,%ARCommentKey,AR Comments,comment_date
0,1/1/2019|230569 PT. Evotech Distribusi,,2019-01-01,230569 PT. Evotech Distribusi,84a2a45593,0,2019-09-02
1,"1/1/2019|230572 IDW2- IntegraÃ§Ã£o e Desenvolvimento, Lda.",12/31/18-VÃ­ctor- (renewal). End user informed contract cancellation out of date.,2019-01-01,"230572 IDW2- IntegraÃ§Ã£o e Desenvolvimento, Lda.",6303e07530,1,2019-03-05
2,1/1/2019|230578 BUSINESS & DECISION FRANCE,13/12 FM O/S in should be paid by EOM,2019-01-01,230578 BUSINESS & DECISION FRANCE,33b6ff8c43,1,2019-06-26
3,1/1/2019|230582 PT Mitra Integrasi Informatika,"20/Aug/18 Vivien: Indomarco MA $1.8K - Invoice has been processed for payment, Iwan will confirm on 20/8 the payment date",2019-01-01,230582 PT Mitra Integrasi Informatika,ba55a70d3f,1,2018-03-03
4,1/1/2019|230583 SSL Software Systems LLC,"21/12/2018 JIE//, renewal. Partner conf. will be provided PO by end user within a week (inv14684)",2019-01-01,230583 SSL Software Systems LLC,08d451c05c,1,2018-12-27


## DSO

In [181]:
# read original file from gcs
df_dso = pd.read_csv(source_path+dict_files['DSO']+'.csv')
#df_dso['NetSuite Extract DateTime']=pd.to_datetime(df_dso['NetSuite Extract DateTime'])
# split '%DSOKey' in period and subsidiary code to apply different anonymization
df_dso[['period','subsidiary_code']]=df_dso['%DSOKey'].str.split('|',expand=True)
#df_dso['period']=pd.to_datetime(df_dso['period'])

print('original dataframe')
display(df_dso.head())

# read mapping tables from gcs

with fs.open(destination_path+'map_subsidiary_code.pickle', 'rb') as handle:
    map_subsidiary_code = pickle.load(handle)
with fs.open(destination_path+'map_subsidiary_currency_code.pickle', 'rb') as handle:
    map_subsidiary_currency_code = pickle.load(handle)

# anonymize dataframe
anon_dso=dfAnonymizer(df_dso)
#anon_dso.datetime_noise(['NetSuite Extract DateTime','period'],seed=seed)
anon_dso.info()

df_dso_anon=anon_dso.to_df()
df_dso_anon['Transaction Line Amount - Local']=noise_amount_column(
    df_dso_anon['Transaction Line Amount - Local'])
df_dso_anon['Transaction Line Amount - USD']=noise_amount_column(
    df_dso_anon['Transaction Line Amount - USD'])
# df_dso_anon['%DSOKey']=df_dso_anon[
#     'period'].dt.strftime("%Y-%m")+'|'+df_dso_anon['subsidiary_code'].map(map_subsidiary_code)
# df_dso_anon['From Currency Code']=df_dso_anon['From Currency Code'].map(map_subsidiary_currency_code)
df_dso_anon['%DSOKey']=df_dso_anon[
    'period']+'|'+df_dso_anon['subsidiary_code'].map(map_subsidiary_code)
df_dso_anon['From Currency Code']=df_dso_anon['From Currency Code'].map(map_subsidiary_currency_code)



# merge original and anonymized dataframes
df_dso=df_dso.join(df_dso_anon,how='inner',lsuffix='_orig')
print('full dataframe')
display(df_dso.head())

# persist anonymized df to GCS
df_dso_anon.to_csv(destination_path+dict_files['DSO']+'.csv',index=False)

# persist mapping tables to GCS
map_dso_key = dict(zip(df_dso['%DSOKey_orig'], df_dso['%DSOKey']))
with fs.open(destination_path+'map_dso_key.pickle', 'wb') as handle:
    pickle.dump(map_dso_key, handle, protocol=pickle.HIGHEST_PROTOCOL)

# map_period=dict(zip(df_dso['period_orig'], df_dso['period']))
# with fs.open(destination_path+'map_period.pickle', 'wb') as handle:
#     pickle.dump(map_period, handle, protocol=pickle.HIGHEST_PROTOCOL)

# drop artificial columns created by splitting '%DSOKey'
df_dso=df_dso.drop(columns=['period','subsidiary_code','period_orig','subsidiary_code_orig'])
df_dso_anon=df_dso_anon.drop(columns=['period','subsidiary_code'])

df_dso=df_dso_anon=anon_dso=map_subsidiary_code=map_subsidiary_currency_code=map_dso_key=map_period=[]

original dataframe


Unnamed: 0,NetSuite Extract DateTime,Transaction Line Amount - Local,Transaction Line Amount - USD,%DSOKey,DSO Amount Type,From Currency Code,period,subsidiary_code
0,2022-04-06 02:27:59,3912778.8,4430557.0,2022-01|FRA,Revenue,,2022-01-01,FRA
1,2022-04-06 02:27:59,-15024.3,-16827.22,2022-04|FRA,Revenue,,2022-04-01,FRA
2,2022-04-06 02:27:59,3915833.96,4309884.0,2022-03|FRA,Revenue,,2022-03-01,FRA
3,2022-04-06 02:27:59,2949581.23,3346152.0,2022-02|FRA,Revenue,,2022-02-01,FRA
4,2022-04-06 02:27:59,2137978.6,316228.4,2022-03|DMK,Revenue,,2022-03-01,DMK


+---------------------------------+--------+-------------+-----------------------+
|             Column              | Status |    Type     |        Method         |
| NetSuite Extract DateTime       | 1      | datetime    | Datetime Perturbation |
+---------------------------------+--------+-------------+-----------------------+
| Transaction Line Amount - Local | 0      | numeric     |                       |
+---------------------------------+--------+-------------+-----------------------+
| Transaction Line Amount - USD   | 0      | numeric     |                       |
+---------------------------------+--------+-------------+-----------------------+
| %DSOKey                         | 0      | categorical |                       |
+---------------------------------+--------+-------------+-----------------------+
| DSO Amount Type                 | 0      | categorical |                       |
+---------------------------------+--------+-------------+-----------------------+
| Fr

Unnamed: 0,NetSuite Extract DateTime_orig,Transaction Line Amount - Local_orig,Transaction Line Amount - USD_orig,%DSOKey_orig,DSO Amount Type_orig,From Currency Code_orig,period_orig,subsidiary_code_orig,NetSuite Extract DateTime,Transaction Line Amount - Local,Transaction Line Amount - USD,%DSOKey,DSO Amount Type,From Currency Code,period,subsidiary_code
0,2022-04-06 02:27:59,3912778.8,4430557.0,2022-01|FRA,Revenue,,2022-01-01,FRA,2022-11-29 02:27:59,2658519.2,3003704.5,2022-08|f11,Revenue,,2022-08-26,FRA
1,2022-04-06 02:27:59,-15024.3,-16827.22,2022-04|FRA,Revenue,,2022-04-01,FRA,2022-06-08 02:27:59,-60016.2,-61218.1,2022-06|f11,Revenue,,2022-06-03,FRA
2,2022-04-06 02:27:59,3915833.96,4309884.0,2022-03|FRA,Revenue,,2022-03-01,FRA,2022-10-03 02:27:59,2660556.0,2923256.2,2022-08|f11,Revenue,,2022-08-28,FRA
3,2022-04-06 02:27:59,2949581.23,3346152.0,2022-02|FRA,Revenue,,2022-02-01,FRA,2021-06-06 02:27:59,2016387.5,2280768.3,2021-04|f11,Revenue,,2021-04-03,FRA
4,2022-04-06 02:27:59,2137978.6,316228.4,2022-03|DMK,Revenue,,2022-03-01,DMK,2022-04-06 02:27:59,1475319.1,260818.9,2022-03|43f,Revenue,,2022-03-01,DMK


## Invoices

In [182]:
# read original file from gcs
df_invoice = pd.read_csv(source_path+dict_files['Invoices']+'.csv')
date_columns=['Date','Due Date','As Of Date','Rev. Rec. Start Date','Rev. Rec. End Date','Contract Item Start Date','Contract Item End Date']
for column in date_columns:
    df_invoice[column]=pd.to_datetime(df_invoice[column],errors='coerce')

string_columns=['Customer Code','PO Number','%ItemID']
for column in string_columns:
    df_invoice[column]=df_invoice[column].astype(str)

# create 2 columns to store the original values of the columns to be anonymized
df_invoice[['document_id','item_id']]=df_invoice['%InvoiceItemKey'].str.split('|',expand=True,n=1)
print('original dataframe')
display(df_invoice.head())

# read mapping tables from gcs
with fs.open(destination_path+'map_subsidiary_currency_code.pickle', 'rb') as handle:
    map_subsidiary_currency_code = pickle.load(handle)
with fs.open(destination_path+'map_subsidiary_code.pickle', 'rb') as handle:
    map_subsidiary_code = pickle.load(handle)
with fs.open(destination_path+'map_comment_key.pickle', 'rb') as handle:
    map_comment_key = pickle.load(handle)
with fs.open(destination_path+'map_dso_key.pickle', 'rb') as handle:
    map_dso_key = pickle.load(handle)


# anonymize dataframe
anon_invoice=dfAnonymizer(df_invoice)
anon_invoice.column_suppression(['Detail URL','Customer URL','%SummaryKey','Project Name','Credit Limit'])
anon_invoice.datetime_noise(date_columns,seed=seed)

anon_invoice.categorical_tokenization(['PO Number'],max_token_len=10,key=key)
anon_invoice.categorical_resampling(['Country Code'],seed=seed)
anon_invoice.info()

df_invoice_anon=anon_invoice.to_df()
value_columns=[
    'Temp Transaction Amount',
    'Temp Amount Due (Foreign Currency)',
    'Open Balance',
    'Amount Due (Foreign Currency)',
    'Transaction Amount',
    'Remaining (m)',
    'Recognized Balance',
    'Remaining Deferred Balance',
    'Tax Value',
    'Recognized Balance (Foreign Currency)',
    'Remaining Deferred Balance (Foreign Currency)',
    'Tax Value (Foreign Currency)',
    'Recognized Balance (Local)',
    'Remaining Deferred Balance (Local)',
    'Tax Value (Local)'
    ]
for column in value_columns:
    df_invoice_anon[column]=noise_amount_column(df_invoice_anon[column])

# create fake data
df_invoice_anon['Customer Name']=fake_data_for_column(df_invoice_anon['Customer Name'],'company')
df_invoice_anon['Sales Rep Name']=fake_data_for_column(df_invoice_anon['Sales Rep Name'],'name')
df_invoice_anon['Accounts Receivable Accountant']=fake_data_for_column(df_invoice_anon['Accounts Receivable Accountant'],'name')

# anonymize %InvoiceItemKey and delete support fields
df_invoice_anon['document_id']=sequencial_values_for_column(df_invoice_anon['document_id'])
df_invoice_anon['%InvoiceItemKey']=df_invoice_anon['document_id'].astype(str)+'|'+df_invoice_anon['item_id'].astype(str)
df_invoice_anon=df_invoice_anon.drop(columns=['document_id','item_id'])
df_invoice=df_invoice.drop(columns=['document_id','item_id'])

# replace values with mapped values
df_invoice_anon['Transaction Currency']=df_invoice_anon['Transaction Currency'].map(map_subsidiary_currency_code)
df_invoice_anon['%ARCommentKey']=df_invoice_anon['%ARCommentKey'].map(map_comment_key)
df_invoice_anon['%DSOKey']=df_invoice_anon['%DSOKey'].map(map_dso_key)

# scramble values in 'Customer Code' and 'End User Code'
df_invoice_anon['Customer Code']=scramble_column(df_invoice_anon['Customer Code'].astype('str'))
df_invoice_anon['End User Code']=scramble_column(df_invoice_anon['End User Code'].astype('str'))
df_invoice_anon['Customer Original']=df_invoice_anon['Customer Code']+' '+df_invoice_anon['Customer Name']

# inherit values from other fields
df_invoice_anon['Customer']=df_invoice_anon['Customer Original']
df_invoice_anon['End User']=df_invoice_anon['End User Code']

df_invoice=df_invoice.join(df_invoice_anon,how='inner',lsuffix='_orig')
# display full dataframe
print('full dataframe')
display(df_invoice.head())

#persist anonymized df to GCS
df_invoice_anon.to_csv(destination_path+dict_files['Invoices']+'.csv',index=False)

# persist mapping tables to GCS
map_invoice_key=dict(zip(df_invoice['%InvoiceItemKey_orig'], df_invoice['%InvoiceItemKey']))
with fs.open(destination_path+'map_invoice_key.pickle', 'wb') as handle:
    pickle.dump(map_invoice_key, handle, protocol=pickle.HIGHEST_PROTOCOL)

#drop variables
map_subsidiary_currency_code=map_invoice_key=map_subsidiary_code=map_comment_key=map_dso_key=df_invoice=df_invoice_anon=anon_invoice=None

  df_invoice = pd.read_csv(source_path+'AR_Invoices.csv')


original dataframe


Unnamed: 0,Is Missing Required PO,Detail URL,Customer URL,Collection Group,%SummaryKey,Document ID,%ItemID,%ARCommentKey,Due Date,Date,As Of Date,Age,Age - Month End,Age - Quarter End,Age - Year End,Is Overdue,Is Overdue Over 30 Days,Is Overdue Over 60 Days,Is Overdue Over 90 Days,Customer Original,Customer Code,Customer Name,Transaction Type,Payment Terms on Account,Sales Rep Name,Supplier Portal,TempPriority,PO Number,Project Name,Temp Transaction Amount,Temp Amount Due (Foreign Currency),Open Balance,Transaction Currency,End User Code,Rev. Rec. Start Date,Rev. Rec. End Date,Contract Item Start Date,Contract Item End Date,Contract Item Term,Contract Term (m),Contract Term,%DSOKey,Customer,End User,Accounts Receivable Accountant,Aging Bucket,Forecast Aging Bucket - EOM,Forecast Aging Bucket - EOQ,Forecast Aging Bucket - EOY,Is Overdue Month End,Is Overdue Over 30 Days Month End,Is Overdue Over 60 Days Month End,Is Overdue Over 90 Days Month End,Is Month End,Payment Terms on Invoice,Is Missing AR Comments,Amount Due (Foreign Currency),Transaction Amount,Remaining (m),Recognized Balance,Remaining Deferred Balance,Tax Value,Recognized Balance (Foreign Currency),Remaining Deferred Balance (Foreign Currency),Tax Value (Foreign Currency),Recognized Balance (Local),Remaining Deferred Balance (Local),Tax Value (Local),Channel Tier,Country Code,Dedicated Account Rep,Is Dedicated Account,PO Required,Credit Status,Credit Limit,SFDC Account Record Type,%InvoiceItemKey,document_id,item_id
0,No,https://system.na1.netsuite.com/app/accounting/transactions/custinvc.nl?id=37506267,https://system.na1.netsuite.com/app/common/entity/custjob.nl?id=463544,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004586,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-01-31,2019-01-01,2019-01-31,0,0,59,334,No,No,No,No,230569 PT. Evotech Distribusi,230569,PT. Evotech Distribusi,Invoice,Net 30 Days,"Ma, Simon E05406",No,,,,294840,294840,2975.79,USD,335119.0,2019-02-01,2019-11-30,2019-02-01,2019-11-30,10,10,10.0,2019-01|SIN,230569 PT. Evotech Distribusi,335119 PT. Dipo Star Finance,No Accounts Receivable Accountant,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,2948.4,2948.4,10.0,0.0,2975.79,0.0,0.0,2948.4,0.0,0.0,4020.73308,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004586|2171,INVSING00004586,2171
1,No,https://system.na1.netsuite.com/app/accounting/transactions/custinvc.nl?id=37506268,https://system.na1.netsuite.com/app/common/entity/custjob.nl?id=463544,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004587,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-01-31,2019-01-01,2019-01-31,0,0,59,334,No,No,No,No,230569 PT. Evotech Distribusi,230569,PT. Evotech Distribusi,Invoice,Net 30 Days,"Ma, Simon E05406",No,,,,23090,23090,233.04,USD,240598.0,2019-02-01,2019-03-31,2019-02-01,2019-03-31,2,2,2.0,2019-01|SIN,230569 PT. Evotech Distribusi,240598 dexa group,No Accounts Receivable Accountant,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,230.9,230.9,2.0,0.0,233.04,0.0,0.0,230.9,0.0,0.0,314.87833,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004587|2171,INVSING00004587,2171
2,No,https://system.na1.netsuite.com/app/accounting/transactions/custinvc.nl?id=39352621,https://system.na1.netsuite.com/app/common/entity/custjob.nl?id=463544,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004620,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-02-27,2019-01-28,2019-01-31,-27,-27,32,307,No,No,No,No,230569 PT. Evotech Distribusi,230569,PT. Evotech Distribusi,Invoice,Net 30 Days,"Ma, Simon E05406",No,,,,295910,295910,2966.32,USD,251416.0,2015-11-01,2019-10-31,2016-03-01,2019-10-31,44,44,48.0,2019-01|SIN,230569 PT. Evotech Distribusi,251416 Agel Langgeng,No Accounts Receivable Accountant,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,2959.1,2959.1,9.0,2359.572727,606.747273,0.0,2353.8295454545,605.270455,0.0,3188.732885,819.959885,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004620|2171,INVSING00004620,2171
3,No,https://system.na1.netsuite.com/app/accounting/transactions/custinvc.nl?id=39352621,https://system.na1.netsuite.com/app/common/entity/custjob.nl?id=463544,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004620,10061.0,1/31/2019|230569 PT. Evotech Distribusi,2019-02-27,2019-01-28,2019-01-31,-27,-27,32,307,No,No,No,No,230569 PT. Evotech Distribusi,230569,PT. Evotech Distribusi,Invoice,Net 30 Days,"Ma, Simon E05406",No,,,,222448,222448,2229.9,USD,251416.0,2015-11-01,2019-10-31,2015-11-01,NaT,0,1,48.0,2019-01|SIN,230569 PT. Evotech Distribusi,251416 Agel Langgeng,No Accounts Receivable Accountant,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,2224.48,2224.48,,2229.9,0.0,0.0,2224.48,0.0,0.0,3013.503056,0.0,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004620|10061,INVSING00004620,10061
4,No,https://system.na1.netsuite.com/app/accounting/transactions/custinvc.nl?id=39352621,https://system.na1.netsuite.com/app/common/entity/custjob.nl?id=463544,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004620,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-02-27,2019-01-28,2019-01-31,-27,-27,32,307,No,No,No,No,230569 PT. Evotech Distribusi,230569,PT. Evotech Distribusi,Invoice,Net 30 Days,"Ma, Simon E05406",No,,,,593880,593880,5953.28,USD,251416.0,2015-11-01,2019-10-31,2015-11-01,2019-10-31,48,48,48.0,2019-01|SIN,230569 PT. Evotech Distribusi,251416 Agel Langgeng,No Accounts Receivable Accountant,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,5938.8,5938.8,9.0,4837.04,1116.24,0.0,4825.275,1113.525,0.0,6536.800043,1508.492317,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004620|2171,INVSING00004620,2171


+-----------------------------------------------+--------+-------------+-----------------------+
|                    Column                     | Status |    Type     |        Method         |
| Is Missing Required PO                        | 0      | categorical |                       |
+-----------------------------------------------+--------+-------------+-----------------------+
| Detail URL                                    | 1      | categorical | Column Suppression    |
+-----------------------------------------------+--------+-------------+-----------------------+
| Customer URL                                  | 1      | categorical | Column Suppression    |
+-----------------------------------------------+--------+-------------+-----------------------+
| Collection Group                              | 0      | categorical |                       |
+-----------------------------------------------+--------+-------------+-----------------------+
| %SummaryKey                 

since Python 3.9 and will be removed in a subsequent version.
  shuffle(scrambled_str,return_number)


full dataframe


Unnamed: 0,Is Missing Required PO_orig,Detail URL,Customer URL,Collection Group_orig,%SummaryKey,Document ID_orig,%ItemID_orig,%ARCommentKey_orig,Due Date_orig,Date_orig,As Of Date_orig,Age_orig,Age - Month End_orig,Age - Quarter End_orig,Age - Year End_orig,Is Overdue_orig,Is Overdue Over 30 Days_orig,Is Overdue Over 60 Days_orig,Is Overdue Over 90 Days_orig,Customer Original_orig,Customer Code_orig,Customer Name_orig,Transaction Type_orig,Payment Terms on Account_orig,Sales Rep Name_orig,Supplier Portal_orig,TempPriority_orig,PO Number_orig,Project Name,Temp Transaction Amount_orig,Temp Amount Due (Foreign Currency)_orig,Open Balance_orig,Transaction Currency_orig,End User Code_orig,Rev. Rec. Start Date_orig,Rev. Rec. End Date_orig,Contract Item Start Date_orig,Contract Item End Date_orig,Contract Item Term_orig,Contract Term (m)_orig,Contract Term_orig,%DSOKey_orig,Customer_orig,End User_orig,Accounts Receivable Accountant_orig,Aging Bucket_orig,Forecast Aging Bucket - EOM_orig,Forecast Aging Bucket - EOQ_orig,Forecast Aging Bucket - EOY_orig,Is Overdue Month End_orig,Is Overdue Over 30 Days Month End_orig,Is Overdue Over 60 Days Month End_orig,Is Overdue Over 90 Days Month End_orig,Is Month End_orig,Payment Terms on Invoice_orig,Is Missing AR Comments_orig,Amount Due (Foreign Currency)_orig,Transaction Amount_orig,Remaining (m)_orig,Recognized Balance_orig,Remaining Deferred Balance_orig,Tax Value_orig,Recognized Balance (Foreign Currency)_orig,Remaining Deferred Balance (Foreign Currency)_orig,Tax Value (Foreign Currency)_orig,Recognized Balance (Local)_orig,Remaining Deferred Balance (Local)_orig,Tax Value (Local)_orig,Channel Tier_orig,Country Code_orig,Dedicated Account Rep_orig,Is Dedicated Account_orig,PO Required_orig,Credit Status_orig,Credit Limit,SFDC Account Record Type_orig,%InvoiceItemKey_orig,Is Missing Required PO,Collection Group,Document ID,%ItemID,%ARCommentKey,Due Date,Date,As Of Date,Age,Age - Month End,Age - Quarter End,Age - Year End,Is Overdue,Is Overdue Over 30 Days,Is Overdue Over 60 Days,Is Overdue Over 90 Days,Customer Original,Customer Code,Customer Name,Transaction Type,Payment Terms on Account,Sales Rep Name,Supplier Portal,TempPriority,PO Number,Temp Transaction Amount,Temp Amount Due (Foreign Currency),Open Balance,Transaction Currency,End User Code,Rev. Rec. Start Date,Rev. Rec. End Date,Contract Item Start Date,Contract Item End Date,Contract Item Term,Contract Term (m),Contract Term,%DSOKey,Customer,End User,Accounts Receivable Accountant,Aging Bucket,Forecast Aging Bucket - EOM,Forecast Aging Bucket - EOQ,Forecast Aging Bucket - EOY,Is Overdue Month End,Is Overdue Over 30 Days Month End,Is Overdue Over 60 Days Month End,Is Overdue Over 90 Days Month End,Is Month End,Payment Terms on Invoice,Is Missing AR Comments,Amount Due (Foreign Currency),Transaction Amount,Remaining (m),Recognized Balance,Remaining Deferred Balance,Tax Value,Recognized Balance (Foreign Currency),Remaining Deferred Balance (Foreign Currency),Tax Value (Foreign Currency),Recognized Balance (Local),Remaining Deferred Balance (Local),Tax Value (Local),Channel Tier,Country Code,Dedicated Account Rep,Is Dedicated Account,PO Required,Credit Status,SFDC Account Record Type,%InvoiceItemKey
0,No,https://system.na1.netsuite.com/app/accounting/transactions/custinvc.nl?id=37506267,https://system.na1.netsuite.com/app/common/entity/custjob.nl?id=463544,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004586,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-01-31,2019-01-01,2019-01-31,0,0,59,334,No,No,No,No,230569 PT. Evotech Distribusi,230569,PT. Evotech Distribusi,Invoice,Net 30 Days,"Ma, Simon E05406",No,,,,294840,294840,2975.79,USD,335119.0,2019-02-01,2019-11-30,2019-02-01,2019-11-30,10,10,10.0,2019-01|SIN,230569 PT. Evotech Distribusi,335119 PT. Dipo Star Finance,No Accounts Receivable Accountant,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,2948.4,2948.4,10.0,0.0,2975.79,0.0,0.0,2948.4,0.0,0.0,4020.73308,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004586|2171,No,Reseller,INVSING00004586,2171.0,d3af200e5a,2019-09-25,2019-08-26,2019-09-25,0,0,59,334,No,No,No,No,502693 Hodges and Sons,502693,Hodges and Sons,Invoice,Net 30 Days,Amy Wolf,No,,33dabf0380,246560.0,246560.0,51983.9,GBP,1.319305,2019-09-26,2020-07-24,2019-09-26,2020-07-24,10,10,10.0,2018-10|1d3,502693 Hodges and Sons,1.319305,Amy Wolf,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,51965.6,51965.6,50006.7,50000.0,51983.9,50000.0,50000.0,51965.6,50000.0,50000.0,52680.5,50000.0,Reseller,DE,,No,No,Approved,Partner Account,144257|2171
1,No,https://system.na1.netsuite.com/app/accounting/transactions/custinvc.nl?id=37506268,https://system.na1.netsuite.com/app/common/entity/custjob.nl?id=463544,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004587,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-01-31,2019-01-01,2019-01-31,0,0,59,334,No,No,No,No,230569 PT. Evotech Distribusi,230569,PT. Evotech Distribusi,Invoice,Net 30 Days,"Ma, Simon E05406",No,,,,23090,23090,233.04,USD,240598.0,2019-02-01,2019-03-31,2019-02-01,2019-03-31,2,2,2.0,2019-01|SIN,230569 PT. Evotech Distribusi,240598 dexa group,No Accounts Receivable Accountant,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,230.9,230.9,2.0,0.0,233.04,0.0,0.0,230.9,0.0,0.0,314.87833,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004587|2171,No,Reseller,INVSING00004587,2171.0,d3af200e5a,2019-04-04,2019-03-05,2019-04-04,0,0,59,334,No,No,No,No,502693 Hodges and Sons,502693,Hodges and Sons,Invoice,Net 30 Days,Amy Wolf,No,,33dabf0380,65393.3,65393.3,50155.4,GBP,5.2984,2019-04-05,2019-06-02,2019-04-05,2019-06-02,2,2,2.0,2018-10|1d3,502693 Hodges and Sons,5.2984,Amy Wolf,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,50153.9,50153.9,50001.3,50000.0,50155.4,50000.0,50000.0,50153.9,50000.0,50000.0,50209.9,50000.0,Reseller,DE,,No,No,Approved,Partner Account,139585|2171
2,No,https://system.na1.netsuite.com/app/accounting/transactions/custinvc.nl?id=39352621,https://system.na1.netsuite.com/app/common/entity/custjob.nl?id=463544,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004620,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-02-27,2019-01-28,2019-01-31,-27,-27,32,307,No,No,No,No,230569 PT. Evotech Distribusi,230569,PT. Evotech Distribusi,Invoice,Net 30 Days,"Ma, Simon E05406",No,,,,295910,295910,2966.32,USD,251416.0,2015-11-01,2019-10-31,2016-03-01,2019-10-31,44,44,48.0,2019-01|SIN,230569 PT. Evotech Distribusi,251416 Agel Langgeng,No Accounts Receivable Accountant,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,2959.1,2959.1,9.0,2359.572727,606.747273,0.0,2353.8295454545,605.270455,0.0,3188.732885,819.959885,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004620|2171,No,Reseller,INVSING00004620,2171.0,d3af200e5a,2019-08-26,2019-07-27,2019-07-30,-27,-27,32,307,No,No,No,No,502693 Hodges and Sons,502693,Hodges and Sons,Invoice,Net 30 Days,Amy Wolf,No,,33dabf0380,247273.3,247273.3,51977.5,GBP,4.216501,2016-04-29,2020-04-28,2016-08-28,2020-04-28,44,44,48.0,2018-10|1d3,502693 Hodges and Sons,4.216501,Amy Wolf,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,51972.7,51972.7,50006.0,51573.0,50404.5,50000.0,51569.2,50403.5,50000.0,52125.8,50546.6,50000.0,Reseller,US,,No,No,Approved,Partner Account,118660|2171
3,No,https://system.na1.netsuite.com/app/accounting/transactions/custinvc.nl?id=39352621,https://system.na1.netsuite.com/app/common/entity/custjob.nl?id=463544,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004620,10061.0,1/31/2019|230569 PT. Evotech Distribusi,2019-02-27,2019-01-28,2019-01-31,-27,-27,32,307,No,No,No,No,230569 PT. Evotech Distribusi,230569,PT. Evotech Distribusi,Invoice,Net 30 Days,"Ma, Simon E05406",No,,,,222448,222448,2229.9,USD,251416.0,2015-11-01,2019-10-31,2015-11-01,NaT,0,1,48.0,2019-01|SIN,230569 PT. Evotech Distribusi,251416 Agel Langgeng,No Accounts Receivable Accountant,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,2224.48,2224.48,,2229.9,0.0,0.0,2224.48,0.0,0.0,3013.503056,0.0,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004620|10061,No,Reseller,INVSING00004620,10061.0,d3af200e5a,2018-04-28,2018-03-29,2018-04-01,-27,-27,32,307,No,No,No,No,502693 Hodges and Sons,502693,Hodges and Sons,Invoice,Net 30 Days,Amy Wolf,No,,33dabf0380,198298.7,198298.7,51486.6,GBP,4.216501,2014-12-31,2018-12-30,2014-12-31,NaT,0,1,48.0,2018-10|1d3,502693 Hodges and Sons,4.216501,Amy Wolf,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,51483.0,51483.0,,51486.6,50000.0,50000.0,51483.0,50000.0,50000.0,52009.0,50000.0,50000.0,Reseller,ES,,No,No,Approved,Partner Account,118660|10061
4,No,https://system.na1.netsuite.com/app/accounting/transactions/custinvc.nl?id=39352621,https://system.na1.netsuite.com/app/common/entity/custjob.nl?id=463544,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004620,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-02-27,2019-01-28,2019-01-31,-27,-27,32,307,No,No,No,No,230569 PT. Evotech Distribusi,230569,PT. Evotech Distribusi,Invoice,Net 30 Days,"Ma, Simon E05406",No,,,,593880,593880,5953.28,USD,251416.0,2015-11-01,2019-10-31,2015-11-01,2019-10-31,48,48,48.0,2019-01|SIN,230569 PT. Evotech Distribusi,251416 Agel Langgeng,No Accounts Receivable Accountant,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,5938.8,5938.8,9.0,4837.04,1116.24,0.0,4825.275,1113.525,0.0,6536.800043,1508.492317,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004620|2171,No,Reseller,INVSING00004620,2171.0,d3af200e5a,2019-03-02,2019-01-31,2019-02-03,-27,-27,32,307,No,No,No,No,502693 Hodges and Sons,502693,Hodges and Sons,Invoice,Net 30 Days,Amy Wolf,No,,33dabf0380,445920.0,445920.0,53968.9,GBP,4.216501,2015-11-04,2019-11-03,2015-11-04,2019-11-03,48,48,48.0,2018-10|1d3,502693 Hodges and Sons,4.216501,Amy Wolf,Not Due,Not Due,31-60,180+,No,No,No,No,Yes,30,Yes,53959.2,53959.2,50006.0,53224.7,50744.2,50000.0,53216.8,50742.3,50000.0,54357.9,51005.7,50000.0,Reseller,IT,,No,No,Approved,Partner Account,118660|2171


## Items

In [184]:
# load original file from gcs
df_items=pd.read_csv(source_path+dict_files['Items']+'.csv')
string_columns=['%ItemID']
for column in string_columns:
    df_items[column]=df_items[column].astype(str)

display(df_items.head())

# anonymize dataframe
anon_items=dfAnonymizer(df_items)
anon_items.categorical_resampling(
    ['Item Type','Product Family'],seed=seed)

anon_items.info()

df_items_anon=anon_items.to_df()

# create fake data
categorical_fake_dict={'Item Name':'color_name','Bookings Group 1':'word','Bookings Group 2':'currency_name','Bookings Group 3':'job'}
for key,value in categorical_fake_dict.items():
    df_items_anon[key]=fake_data_for_column(df_items_anon[key],value)

# inherit values from anonymized fields
df_items_anon['Item Description']=df_items_anon['Item Name']
df_items_anon['Item']=df_items_anon['Item Name']
df_items_anon['Product Family']=df_items_anon['Product Family'].replace('Qonnect Fees','Misc',inplace=False)

# merge anonymized df with original df
df_items=df_items.join(df_items_anon,how='inner',lsuffix='_orig')
display(df_items.head())

# persist anonymized df to GCS
df_items_anon.to_csv(destination_path+dict_files['Items']+'.csv',index=False)

# drop variables
df_items=df_items_anon=anon_items=[]

Unnamed: 0,%ItemID,Item Name,Item Description,Item,Item Type,Product Family,Bookings Group 1,Bookings Group 2,Bookings Group 3,Is AMP,Is ARR Item,Is ACV Calc,Is Non-Booking Invoicing
0,9297,No Tax,No Tax,No Tax No Tax,Sales Tax Item,,,,,No,No,No,No
1,9310,8030,OEM-Qlik Sense Test Site,8030 OEM-Qlik Sense Test Site,Non-inventory Item,Licenses,Non-Renewal,License,Perpetual Licenses,No,No,No,No
2,9328,8035,OEM - Qlik Sense Enterprise Test Site Limited 10,8035 OEM - Qlik Sense Enterprise Test Site Limited 10,Non-inventory Item,Licenses,Non-Renewal,License,Perpetual Licenses,No,No,No,No
3,9329,8040,OEM-Qlik Sense Enterprise Development Site,8040 OEM-Qlik Sense Enterprise Development Site,Non-inventory Item,Licenses,Non-Renewal,License,Perpetual Licenses,No,No,No,No
4,9331,8045,Upgrade Qlik Sense User Model Test Site to Production Capaci,8045 Upgrade Qlik Sense User Model Test Site to Production Capaci,Non-inventory Item,Licenses,Non-Renewal,License,Perpetual Licenses,No,No,No,No


+--------------------------+--------+-------------+------------+
|          Column          | Status |    Type     |   Method   |
| %ItemID                  | 0      | categorical |            |
+--------------------------+--------+-------------+------------+
| Item Name                | 0      | categorical |            |
+--------------------------+--------+-------------+------------+
| Item Description         | 0      | categorical |            |
+--------------------------+--------+-------------+------------+
| Item                     | 0      | categorical |            |
+--------------------------+--------+-------------+------------+
| Item Type                | 1      | categorical | Resampling |
+--------------------------+--------+-------------+------------+
| Product Family           | 1      | categorical | Resampling |
+--------------------------+--------+-------------+------------+
| Bookings Group 1         | 0      | categorical |            |
+------------------------

Unnamed: 0,%ItemID_orig,Item Name_orig,Item Description_orig,Item_orig,Item Type_orig,Product Family_orig,Bookings Group 1_orig,Bookings Group 2_orig,Bookings Group 3_orig,Is AMP_orig,Is ARR Item_orig,Is ACV Calc_orig,Is Non-Booking Invoicing_orig,%ItemID,Item Name,Item Description,Item,Item Type,Product Family,Bookings Group 1,Bookings Group 2,Bookings Group 3,Is AMP,Is ARR Item,Is ACV Calc,Is Non-Booking Invoicing
0,9297,No Tax,No Tax,No Tax No Tax,Sales Tax Item,,,,,No,No,No,No,9297,Chocolate,Chocolate,Chocolate,Non-inventory Item,Subscription,style,Burundian franc,Barrister's clerk,No,No,No,No
1,9310,8030,OEM-Qlik Sense Test Site,8030 OEM-Qlik Sense Test Site,Non-inventory Item,Licenses,Non-Renewal,License,Perpetual Licenses,No,No,No,No,9310,Gray,Gray,Gray,Non-inventory Item,Licenses,without,Gibraltar pound,"Engineer, civil (consulting)",No,No,No,No
2,9328,8035,OEM - Qlik Sense Enterprise Test Site Limited 10,8035 OEM - Qlik Sense Enterprise Test Site Limited 10,Non-inventory Item,Licenses,Non-Renewal,License,Perpetual Licenses,No,No,No,No,9328,DarkGoldenRod,DarkGoldenRod,DarkGoldenRod,Non-inventory Item,Licenses,without,Gibraltar pound,"Engineer, civil (consulting)",No,No,No,No
3,9329,8040,OEM-Qlik Sense Enterprise Development Site,8040 OEM-Qlik Sense Enterprise Development Site,Non-inventory Item,Licenses,Non-Renewal,License,Perpetual Licenses,No,No,No,No,9329,OldLace,OldLace,OldLace,Non-inventory Item,Subscription,without,Gibraltar pound,"Engineer, civil (consulting)",No,No,No,No
4,9331,8045,Upgrade Qlik Sense User Model Test Site to Production Capaci,8045 Upgrade Qlik Sense User Model Test Site to Production Capaci,Non-inventory Item,Licenses,Non-Renewal,License,Perpetual Licenses,No,No,No,No,9331,DodgerBlue,DodgerBlue,DodgerBlue,Non-inventory Item,Consulting,without,Gibraltar pound,"Engineer, civil (consulting)",No,No,No,No


## Link Table

In [185]:
# read data from GCS
df_link=pd.read_csv(source_path+dict_files['Link Table']+'.csv')
print('original dataframe')
display(df_link.head())

# read mapping tables from GCS
with fs.open(destination_path+'map_dso_key.pickle', 'rb') as handle:
    map_dso_key = pickle.load(handle)

with fs.open(destination_path+'map_subsidiary_code.pickle', 'rb') as handle:
    map_subsidiary_code = pickle.load(handle)

# anonymize dataframe
anon_link=dfAnonymizer(df_link)
anon_link.info()

df_link_anon=anon_link.to_df()

# replace values with mapped values
df_link_anon['%DSOKey']=df_link_anon['%DSOKey'].map(map_dso_key)
df_link_anon['%SubsidiaryCode']=df_link_anon['%SubsidiaryCode'].map(map_subsidiary_code)
df_link_anon['Period']=df_link_anon['%DSOKey'].str.split('|',expand=True)[0]

# merge anonymized df with original df
df_link=df_link.join(df_link_anon,how='inner',lsuffix='_orig')
display(df_link.head())

# persist anonymized df to GCS
df_link_anon.to_csv(destination_path+dict_files['Link Table']+'.csv',index=False)

df_link=df_link_anon=anon_link=map_dso_key=map_subsidiary_code=None

original dataframe


Unnamed: 0,%DSOKey,%SubsidiaryCode,Period
0,2021-09|IND,IND,2021-09
1,2021-09|KOR,KOR,2021-09
2,2022-02|MEX,MEX,2022-02
3,2022-04|SWZ,SWZ,2022-04
4,2021-09|CHN,CHN,2021-09


+-----------------+--------+-------------+--------+
|     Column      | Status |    Type     | Method |
| %DSOKey         | 0      | categorical |        |
+-----------------+--------+-------------+--------+
| %SubsidiaryCode | 0      | categorical |        |
+-----------------+--------+-------------+--------+
| Period          | 0      | categorical |        |
+-----------------+--------+-------------+--------+


Unnamed: 0,%DSOKey_orig,%SubsidiaryCode_orig,Period_orig,%DSOKey,%SubsidiaryCode,Period
0,2021-09|IND,IND,2021-09,2020-12|861,861,2020-12
1,2021-09|KOR,KOR,2021-09,2021-03|dc7,dc7,2021-03
2,2022-02|MEX,MEX,2022-02,2022-09|a86,a86,2022-09
3,2022-04|SWZ,SWZ,2022-04,2021-12|03a,03a,2021-12
4,2021-09|CHN,CHN,2021-09,2021-01|8fd,8fd,2021-01


## Product Lines

In [186]:
# read data from GCS
df_product_lines=pd.read_csv(source_path+dict_files['Product Lines']+'.csv')
print('original dataframe')
display(df_product_lines.head())

# anonymize dataframe
anon_product_lines=dfAnonymizer(df_product_lines)
anon_product_lines.info()

df_product_lines_anon=anon_product_lines.to_df()

# create fake data
categorical_fake_dict={'Product Line 3':'state','Product Line 2':'free_email_domain'}
for key,value in categorical_fake_dict.items():
    df_product_lines_anon[key]=fake_data_for_column(df_product_lines_anon[key],value)

map_product_line1={
    'Analytics':'Go to Market','Data Integration':'Operations','Non-Product Invoicing':'Others','Professional Services' : 'Support functions'}
df_product_lines_anon['Product Line 1']=df_product_lines_anon['Product Line 1'].map(map_product_line1)

# merge anonymized df with original df
df_product_lines=df_product_lines.join(df_product_lines_anon,how='inner',lsuffix='_orig')
display(df_product_lines.head())

# persist anonymized df to GCS
df_product_lines_anon.to_csv(destination_path+dict_files['Product Lines']+'.csv',index=False)

# persist mapping tables to GCS
map_product_line3=dict(zip(df_product_lines['Product Line 3_orig'], df_product_lines['Product Line 3']))
with fs.open(destination_path+'map_product_line3.pickle', 'wb') as handle:
    pickle.dump(map_product_line3, handle, protocol=pickle.HIGHEST_PROTOCOL)

df_product_lines=df_product_lines_anon=anon_product_lines=map_product_line3=None

original dataframe


Unnamed: 0,Product Line 1,Product Line 2,Product Line 3
0,Analytics,Value Added Products,Qlik AutoML
1,Analytics,Value Added Products,Qlik Analytics Connectors
2,Analytics,Value Added Products,Qlik Insight Advisor Chat
3,Analytics,Value Added Products,Qlik NPrinting
4,Analytics,Value Added Products,Qlik Alerting


+----------------+--------+-------------+--------+
|     Column     | Status |    Type     | Method |
| Product Line 1 | 0      | categorical |        |
+----------------+--------+-------------+--------+
| Product Line 2 | 0      | categorical |        |
+----------------+--------+-------------+--------+
| Product Line 3 | 0      | categorical |        |
+----------------+--------+-------------+--------+


Unnamed: 0,Product Line 1_orig,Product Line 2_orig,Product Line 3_orig,Product Line 1,Product Line 2,Product Line 3
0,Analytics,Value Added Products,Qlik AutoML,Go to Market,gmail.com,Arkansas
1,Analytics,Value Added Products,Qlik Analytics Connectors,Go to Market,gmail.com,Wisconsin
2,Analytics,Value Added Products,Qlik Insight Advisor Chat,Go to Market,gmail.com,Wisconsin
3,Analytics,Value Added Products,Qlik NPrinting,Go to Market,gmail.com,Illinois
4,Analytics,Value Added Products,Qlik Alerting,Go to Market,gmail.com,Colorado


## Invoice Item Detail

In [187]:
# read data from GCS
df_invoice_item=pd.read_csv(source_path+dict_files['Invoice Item Detail']+'.csv')
print('original dataframe')
display(df_invoice_item.head())

# read mapping tables from GCS
with fs.open(destination_path+'map_invoice_key.pickle', 'rb') as handle:
    map_invoice_key = pickle.load(handle)
with fs.open(destination_path+'map_product_line3.pickle', 'rb') as handle:
    map_product_line3 = pickle.load(handle)

#anonymize dataframe
anon_invoice_item=dfAnonymizer(df_invoice_item)
anon_invoice_item.column_suppression(['Created By'])
anon_invoice_item.info()

df_invoice_item_anon=anon_invoice_item.to_df()

df_invoice_item_anon['%InvoiceItemKey']=df_invoice_item_anon['%InvoiceItemKey'].map(map_invoice_key)
df_invoice_item_anon['Product Line 3']=df_invoice_item_anon['Product Line 3'].map(map_product_line3)

# merge anonymized df with original df
df_invoice_item=df_invoice_item.join(df_invoice_item_anon,how='inner',lsuffix='_orig')
display(df_invoice_item.head())

# persist anonymized df to GCS
df_invoice_item_anon.to_csv(destination_path+dict_files['Invoice Item Detail']+'.csv',index=False)

df_invoice_item_anon=df_invoice_item=anon_invoice_item=map_invoice_key=map_product_line3=None

original dataframe


Unnamed: 0,%InvoiceItemKey,Created By,Product Line 3
0,CMAT00000201|499,Singh Pearljit Kaur Randhawa Sarjit C01316,Analytics Maintenance
1,CMAT00000201|2171,Singh Pearljit Kaur Randhawa Sarjit C01316,QlikView Maintenance
2,CMAT00000214|499,Kow Wai Hung C01099,Analytics Maintenance
3,CMAT00000214|2171,Kow Wai Hung C01099,QlikView Maintenance
4,CMAT00000215|499,Kow Wai Hung C01099,Analytics Maintenance


+-----------------+--------+-------------+--------------------+
|     Column      | Status |    Type     |       Method       |
| %InvoiceItemKey | 0      | categorical |                    |
+-----------------+--------+-------------+--------------------+
| Created By      | 1      | categorical | Column Suppression |
+-----------------+--------+-------------+--------------------+
| Product Line 3  | 0      | categorical |                    |
+-----------------+--------+-------------+--------------------+


Unnamed: 0,%InvoiceItemKey_orig,Created By,Product Line 3_orig,%InvoiceItemKey,Product Line 3
0,CMAT00000201|499,Singh Pearljit Kaur Randhawa Sarjit C01316,Analytics Maintenance,268759|499,Nebraska
1,CMAT00000201|2171,Singh Pearljit Kaur Randhawa Sarjit C01316,QlikView Maintenance,268759|2171,Oregon
2,CMAT00000214|499,Kow Wai Hung C01099,Analytics Maintenance,34774|499,Nebraska
3,CMAT00000214|2171,Kow Wai Hung C01099,QlikView Maintenance,34774|2171,Oregon
4,CMAT00000215|499,Kow Wai Hung C01099,Analytics Maintenance,269669|499,Nebraska
