# GOAL

Anonymize data from AR app to allow public sharing.
- AR Comments (OK)
- Countries (N/A)
- DSO (OK)
- ExchangeRates (N/A)
- Invoice Item Detail
- Invoices
- Items
- Link Table
- Product Lines
- Subsidiaries (OK)

# PACKAGES

In [None]:
import pandas as pd
from anonympy.pandas import dfAnonymizer
from anonympy.pandas.utils_pandas import available_methods
from anonympy.pandas.utils_pandas import fake_methods

  from .autonotebook import tqdm as notebook_tqdm


# PARAMETERS

In [None]:
# Folder holding the exported AR CSV files. File names are appended with plain
# string concatenation, so the trailing '/' must be kept.
source_path='/datasets/gdrive/#profissional/Qlik/Content Factory/'

# FUNCTIONS

In [None]:
def noise_amount_column(original_column):
    """Return a distorted copy of an amount column.

    The column is first coerced to float; text values containing the literal
    sequence ".-" have it rewritten to "-0." before the conversion. Every
    value is then scaled by 2/3 and pushed 50000 away from zero (added for
    values >= 0, subtracted otherwise) and rounded to one decimal place.

    Parameters
    ----------
    original_column : pandas.Series
        Numeric values, or strings that parse as floats after the rewrite.

    Returns
    -------
    pandas.Series
        Float series with the same index as the input. NaN stays NaN.

    Raises
    ------
    ValueError
        If a value still cannot be converted to float.
    """
    # The dot must be escaped: with regex=True a bare "." matches ANY
    # character, so scientific notation such as "1e-3" was rewritten to
    # "1-0.3" and the float conversion failed.
    numeric_column = original_column.replace(r"\.-", "-0.", regex=True).astype('float')
    return numeric_column.apply(
        lambda x: round(x*2/3+50000, 1) if x >= 0 else round(x*2/3-50000, 1)
    )

# DATA ANONYMIZATION

## Subsidiaries

In [None]:
# Read the subsidiaries export and cast the NetSuite ID column to str.
df_subsidiaries = pd.read_csv(source_path+'AR_Subsidiaries.csv')
df_subsidiaries = df_subsidiaries.astype({'NetSuite Subsidiary ID': 'str'})
df_subsidiaries.head()

Unnamed: 0,Subsidiary,%SubsidiaryCode,Subsidiary Legal Name,Subsidiary Currency Code,Subsidiary Is Active,NetSuite Subsidiary ID,VAT Registration Number,Is Active,Is Elimination,Subsidiary Region,Workday Subsidiary Name,Is Attunity Subsidiary
0,Qlik Foreign Parent AB - Do Not Use,NA1,Qlik Foreign Parent AB - Do Not Use,SEK,No,54,,No,No,,,
1,Expressor Software Corporation,EXP,Expressor Software Corporation,USD,No,46,,No,No,Americas,Expressor Software Corporation,No
2,Purchase Price Adjustments,PPA,Purchase Price Adjustments,USD,Yes,56,,Yes,No,Technologies,,No
3,QlikTech Holdings Inc.,HOI,QlikTech Holdings Inc.,USD,Yes,13,,Yes,No,Technologies,,Yes
4,QlikTech Belgium,BEL,QlikTech Belgium,EUR,No,5,BE0848691897,No,No,EMEA,,No


In [None]:
# Wrap the subsidiaries frame and apply one anonymization method per column group.
anon_subsidiaries = dfAnonymizer(df_subsidiaries)

anon_subsidiaries.categorical_tokenization('%SubsidiaryCode', max_token_len=3)
anon_subsidiaries.categorical_fake({'Subsidiary': 'company'})
anon_subsidiaries.column_suppression(['Is Attunity Subsidiary', 'VAT Registration Number'])
anon_subsidiaries.categorical_resampling(['Subsidiary Currency Code', 'Subsidiary Region'])

# info() writes its summary table by itself, so it is called bare; wrapping it
# in print() only appends the printed return value after the table.
anon_subsidiaries.info()

df_subsidiaries_anon = anon_subsidiaries.to_df()

# Derive the remaining identifying columns from the already anonymized ones so
# the code and name columns stay consistent with each other.
df_subsidiaries_anon['NetSuite Subsidiary ID'] = df_subsidiaries_anon['%SubsidiaryCode']
df_subsidiaries_anon['Subsidiary Legal Name'] = df_subsidiaries_anon['Subsidiary']
df_subsidiaries_anon['Workday Subsidiary Name'] = df_subsidiaries_anon['Subsidiary']

# Relabel the 'Technologies' region as 'World'.
df_subsidiaries_anon['Subsidiary Region'] = df_subsidiaries_anon['Subsidiary Region'].replace(
    {'Technologies': 'World'}
)

+--------------------------+--------+-------------+--------------------+
|          Column          | Status |    Type     |       Method       |
| Subsidiary               | 1      | categorical | Synthetic Data     |
+--------------------------+--------+-------------+--------------------+
| %SubsidiaryCode          | 1      | categorical | Tokenization       |
+--------------------------+--------+-------------+--------------------+
| Subsidiary Legal Name    | 0      | categorical |                    |
+--------------------------+--------+-------------+--------------------+
| Subsidiary Currency Code | 1      | categorical | Resampling         |
+--------------------------+--------+-------------+--------------------+
| Subsidiary Is Active     | 0      | categorical |                    |
+--------------------------+--------+-------------+--------------------+
| NetSuite Subsidiary ID   | 0      | categorical |                    |
+--------------------------+--------+-------------+

In [None]:
# Put original and anonymized columns side by side (row-index join);
# overlapping original columns receive the '_orig' suffix.
df_subsidiaries = df_subsidiaries.join(
    df_subsidiaries_anon,
    lsuffix='_orig',
    how='inner',
)
df_subsidiaries.head()

Unnamed: 0,Subsidiary_orig,%SubsidiaryCode_orig,Subsidiary Legal Name_orig,Subsidiary Currency Code_orig,Subsidiary Is Active_orig,NetSuite Subsidiary ID_orig,VAT Registration Number,Is Active_orig,Is Elimination_orig,Subsidiary Region_orig,...,Subsidiary,%SubsidiaryCode,Subsidiary Legal Name,Subsidiary Currency Code,Subsidiary Is Active,NetSuite Subsidiary ID,Is Active,Is Elimination,Subsidiary Region,Workday Subsidiary Name
0,Qlik Foreign Parent AB - Do Not Use,NA1,Qlik Foreign Parent AB - Do Not Use,SEK,No,54,,No,No,,...,Garcia Inc,764,Garcia Inc,JPY,No,764,No,No,EMEA,Garcia Inc
1,Expressor Software Corporation,EXP,Expressor Software Corporation,USD,No,46,,No,No,Americas,...,Hall-Harrington,d2c,Hall-Harrington,EUR,No,d2c,No,No,EMEA,Hall-Harrington
2,Purchase Price Adjustments,PPA,Purchase Price Adjustments,USD,Yes,56,,Yes,No,Technologies,...,Jarvis-Duran,23d,Jarvis-Duran,EUR,Yes,23d,Yes,No,EMEA,Jarvis-Duran
3,QlikTech Holdings Inc.,HOI,QlikTech Holdings Inc.,USD,Yes,13,,Yes,No,Technologies,...,Cole Ltd,2ef,Cole Ltd,SGD,Yes,2ef,Yes,No,EMEA,Cole Ltd
4,QlikTech Belgium,BEL,QlikTech Belgium,EUR,No,5,BE0848691897,No,No,EMEA,...,Nelson-Montoya,87b,Nelson-Montoya,USD,No,87b,No,No,APAC,Nelson-Montoya


In [None]:
# Persist the original-vs-anonymized frame; it is read back later to build the
# code and currency mappings.
# NOTE(review): this file holds real values next to their replacements, so it
# acts as a re-identification key - keep it out of anything shared publicly.
df_subsidiaries.to_csv(source_path+'full_subsidiaries.csv', index=False)

# Rebind all three working names to one shared empty list to drop the frames.
df_subsidiaries = anon_subsidiaries = df_subsidiaries_anon = list()

## AR Comments

In [None]:
# Read the first 100000 comment rows and break the composite key
# "<date>|<text>" into its two parts (split on the first '|' only).
df_comments = pd.read_csv(source_path+'AR_Comments.csv', nrows=100000)
df_comments[['comment_date', 'comment_text']] = (
    df_comments['%ARCommentKey'].str.split('|', n=1, expand=True)
)
# Keep only the calendar date of the parsed timestamp.
df_comments['comment_date'] = pd.to_datetime(df_comments['comment_date']).dt.date
df_comments.head()

Unnamed: 0,%ARCommentKey,AR Comments,comment_date,comment_text
0,1/1/2019|230569 PT. Evotech Distribusi,,2019-01-01,230569 PT. Evotech Distribusi
1,1/1/2019|230572 IDW2- IntegraÃ§Ã£o e Desenvolv...,12/31/18-VÃ­ctor- (renewal). End user informed...,2019-01-01,"230572 IDW2- IntegraÃ§Ã£o e Desenvolvimento, Lda."
2,1/1/2019|230578 BUSINESS & DECISION FRANCE,13/12 FM O/S in should be paid by EOM,2019-01-01,230578 BUSINESS & DECISION FRANCE
3,1/1/2019|230582 PT Mitra Integrasi Informatika,20/Aug/18 Vivien: Indomarco MA $1.8K - Invoice...,2019-01-01,230582 PT Mitra Integrasi Informatika
4,1/1/2019|230583 SSL Software Systems LLC,"21/12/2018 JIE//, renewal. Partner conf. wil...",2019-01-01,230583 SSL Software Systems LLC


In [None]:
# Wrap the comments frame and show which columns are still untouched.
anon_comments = dfAnonymizer(df_comments)
anon_comments.info()

+---------------+--------+-------------+--------+
|    Column     | Status |    Type     | Method |
| %ARCommentKey | 0      | categorical |        |
+---------------+--------+-------------+--------+
| AR Comments   | 0      | categorical |        |
+---------------+--------+-------------+--------+
| comment_date  | 0      | categorical |        |
+---------------+--------+-------------+--------+
| comment_text  | 0      | categorical |        |
+---------------+--------+-------------+--------+


In [None]:
# Drop the free-text part of the key, tokenize the key itself and perturb the date.
anon_comments.column_suppression(['comment_text'])
anon_comments.categorical_tokenization(['%ARCommentKey'], max_token_len=10)
anon_comments.datetime_noise('comment_date')

df_comments_anon = anon_comments.to_df()

# Replace the comment body with a flag: 1 when a comment exists, 0 when missing.
df_comments_anon['AR Comments'] = df_comments_anon['AR Comments'].apply(
    lambda value: 1 if pd.notna(value) else 0
)



In [None]:
# Preview original vs anonymized comment columns side by side (not stored).
df_comments.join(
    df_comments_anon,
    lsuffix='_orig',
    how='inner',
)

Unnamed: 0,%ARCommentKey_orig,AR Comments_orig,comment_date_orig,comment_text,%ARCommentKey,AR Comments,comment_date
0,1/1/2019|230569 PT. Evotech Distribusi,,2019-01-01,230569 PT. Evotech Distribusi,af485a2c7d,0,2018-07-05
1,1/1/2019|230572 IDW2- IntegraÃ§Ã£o e Desenvolv...,12/31/18-VÃ­ctor- (renewal). End user informed...,2019-01-01,"230572 IDW2- IntegraÃ§Ã£o e Desenvolvimento, Lda.",b8399ff8b7,1,2019-02-04
2,1/1/2019|230578 BUSINESS & DECISION FRANCE,13/12 FM O/S in should be paid by EOM,2019-01-01,230578 BUSINESS & DECISION FRANCE,75749424f1,1,2019-03-03
3,1/1/2019|230582 PT Mitra Integrasi Informatika,20/Aug/18 Vivien: Indomarco MA $1.8K - Invoice...,2019-01-01,230582 PT Mitra Integrasi Informatika,d91d1c8d6a,1,2018-07-06
4,1/1/2019|230583 SSL Software Systems LLC,"21/12/2018 JIE//, renewal. Partner conf. wil...",2019-01-01,230583 SSL Software Systems LLC,9eff63ed28,1,2019-07-26
...,...,...,...,...,...,...,...
99995,1/15/2021|358948 Jeppesen Systems AB,,2021-01-15,358948 Jeppesen Systems AB,0297d0c625,0,2021-02-17
99996,1/15/2021|359024 StraightForward Methods,11/24/2020 Dana Customer is up to date at this...,2021-01-15,359024 StraightForward Methods,0b01897868,1,2020-03-23
99997,1/15/2021|359085 Inovyo InteligeÌ‚ncia de Merc...,11/24/2020 Francisco- Demand Letter sent (cure...,2021-01-15,359085 Inovyo InteligeÌ‚ncia de Mercado LTDA,9636e1a274,1,2020-06-17
99998,1/15/2021|359096 DBS BANK INDIA LIMITED,Jan. 13 Ning Renewal INVIND202100526: Expect t...,2021-01-15,359096 DBS BANK INDIA LIMITED,a1b181465a,1,2021-02-16


In [None]:
# Rebind both comment frames to one shared empty list to drop the data.
df_comments = df_comments_anon = list()

## DSO

In [None]:
# Read the DSO export and split the composite key "<period>|<subsidiary code>".
df_dso = pd.read_csv(source_path+'AR_DSO.csv')
df_dso[['period', 'subsidiary_code']] = df_dso['%DSOKey'].str.split('|', expand=True)

# Parse both date-like columns into datetimes.
df_dso = df_dso.assign(**{
    'NetSuite Extract DateTime': pd.to_datetime(df_dso['NetSuite Extract DateTime']),
    'period': pd.to_datetime(df_dso['period']),
})
df_dso.head()

Unnamed: 0,NetSuite Extract DateTime,Transaction Line Amount - Local,Transaction Line Amount - USD,%DSOKey,DSO Amount Type,From Currency Code,period,subsidiary_code
0,2022-04-06 02:27:59,3912778.8,4430557.0,2022-01|FRA,Revenue,,2022-01-01,FRA
1,2022-04-06 02:27:59,-15024.3,-16827.22,2022-04|FRA,Revenue,,2022-04-01,FRA
2,2022-04-06 02:27:59,3915833.96,4309884.0,2022-03|FRA,Revenue,,2022-03-01,FRA
3,2022-04-06 02:27:59,2949581.23,3346152.0,2022-02|FRA,Revenue,,2022-02-01,FRA
4,2022-04-06 02:27:59,2137978.6,316228.4,2022-03|DMK,Revenue,,2022-03-01,DMK


In [None]:
# Reload the original/anonymized subsidiary pairs saved earlier and keep only
# the columns needed for the lookups.
df_full_subsidiaries = pd.read_csv(source_path+'full_subsidiaries.csv')
df_full_subsidiaries = df_full_subsidiaries[[
    '%SubsidiaryCode_orig',
    'Subsidiary Currency Code_orig',
    '%SubsidiaryCode',
    'Subsidiary Currency Code',
]]

# original subsidiary code -> anonymized subsidiary code
map_subsidiary_code = {
    original: anonymized
    for original, anonymized in zip(
        df_full_subsidiaries['%SubsidiaryCode_orig'],
        df_full_subsidiaries['%SubsidiaryCode'],
    )
}

# original currency code -> anonymized currency code.
# NOTE(review): when an original currency appears on several rows, only the
# pairing from the last row is kept - confirm this is the intended mapping.
map_subsidiary_currency_code = {
    original: anonymized
    for original, anonymized in zip(
        df_full_subsidiaries['Subsidiary Currency Code_orig'],
        df_full_subsidiaries['Subsidiary Currency Code'],
    )
}

# Drop the lookup frame.
df_full_subsidiaries = list()

In [None]:
# Perturb both datetime columns through the anonymizer, then report the status.
anon_dso = dfAnonymizer(df_dso)
anon_dso.datetime_noise(['NetSuite Extract DateTime', 'period'])
anon_dso.info()

df_dso_anon = anon_dso.to_df()

# Distort the monetary columns.
df_dso_anon['Transaction Line Amount - Local'] = (
    df_dso_anon['Transaction Line Amount - Local'].pipe(noise_amount_column)
)
df_dso_anon['Transaction Line Amount - USD'] = (
    df_dso_anon['Transaction Line Amount - USD'].pipe(noise_amount_column)
)

# Rebuild the composite key from the perturbed period and the anonymized
# subsidiary code.
df_dso_anon['%DSOKey'] = (
    df_dso_anon['period'].dt.strftime("%Y-%m")
    + '|'
    + df_dso_anon['subsidiary_code'].map(map_subsidiary_code)
)

# Translate currency codes with the same mapping used for subsidiaries.
df_dso_anon['From Currency Code'] = (
    df_dso_anon['From Currency Code'].map(map_subsidiary_currency_code)
)

+---------------------------------+--------+-------------+-----------------------+
|             Column              | Status |    Type     |        Method         |
| NetSuite Extract DateTime       | 1      | datetime    | Datetime Perturbation |
+---------------------------------+--------+-------------+-----------------------+
| Transaction Line Amount - Local | 0      | numeric     |                       |
+---------------------------------+--------+-------------+-----------------------+
| Transaction Line Amount - USD   | 0      | numeric     |                       |
+---------------------------------+--------+-------------+-----------------------+
| %DSOKey                         | 0      | categorical |                       |
+---------------------------------+--------+-------------+-----------------------+
| DSO Amount Type                 | 0      | categorical |                       |
+---------------------------------+--------+-------------+-----------------------+
| Fr

In [None]:
# Remove the helper columns from both frames, then line up original and
# anonymized values side by side (originals get the '_orig' suffix).
df_dso = df_dso.drop(columns=['period', 'subsidiary_code'])
df_dso_anon = df_dso_anon.drop(columns=['period', 'subsidiary_code'])
df_dso = df_dso.join(
    df_dso_anon,
    lsuffix='_orig',
    how='inner',
)
df_dso.head()

Unnamed: 0,NetSuite Extract DateTime_orig,Transaction Line Amount - Local_orig,Transaction Line Amount - USD_orig,%DSOKey_orig,DSO Amount Type_orig,From Currency Code_orig,NetSuite Extract DateTime,Transaction Line Amount - Local,Transaction Line Amount - USD,%DSOKey,DSO Amount Type,From Currency Code
0,2022-04-06 02:27:59,3912778.8,4430557.0,2022-01|FRA,Revenue,,2022-06-03 02:27:59,2658519.2,3003704.5,2022-02|cc8,Revenue,
1,2022-04-06 02:27:59,-15024.3,-16827.22,2022-04|FRA,Revenue,,2021-08-05 02:27:59,-60016.2,-61218.1,2021-12|cc8,Revenue,
2,2022-04-06 02:27:59,3915833.96,4309884.0,2022-03|FRA,Revenue,,2021-06-06 02:27:59,2660556.0,2923256.2,2021-12|cc8,Revenue,
3,2022-04-06 02:27:59,2949581.23,3346152.0,2022-02|FRA,Revenue,,2021-06-08 02:27:59,2016387.5,2280768.3,2021-08|cc8,Revenue,
4,2022-04-06 02:27:59,2137978.6,316228.4,2022-03|DMK,Revenue,,2022-11-04 02:27:59,1475319.1,260818.9,2022-10|033,Revenue,


In [None]:
# Rebind the DSO frame to an empty list to drop the data.
df_dso = list()

## Invoices

In [None]:
# Read the first 100000 invoice rows.
df_invoice = pd.read_csv(source_path+'AR_Invoices.csv', nrows=100000)

# Columns that hold dates; reused later when adding datetime noise.
date_columns = [
    'Date',
    'Due Date',
    'As Of Date',
    'Rev. Rec. Start Date',
    'Rev. Rec. End Date',
    'Contract Item Start Date',
    'Contract Item End Date',
]
for column in date_columns:
    df_invoice[column] = df_invoice[column].pipe(pd.to_datetime)
df_invoice.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Is Missing Required PO,Detail URL,Customer URL,Collection Group,%SummaryKey,Document ID,%ItemID,%ARCommentKey,Due Date,Date,...,Tax Value (Local),Channel Tier,Country Code,Dedicated Account Rep,Is Dedicated Account,PO Required,Credit Status,Credit Limit,SFDC Account Record Type,%InvoiceItemKey
0,No,https://system.na1.netsuite.com/app/accounting...,https://system.na1.netsuite.com/app/common/ent...,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004586,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-01-31,2019-01-01,...,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004586|2171
1,No,https://system.na1.netsuite.com/app/accounting...,https://system.na1.netsuite.com/app/common/ent...,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004587,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-01-31,2019-01-01,...,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004587|2171
2,No,https://system.na1.netsuite.com/app/accounting...,https://system.na1.netsuite.com/app/common/ent...,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004620,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-02-27,2019-01-28,...,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004620|2171
3,No,https://system.na1.netsuite.com/app/accounting...,https://system.na1.netsuite.com/app/common/ent...,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004620,10061.0,1/31/2019|230569 PT. Evotech Distribusi,2019-02-27,2019-01-28,...,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004620|10061
4,No,https://system.na1.netsuite.com/app/accounting...,https://system.na1.netsuite.com/app/common/ent...,Reseller,1/31/2019|230569 PT. Evotech Distribusi,INVSING00004620,2171.0,1/31/2019|230569 PT. Evotech Distribusi,2019-02-27,2019-01-28,...,0.0,Reseller,ID,,No,No,Approved,,Partner Account,INVSING00004620|2171


In [None]:
# Suppress link/identifier columns outright and perturb every date column.
# NOTE(review): '%ARCommentKey' is left untouched in this cell although the
# comments section splits the same key into a date and a free-text part -
# confirm whether it needs tokenizing here as well.
anon_invoice = dfAnonymizer(df_invoice)
anon_invoice.column_suppression([
    'Detail URL',
    'Customer URL',
    '%SummaryKey',
    'Project Name',
    'Credit Limit',
])
anon_invoice.datetime_noise(date_columns)

anon_invoice.info()


+-----------------------------------------------+--------+-------------+-----------------------+
|                    Column                     | Status |    Type     |        Method         |
| Is Missing Required PO                        | 0      | categorical |                       |
+-----------------------------------------------+--------+-------------+-----------------------+
| Detail URL                                    | 1      | categorical | Column Suppression    |
+-----------------------------------------------+--------+-------------+-----------------------+
| Customer URL                                  | 1      | categorical | Column Suppression    |
+-----------------------------------------------+--------+-------------+-----------------------+
| Collection Group                              | 0      | categorical |                       |
+-----------------------------------------------+--------+-------------+-----------------------+
| %SummaryKey                 

In [None]:
df_invoice_anon = anon_invoice.to_df()

# Monetary columns that get distorted with noise_amount_column.
value_columns = [
    'Temp Transaction Amount', 'Temp Amount Due (Foreign Currency)',
    'Open Balance', 'Amount Due (Foreign Currency)',
    'Transaction Amount', 'Remaining (m)',
    'Recognized Balance', 'Remaining Deferred Balance', 'Tax Value',
    'Recognized Balance (Foreign Currency)',
    'Remaining Deferred Balance (Foreign Currency)',
    'Tax Value (Foreign Currency)',
    'Recognized Balance (Local)',
    'Remaining Deferred Balance (Local)',
    'Tax Value (Local)',
]
for column in value_columns:
    df_invoice_anon[column] = df_invoice_anon[column].pipe(noise_amount_column)

# Translate transaction currencies with the mapping built in the DSO section.
df_invoice_anon['Transaction Currency'] = (
    df_invoice_anon['Transaction Currency'].map(map_subsidiary_currency_code)
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e3ab9b9d-2749-46b3-82e1-74830b780a5f' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>