In [1]:
import pandas as pd
import csv
import random
from faker import Faker

In [2]:
def create_test_input_table(number_of_transactions, number_of_ids, types_of_transactions, id_low, id_max, id_sparse=False):
    """Generate a fake transactions table and write it, with its ID list, to CSV.

    A pool of ``number_of_ids`` distinct IDs is created, each paired with a
    name from ``Faker().name()``. Every transaction then picks one ID/name
    pair at random, a transaction type in ``1..types_of_transactions`` and an
    amount in ``-10000..100000`` (both ranges inclusive).

    Side effects: writes ``input.csv`` (the transactions) and ``id_list.csv``
    (the ID/name pairs, sorted by ID) to the current working directory.

    Parameters
    ----------
    number_of_transactions : int
        Number of transaction rows to generate.
    number_of_ids : int
        Number of distinct IDs in the pool; must be at least 1 and no larger
        than ``id_max - id_low``.
    types_of_transactions : int
        Upper bound (inclusive) of the ``Transact`` code.
    id_low, id_max : int
        Bounds of the ID space.
    id_sparse : bool, default False
        When True, IDs are drawn at random (without repetition) from
        ``id_low..id_max``; otherwise they are the consecutive values
        starting at ``id_low``.

    Returns
    -------
    pandas.DataFrame or None
        The transactions frame with columns ``ID``, ``Name``, ``Transact``,
        ``Amount``; ``None`` (after printing a message) when the arguments
        are rejected.
    """
    if id_max-id_low < number_of_ids:
        print("Number of IDs needs to be less or equal than the range of id_low:id_max")
        return
    if number_of_ids < 1:
        # An empty ID pool leaves nothing for the transactions to refer to.
        print("Number of IDs needs to be at least 1")
        return

    fake = Faker()

    if id_sparse==True:
        print("Generating sparse IDs")
        # Sampling without replacement guarantees every ID is distinct.
        chosen_ids = random.sample(range(id_low, id_max + 1), number_of_ids)
    else:
        print("Generating sequential IDs")
        chosen_ids = range(id_low, id_low + number_of_ids)

    id_list = [[chosen_id, fake.name()] for chosen_id in chosen_ids]
    print ("Created a list of id and names of len:" + str(len(id_list)))

    records = []
    for _ in range(number_of_transactions):
        customer_id, customer_name = random.choice(id_list)
        records.append({
            'ID': customer_id,
            'Name': customer_name,
            'Transact': random.randint(1, types_of_transactions),
            'Amount': random.randint(-10000, 100000),
        })
    print(len(records))

    # Explicit column lists keep the headers in place even for zero rows.
    df1 = pd.DataFrame(records, columns=['ID', 'Name', 'Transact', 'Amount'])
    df2 = pd.DataFrame(id_list, columns=['ID', 'Name']).sort_values(by=['ID'])
    df1.to_csv("input.csv", index=False)
    df2.to_csv("id_list.csv", index=False)
    return df1
# Demo data: 1000 transactions spread over 100 sparse IDs taken from 80000-90000.
df = create_test_input_table(
    number_of_transactions=1000,
    number_of_ids=100,
    types_of_transactions=10,
    id_low=80000,
    id_max=90000,
    id_sparse=True,
)
df.head()

Generating sparse IDs
Created a list of id and names of len:100
1000


Unnamed: 0,ID,Name,Transact,Amount
0,86107,Lauren Sanders,6,46104
1,89331,Amanda Hansen,6,54343
2,82355,Brooke Morris DDS,4,-9850
3,89927,Edward Robinson,6,3981
4,85804,Jacob Wilkerson,1,10045


In [None]:

def load_translation_table(file_path):
    """Load a two-column translation CSV into forward and reverse dicts.

    The first line of the file is treated as a header and skipped. For each
    remaining row, column 0 is the live id and column 1 is its syn. Values
    are kept as the strings produced by ``csv.reader``.

    The first occurrence of an id (or of a syn) wins; later duplicates are
    reported with ``print`` and are not overwritten. Blank or single-column
    rows are ignored, and an empty file yields two empty dicts.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    tuple(dict, dict)
        ``(L2S, S2L)``: live id -> syn and syn -> live id.
    """
    L2S = {}
    S2L = {}
    # newline='' is what the csv module expects so it can handle line endings
    # (including newlines inside quoted fields) itself.
    with open(file_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header; the default avoids StopIteration on an empty file
        for row in reader:
            if len(row) < 2:
                # Blank line or truncated record: nothing to map.
                continue
            live_id, syn = row[0], row[1]
            if live_id in L2S:
                print("Duplicate id value detected:" + str(live_id))
            else:
                L2S[live_id] = syn
            if syn in S2L:
                print("Duplicate syn value detected:" + str(syn))
            else:
                S2L[syn] = live_id

    return L2S, S2L


def save_translation_table(file_path,L2S):
    """Write the live id -> syn mapping to a CSV file with an ``id,syn`` header.

    Parameters
    ----------
    file_path : str
        Destination path; an existing file is overwritten.
    L2S : dict
        Mapping of live id to syn; one row is written per item.
    """
    # newline='' stops the text layer from translating the "\r\n" that the csv
    # writer emits, which would otherwise become "\r\r\n" on platforms that
    # translate newlines and show up as blank rows when the file is read back.
    with open(file_path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['id', 'syn'])
        writer.writerows(L2S.items())




The functions below manage the pool of SYNs that are still available for assignment.
If you only need to anonymise a single file, it is enough to identify the unique keys and map them directly onto a numeric series.
If you are going to anonymise several separate tables, however, you need to manage that pool so that new IDs appearing in later tables can still be given an unused SYN.

In [None]:
def create_syn_range(start,end):
    """Return every integer from ``start`` to ``end`` (inclusive) as a string, in order."""
    return [str(number) for number in range(start, end + 1)]

def load_available_syns(file_path):
    """Read the pool of available syns from a one-column text file.

    The first line is treated as a header and discarded. Every other line is
    stripped of surrounding whitespace; lines that are empty after stripping
    are skipped, so a stray blank line cannot put an empty-string syn into
    the pool.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    list of str
        The syns in file order. The whole file is held in memory.
    """
    with open(file_path, 'r') as f:
        f.readline()  # discard the header line
        stripped_lines = (line.strip() for line in f)
        return [syn for syn in stripped_lines if syn]
def save_available_syns(file_path, list_available):
    """Write the available syns to ``file_path``: a ``syn`` header, then one entry per line.

    An existing file is overwritten. Entries are formatted with ``%s``.
    """
    with open(file_path, 'w') as out:
        out.write("syn\n")
        out.writelines("%s\n" % entry for entry in list_available)

def check_available_syns(list_available, S2L, L2S, cleanup_used= False):
    """Report syns that are already assigned but still sit in the available pool.

    Two passes are made, first over the keys of ``S2L`` and then over the
    values of ``L2S``. Each syn found in ``list_available`` is printed with
    its position; when ``cleanup_used`` equals ``True`` it is also removed
    from ``list_available`` in place. A count is printed after each pass.
    Nothing is returned.
    """
    # Evaluated once; keeps the original strict `== True` comparison.
    remove_hits = cleanup_used == True

    def scan(assigned_syns, summary_prefix):
        hits = 0
        for syn in assigned_syns:
            if syn not in list_available:
                continue
            hits += 1
            position = list_available.index(syn)
            print(str(syn)+" was found in the list_available list" + str(position))
            if remove_hits:
                list_available.pop(position)
                print ("Removed from list in memory")
        print (summary_prefix + str(hits))

    scan(S2L.keys(), "Errors found checking S2L:")
    scan(L2S.values(), "Errors found checking L2S:")


def expand_available_syns(list_available,  S2L, start, end):
    """Add the syns ``start..end`` (inclusive, as strings) to the available pool.

    A syn is skipped when it is already assigned (a key of ``S2L``) or is
    already present in ``list_available``, so expanding with an overlapping
    range never creates duplicates that could later be handed out twice.

    Parameters
    ----------
    list_available : list of str
        The pool; extended in place.
    S2L : dict
        Mapping of syn to live value; its keys are the syns already in use.
    start, end : int
        Bounds of the range to add, both inclusive.

    Returns
    -------
    list of str
        The same ``list_available`` object, for convenience.
    """
    already_pooled = set(list_available)  # O(1) membership instead of scanning the list
    for number in range(start, end + 1):
        syn = str(number)
        if syn in S2L or syn in already_pooled:
            continue
        list_available.append(syn)
        already_pooled.add(syn)
    return list_available

def assign_syn_to_live(live_value,list_available,L2S,S2L):
    """Return the syn for ``live_value``, allocating one from the pool if needed.

    A live value already present in ``L2S`` gets its existing syn back.
    Otherwise the first entry of ``list_available`` is removed and recorded
    in both ``L2S`` and ``S2L``. When the pool is empty a message is printed
    and the empty string is returned without touching either mapping.
    """
    if live_value in L2S:
        return L2S[live_value]

    if not list_available:
        print("Ran out of available ids")
        return ""

    syn = list_available.pop(0)
    L2S[live_value] = syn
    S2L[syn] = live_value
    return syn


In [None]:

def anonymise(df, column_name, list_available, L2S, S2L ):
    """Replace every value of ``df[column_name]`` with its syn, in place.

    Values are processed in row order through ``assign_syn_to_live``, so
    ``L2S``, ``S2L`` and ``list_available`` are updated as new live values
    are met. Nothing is returned; ``df`` itself is modified.

    The column is rebuilt and assigned in one step rather than written cell
    by cell with ``df.at`` inside ``iterrows``: label-based writes address
    the wrong rows when the index has repeated labels, and per-cell writes
    of string syns into a numeric column force a dtype change mid-loop.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to anonymise.
    column_name : str
        Column holding the live values.
    list_available : list
        Pool of unused syns; consumed from the front.
    L2S, S2L : dict
        Live -> syn and syn -> live mappings; updated in place.
    """
    df[column_name] = [
        assign_syn_to_live(live_value, list_available, L2S, S2L)
        for live_value in df[column_name]
    ]
        
def de_anonymise(df,column_name,list_available,S2L):
    """Replace every syn in ``df[column_name]`` with its live value, in place.

    All values are translated before the column is written back, so an
    unknown syn raises ``KeyError`` and leaves ``df`` untouched instead of
    half-translated. Assigning the whole column also works when the index
    contains repeated labels, which cell-by-cell ``df.at`` writes do not.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to restore.
    column_name : str
        Column holding the syn values.
    list_available : list
        Not used by this function; kept so existing calls keep working.
    S2L : dict
        Mapping of syn to live value.

    Raises
    ------
    KeyError
        If a value in the column is not a key of ``S2L``.
    """
    live_values = [S2L[syn_value] for syn_value in df[column_name]]
    df[column_name] = live_values

        
        
        

In [3]:
def anonymise_mass(df,column_name, L2S, S2L, list_available=None):
    """Assign a syn to every distinct value of ``df[column_name]``.

    The distinct values are processed in sorted order through
    ``assign_syn_to_live``, which updates ``L2S``, ``S2L`` and the pool.
    ``df`` itself is not modified; apply the returned mapping afterwards,
    e.g. with ``df[column_name].map(L2S)``.

    For a one-off job with no persistent translation table, numbering the
    sorted distinct values ``1..n`` directly would be enough.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the live values.
    column_name : str
        Column holding the live values.
    L2S, S2L : dict
        Live -> syn and syn -> live mappings; updated in place.
    list_available : list, optional
        Pool of unused syns. When omitted, the notebook-level variable
        ``list_available`` is used, as earlier versions of this function did.

    Returns
    -------
    dict
        The updated ``L2S``.

    Raises
    ------
    NameError
        If no pool is passed and no global ``list_available`` exists.
    """
    if list_available is None:
        # Backward compatibility: the pool used to be read from a global.
        list_available = globals().get("list_available")
        if list_available is None:
            raise NameError(
                "name 'list_available' is not defined: pass list_available=... "
                "or define it before calling anonymise_mass"
            )

    unique_live_values = df[column_name].drop_duplicates().sort_values()
    for live_value in unique_live_values:
        assign_syn_to_live(live_value, list_available, L2S, S2L)

    return L2S

In [5]:
# NOTE(review): `create_translation` is not defined in any cell visible here, so this
# cell fails on a fresh kernel unless it is defined elsewhere — TODO confirm. It looks
# like an earlier name for what `anonymise_mass` now does; verify before replacing.
# NOTE(review): the key 800002 lies outside the 80000-90000 ID range used to build
# `df` above, so this lookup presumably dates from a different dataset — confirm.
trans_dict= create_translation(df,"ID")
trans_dict[800002]

3

In [3]:
# Add the translated ID next to the original one. `trans_dict` must already exist,
# so the cell that builds it has to run first; this cell raised NameError when it
# was executed out of order.
df['New ID'] = df['ID'].map(trans_dict)
# Show a preview only, rather than dumping the whole frame into the notebook.
df.head()

NameError: name 'trans_dict' is not defined