In [339]:
import pandas as pd


In [340]:
!ls

anonymised.csv    available_ids.csv translation.csv
anonymiser.ipynb  input.csv         translation2.csv


In [404]:
import random
from faker import Faker
def create_test_input_table(number_of_transactions, number_of_ids, types_of_transactions):
    d = []
    fake = Faker() 
    name_list=[]
    for i in range(0,number_of_ids):
        name_list.append(fake.name())
    print ("Created a list of names of len:" + str(len(name_list)))
    for i in range(0, number_of_transactions):
        id= random.randint(1,number_of_ids)
        name =name_list[id-1]
        transact= random.randint(1,types_of_transactions)
        d.append({'ID': id ,'Name': name, 'Transact': transact })
    print(len(d))
    df= pd.DataFrame(d)
    return df
    

In [405]:
df= create_test_input_table(1000,10,10)


Created a list of names of len:10
1000


In [357]:
#to delete any column by index from a dataframe
#df= df.drop(df.columns[0],1)
#df  (or use inplace=True)

In [364]:
import csv
def load_translation_table(file_path):
    L2S={}
    S2L={} 
    with open(file_path, 'r') as csvfile:
        r = csv.reader(csvfile, delimiter=',', quotechar='"',)
        next(r)
        for row in r:
            if row[0] in L2S.keys(): 
                print("Duplicate id value detected:" + str(row[0]))
            else:
                L2S[row[0]]=row[1]
            if row[1] in S2L.keys():
                print("Duplicate syn value detected:" + str(row[1]))
            else:
                S2L[row[1]]=row[0]
                
    return L2S, S2L


def save_translation_table(file_path,L2S):
    with open(file_path,'w') as f:
        w = csv.writer(f, delimiter=',', quotechar='"',quoting=csv.QUOTE_MINIMAL) 
        w.writerow(['id','syn'])
        w.writerows(L2S.items())




     

This function loads the translation table, which is a plain CSV
Check out the quoting and delimiter if you wish to work with text IDs
It does check if there is any duplicate. 
Because it is a transation table, there should be NO duplicates either way.

In [365]:
def create_syn_range(start,end):
    l = list(range(start,end+1))
    l = [str(i) for i in l]
    return l

def load_available_syns(file_path):
    l=[]
    with open(file_path , 'r') as f:
        f.readline()
        for line in f:
            line = line.strip() #or some other preprocessing
            l.append(line) #storing everything in memory!
    return l
def save_available_syns(file_path, list_available):
    with open(file_path , 'w') as f:
        f.write("syn\n")
        for item in list_available:
            f.write("%s\n" % item)

def check_available_syns(list_available, S2L, L2S, cleanup_used= False):
    error_count=0
    for n in S2L.keys():
        if n in list_available:          
            error_count += 1
            loc= list_available.index(n)
            print(str(n)+" was found in the list_available list" + str(loc))
            if cleanup_used==True:
                list_available.pop(loc)
                print ("Removed from list in memory")
    print ("Errors found checking S2L:" + str(error_count))
    error_count=0
    for n in L2S.values():
        if n in list_available:          
            error_count += 1
            loc= list_available.index(n)
            print(str(n)+" was found in the list_available list" + str(loc))
            if cleanup_used==True:
                list_available.pop(loc)
                print ("Removed from list in memory")
    print ("Errors found checking L2S:" + str(error_count))


def expand_available_syns(list_available,  S2L, start, end):
    new_range= list(range(start,end+1))
    for n in new_range:
        if  str(n) not in S2L.keys():
            list_available.append(str(n))
    return list_available


In [415]:
def assign_syn_to_live(live_value,list_available,L2S,S2L):
    if live_value in L2S.keys():
        return L2S[live_value]
    else:
        if len(list_available)>0:
            new_syn=list_available.pop(0)
            L2S[live_value]=new_syn
            S2L[new_syn]=live_value
        else:
            print("Ran out of available ids")
            new_syn=""
        return new_syn

def anonymise(df, column_name, list_available, L2S, S2L ):
    for i, r in df.iterrows():
        live_value= r[column_name]
        syn_value = assign_syn_to_live(live_value,list_available,L2S,S2L)
        df.at[i, column_name]= syn_value
def de_anonymise(df,column_name,list_available,S2L):
    for i, r in df.iterrows():
        syn_value= r[column_name]
        live_value= S2L[syn_value]
        df.at[i,column_name]= live_value 


In [418]:
%%time
L2S={}
S2L={}
df= create_test_input_table(1000,100,10)
list_available=create_syn_range(1,200)
len(df)
print(df)

Created a list of names of len:100
1000
     ID                   Name  Transact
0    48         Barbara Hodges         2
1    86            Warren Hunt         4
2    34            Lori Turner         1
3    76             Chase Soto         8
4    72         Cameron Walker         8
5    36        Kristine Larsen         7
6    51          Thomas Flores         3
7    90          Tanya Ramirez         2
8    93        Jonathan Miller         6
9    16            Karen Banks         6
10   19     Valerie Richardson         2
11   80      Dr. Gerald Murphy         5
12   36        Kristine Larsen         4
13   56            Michele Lee         4
14   33         Gary Davis DDS         5
15   21            Thomas Mays         4
16   73       Stephen Saunders         1
17   79     Christopher Hudson         2
18   71           Kirk Buckley         2
19   58        Nicholas French         6
20   65          Melissa Smith         6
21   10            Mark Jensen        10
22   88          

In [419]:

anonymise(df,"ID",list_available,L2S,S2L)

In [420]:
df

Unnamed: 0,ID,Name,Transact
0,1,Barbara Hodges,2
1,2,Warren Hunt,4
2,3,Lori Turner,1
3,4,Chase Soto,8
4,5,Cameron Walker,8
5,6,Kristine Larsen,7
6,7,Thomas Flores,3
7,8,Tanya Ramirez,2
8,9,Jonathan Miller,6
9,10,Karen Banks,6


In [421]:
de_anonymise(df,"ID",list_available,S2L)
df

KeyError: 1

In [338]:
%%time
df.to_csv( "anonymised.csv", index=False , sep=',')


## Test for a small scale translation with existing files

df is the dataframe with the data to anonymise. We use a plain csv loader that comes with Pandas.
Depending on the format you need to beef up this command to load or otherwise process the data into a single data frame.

In [None]:
print("We load the translation table, available keys and input file")
L2S , S2L = load_translation_table('translation.csv')
df= pd.read_csv('input.csv')
list_available=load_available_slots("available_ids.csv")

print (L2S.keys())
print (S2L.keys())
print (list_available)

print("Now We try to save the table and load the table again")
save_translation_table("translation.csv", L2S)
L2S , S2L = load_translation_table('translation.csv')
print (L2S.keys())
print (S2L.keys())


print ("Now we will erase the the list_available")
list_available= create_syn_range(1,10)
print ("Created list with " + str(len(list_available) + " elements")

expand_available_syns(list_available, S2L,1,40)
save_available_syns("available_ids.csv", list_available)


check_available_syns(list_available, S2L, L2S, True)
print ("Current list of available Syns:")
print (list_available)

new_syn, L2S, S2L = assign_syn_to_live(7777, list_available, L2S, S2L)
print(new_syn)
print(list_available)
print (L2S)
print (S2L)
