forked from jdaries/de_id
-
Notifications
You must be signed in to change notification settings - Fork 14
/
thrifty_gen.py
117 lines (107 loc) · 3.92 KB
/
thrifty_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#####
# Thrifty Algorithm Version in python 2.7
# Usage: python2 thrifty_gen.py data_in data_out k col [col ...]
#####
import sys
import pandas as pd
import numpy as np
import copy
# thrifty() is the function that does all the lumping
#   df      -> the data, in a pandas dataframe
#   columns -> a list of columns to generalize on, in the order in which
#              they are generalized: columns[0] first, then columns[1], etc.
#   k       -> level of k-anonymity, defaults to 5
#   outfile -> the file to write the steps to
#   returns the generalized dataframe, or False if the data could not be
#   generalized to k-anonymity
def thrifty(df, columns, k=5, outfile='out.txt'):
    """Generalize ``columns`` of ``df`` until every record is k-anonymous.

    Records are grouped by the combination of their values in ``columns``.
    While the smallest group holds fewer than ``k`` records, two values of
    the column currently being generalized are merged into a single value
    marked with a trailing ``'+'``.  The value of the smallest group is
    paired with the value of a group of size ``k - n`` when one exists,
    otherwise with the value of the smallest remaining group.  Once a column
    is down to a single value, the next entry of ``columns`` is generalized.

    Args:
        df: pandas DataFrame holding the data; it is copied, not modified.
        columns: non-empty list of column names to generalize on, in the
            order in which they are generalized (``columns[0]`` first).
        k: required level of k-anonymity.
        outfile: path of the text file the merge steps are written to.

    Returns:
        The generalized DataFrame, including the helper columns
        ``'lump_col'`` (the combined group key) and ``'freq'`` (the size of
        each record's group), or ``False`` when every column has been
        collapsed to a single value and the data is still not k-anonymous.

    Note:
        Merged values are stored as strings.  Missing values (NaN) never
        compare equal to the value being merged, so they cannot be lumped.
    """
    df = df.copy()
    current = 0

    def group_key(row):
        # One string per combination of values, joined with '*'.
        return "*".join([str(row[c]) for c in columns])

    # The context manager closes the step log on every exit path,
    # including exceptions raised while generalizing.
    with open(outfile, 'w') as out:
        out.write("GENERALIZING BASED (IN ORDER) ON:\n")
        for c in columns:
            out.write(" " + str(c) + "\n")
        out.write("\n\nSTEPS TO GENERALIZE\n")

        while True:
            # Recompute each record's group and the size of that group.
            df['lump_col'] = df.apply(group_key, axis=1)
            df['freq'] = df.groupby('lump_col')['lump_col'].transform('count')

            # Finished: the smallest group is large enough.
            if df['freq'].min() >= k:
                break

            # Move past every column that has nothing left to merge.  Only
            # when all columns are exhausted is the goal truly unreachable;
            # equal group sizes alone do not mean that.
            while current < len(columns) and df[columns[current]].nunique() < 2:
                current += 1
            if current >= len(columns):
                out.write("Could not finish generalizing to " + str(k) + " anonymity.\n")
                return False
            current_col = columns[current]

            # Start from the first record belonging to a smallest group.
            smallest = df.loc[df['freq'] == df['freq'].min()].iloc[0]
            lump1 = smallest[current_col]
            sz1 = smallest['freq']

            # Prefer a partner whose group brings the total to exactly k;
            # otherwise fall back to the smallest group with another value.
            others = df.loc[df[current_col] != lump1]
            exact = others.loc[others['freq'] == k - sz1]
            if not exact.empty:
                partner = exact.iloc[0]
            else:
                partner = others.nsmallest(1, 'freq').iloc[0]
            lump2 = partner[current_col]
            sz2 = partner['freq']

            # str() keeps the log working for non-string values (e.g. ints).
            out.write("Combine: " + str(lump1) + " with " + str(lump2)
                      + ", sizes(" + str(sz1) + ", " + str(sz2) + ")\n")

            # The merged value is the partner's value with one trailing '+'.
            merged = str(lump2)
            if not merged.endswith('+'):
                merged += '+'
            df[current_col] = df[current_col].map(
                lambda v: merged if (v == lump1 or v == lump2) else v)

    return df
#### Script of code
def main(argv):
    """Run the thrifty generalization from command-line arguments.

    Args:
        argv: argument list laid out like ``sys.argv``:
            ``[prog, data_in, data_out, k, col, ...]``.

    Returns:
        The process exit code: 0 on success, 1 for too few arguments, 2 for
        an invalid ``k``, 3 for unknown column names, 4 when the data could
        not be generalized to ``k``-anonymity.
    """
    if len(argv) < 5:
        prog = argv[0] if argv else "thrifty_gen.py"
        print("Usage: python2 " + prog + " data_in data_out k col [col ...]\n")
        return 1

    df = pd.read_csv(argv[1])

    # A non-numeric k is reported like any other invalid k, not as a traceback.
    try:
        k = int(argv[3])
    except ValueError:
        k = None
    if k is None or k < 1:
        print("Please enter a valid value of k\n")
        return 2

    cols = argv[4:]
    if not set(cols).issubset(set(df)):
        print("Please enter valid column values\n")
        return 3

    result = thrifty(df, cols, k)
    # thrifty returns False (not a dataframe) when it cannot reach k-anonymity.
    if result is False:
        print("Could not generalize the data to " + str(k) + "\nStep by step process in out.txt\n")
        return 4

    result.to_csv(argv[2])
    print("Data generalized to " + str(k) + " in " + argv[2] + "\nStep by step process in out.txt\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))