forked from jdaries/de_id
-
Notifications
You must be signed in to change notification settings - Fork 14
/
numeric_generalization_v2.py
260 lines (231 loc) · 10.7 KB
/
numeric_generalization_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
__author__ = 'olivia'
import sys, pickle, csv
import numpy as np
import pandas as pd
"""
Bin a set of numeric values so that at least n entities are within each bin. In particular,
this code will take the Year of Birth (YoB) and the number of forum posts (nforum_posts) values
and produce bins with a particular range and calculate the mean for that range. This code is
improved in order to bin values together in such a way that minimizes the distortion of the mean
of the post-binned values.
"""
# Working for year of birth and number of forum posts
def collapse(val,num,denom,maxbinsize):
    """
    Perform one greedy merge step: among all bins with fewer than maxbinsize
    members, find the merge with an adjacent bin that least distorts the bin
    mean, and perform exactly that one merge. The caller (see build_bins) is
    expected to invoke this repeatedly until no bin is under-populated.
    Note that this only works for integer values.
    :param val: a list of lists; each inner list holds the original, unbinned
        values currently assigned to that bin
    :param num: a numpy array of the sum of the values in each bin
    :param denom: a numpy array of the count of values in each bin
    :param maxbinsize: the minimum acceptable bin population; bins with fewer
        members are merge candidates
    :return: three lists of val, num, denom where val is a list of lists of the original values that now belong
    to each bin; num is the sum of all the values that belong in that bin; and denom is the count of the number
    of values that belong to that bin.
    """
    def left_merge(index,val=val,num=num,denom=denom):
        # Fold bin `index` into its left neighbour; all three parallel
        # structures shrink by one slot. The default arguments deliberately
        # bind the enclosing function's arrays at definition time.
        num[index-1] = num[index-1] + num[index]
        num = np.delete(num,index)
        denom[index-1] = denom[index-1] + denom[index]
        denom = np.delete(denom,index)
        val[index-1] = val[index-1]+val[index]
        del val[index]
        return val,num,denom
    def right_merge(index,val=val,num=num,denom=denom):
        # Mirror of left_merge: fold bin `index+1` into bin `index`.
        num[index] = num[index] + num[index+1]
        num = np.delete(num,index+1)
        denom[index] = denom[index] + denom[index+1]
        denom = np.delete(denom,index+1)
        val[index] = val[index]+val[index+1]
        del val[index+1]
        return val,num,denom
    # Indices of bins that are still too small, and their left neighbours.
    indices_less_than_k = np.where(denom<maxbinsize)[0]
    left_indices = indices_less_than_k-1
    # if left endpoint is in list, then don't calculate for that index
    # NOTE(review): num/denom below relies on true (float) division of the
    # numpy integer arrays; under Python 2 this would floor-divide — confirm
    # which interpreter this module targets.
    if -1 not in left_indices:
        # Every under-sized bin has a left neighbour; the candidate error is
        # |mean(bin) - mean(left neighbour)| for each such bin.
        left_errors = abs(num[indices_less_than_k]/denom[indices_less_than_k] - num[left_indices]/denom[left_indices])
        minerror_left = min(left_errors)
        minerror_left_index = indices_less_than_k[np.argmin(left_errors)]
    elif len(left_indices) > 1:
        # compare with left bucket
        # Bin 0 is under-sized and has no left neighbour; skip it ([1:]).
        left_errors = abs(num[indices_less_than_k[1:]]/denom[indices_less_than_k[1:]] - num[left_indices[1:]]/denom[left_indices[1:]])
        minerror_left = min(left_errors)
        minerror_left_index = indices_less_than_k[np.argmin(left_errors)+1]
    else:
        # Only bin 0 is under-sized: no left merge is possible.
        minerror_left = np.inf
        minerror_left_index = np.inf
    right_indices = indices_less_than_k+1
    # if right endpoint is in list, then don't calculate for that index
    if len(denom) not in right_indices:
        right_errors = abs(num[indices_less_than_k]/denom[indices_less_than_k] - num[right_indices]/denom[right_indices])
        minerror_right = min(right_errors)
        minerror_right_index = indices_less_than_k[np.argmin(right_errors)]
    elif len(right_indices)>1:
        # compare with right bucket
        # The last bin is under-sized and has no right neighbour; skip it ([:-1]).
        right_errors = abs(num[indices_less_than_k[:-1]]/denom[indices_less_than_k[:-1]] - num[right_indices[:-1]]/denom[right_indices[:-1]])
        minerror_right = min(right_errors)
        minerror_right_index = indices_less_than_k[np.argmin(right_errors)]
    else:
        # Only the final bin is under-sized: no right merge is possible.
        minerror_right = np.inf
        minerror_right_index = np.inf
    # overall smallest error
    # collapse left
    # NOTE(review): if neither side permits a merge (both errors are inf),
    # the first branch still fires with minerror_left_index == np.inf and
    # left_merge would raise on the float index. The while-guard in
    # build_bins presumably never lets that happen — confirm.
    if minerror_left <= minerror_right:
        return left_merge(minerror_left_index)
    elif minerror_right < minerror_left:
        return right_merge(minerror_right_index)
# Creates a dictionary that maps each unique value onto a corresponding range
def createConversionDict(val,num,denom,numdenom):
    """
    Take a list of bins as generated by collapse and create a dictionary whose keys are the unique
    values in the dataset and whose corresponding values are a triple of the bin label that each
    unique value should be mapped onto, the mean of that bin, and the bin's population.
    :param val: a list of lists of the original, unbinned values (one inner list per bin)
    :param num: unused; retained for interface compatibility with collapse's return
    :param denom: unused; retained for interface compatibility with collapse's return
    :param numdenom: a pandas dataframe of the original values, counts, and sums, indexed by value
    :return: a dictionary keyed by value binned with values the triple (bin range, mean, count).
    Bin range will be a string, while mean will be a floating point value
    """
    numDict = {}
    for v in val:
        # .loc replaces DataFrame.ix, which was deprecated in pandas 0.20
        # and removed in 1.0; v is a list of index labels, selecting rows.
        tm = numdenom.loc[v]
        bucket_count = tm['count'].sum()
        bucket_mean = tm['sum'].sum() / float(bucket_count)
        # The label is invariant per bin, so compute it once outside the
        # inner loop: a single value, or an inclusive "min-max" range.
        if len(v) == 1:
            label = str(v[0])
        else:
            label = str(min(v)) + '-' + str(max(v))
        for v2 in v:
            numDict[v2] = (label, bucket_mean, bucket_count)
    return numDict
def build_bins(val_l, bin_size):
    '''
    Construct the generalization bins for one quasi-identifier. Each distinct
    value starts in its own bin; the greedy collapse() step is then applied
    until every bin holds at least bin_size records, choosing merges that
    minimize the distortion of each bin's mean. Bins may therefore end up
    larger than bin_size, but the statistical bias of the generalization is
    kept small.
    :param val_l: A list of [value, count, sum] triples as built by dict_to_list
    :param bin_size: the minimum number of records per resulting bin
    :return: a generalization table mapping each value to (bin label, bin mean,
        bin count), as produced by createConversionDict
    '''
    frame = pd.DataFrame(val_l, columns=['item', 'count', 'sum'])
    frame.index = frame['item']
    # Start with one singleton bin per distinct value.
    bins = [[item] for item in frame['item'].values]
    counts = frame['count'].values
    sums = frame['sum'].values
    # Merge until no bin is under-populated.
    while (counts < bin_size).sum() > 0:
        bins, sums, counts = collapse(bins, sums, counts, bin_size)
    return createConversionDict(bins, sums, counts, frame)
def update_num_dict(val, dict):
    '''
    Tally one quasi-identifier observation into a running dictionary. Each
    entry is keyed by the integer value and holds [count, running total];
    an empty string is treated as the sentinel value -1.
    :param val: the value of the quasi-identifier, as a string ('' for missing)
    :param dict: the dictionary to be updated in place
    :return: The dictionary updated
    '''
    key = -1 if val == '' else int(val)
    entry = dict.get(key)
    if entry is None:
        dict[key] = [1, key]
    else:
        entry[0] += 1
        entry[1] += key
    return dict
def dict_to_list(d):
    '''
    Convert a dictionary keyed by quasi-identifier values with entries pairs of the number of items
    with that value and the product of the value and the count into a list of triples
    <value, count, count*value>. Return this list, sorted by value. The missing-value sentinel -1
    (produced by update_num_dict for '') is handled specially with a triple of <-1, count, 0>;
    this will be the last member of the returned list if it exists.
    :param d: A dictionary of quasi-identifier values
    :return: A sorted list of triples of the form <value, count, value*count>
    '''
    ret_list = sorted([k, v[0], v[1]] for k, v in d.items())
    # Bug fix: the sentinel key is the integer -1, which sorts FIRST; the old
    # code popped the LAST element and compared its key to the string '-1',
    # so the sentinel was never detected. Move it to the end with a zeroed
    # sum, per the documented contract.
    if ret_list and ret_list[0][0] == -1:
        missing = ret_list.pop(0)
        missing[2] = 0
        ret_list.append(missing)
    # An empty input now yields [] instead of raising IndexError on pop().
    return ret_list
def dump_map(m, f_name):
    '''
    Write a pickle of a map to an output file
    :param m: the map to be pickled
    :param f_name: the name of the file to contain the pickled map
    :return: None
    '''
    # Context manager guarantees the handle is closed even if pickling
    # raises; the old open/close pair leaked the handle on error.
    with open(f_name, 'wb') as f_out:
        pickle.dump(m, f_out)
    return None
def create_value_maps(cin, fname_ps, bin_size):
    '''
    Create the generalization tables, and save them in pickled form. This function takes a .csv file of quasi-identifiers,
    the name to use as the base of the output file, and the size of the bins to be produced. It will create a mapping
    file for each of the numeric quasi-identifiers, grouping the numeric values so that there are at least bin_size
    records within each bin.
    :param cin: A .csv file to be used as input. We assume the first line is made up of header information, so it is
    skipped
    :param fname_ps: The base name for the output files
    :param bin_size: The minimum size of the bins that are to be create
    :return: None
    '''
    # (csv column index, output file prefix) for each numeric quasi-identifier
    columns = [
        (8, 'yob_map_'),
        (10, 'f_post_map_'),
        (11, 'f_votes_map_'),
        (12, 'f_endorsed_map_'),
        (13, 'f_threads_map_'),
        (14, 'f_comments_map_'),
    ]
    tallies = [{} for _ in columns]
    # read (and discard) the header line
    next(cin)
    # tally count and running sum per distinct value, column by column
    for row in cin:
        for tally, (col, _) in zip(tallies, columns):
            update_num_dict(row[col], tally)
    # bin each column's values and pickle the resulting generalization map
    for tally, (_, prefix) in zip(tallies, columns):
        value_map = build_bins(dict_to_list(tally), bin_size)
        dump_map(value_map, ''.join([prefix, fname_ps, '.pkl']))
    return None
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print ('Usage: python numeric_generalization_v2.py in_file bin_size')
        sys.exit(1)
    fname_in = sys.argv[1]
    # Output base name: drop the 4-character extension (e.g. '.csv') and any
    # leading directory components. split('/')[-1] is equivalent to the old
    # character-by-character stripping loop.
    fname_out = fname_in[:-4].split('/')[-1]
    bin_size = int(sys.argv[2])
    # The input file was previously opened and never closed; the context
    # manager closes it once the maps have been written.
    with open(fname_in, 'r') as f_in:
        create_value_maps(csv.reader(f_in), fname_out, bin_size)