forked from jdaries/de_id
-
Notifications
You must be signed in to change notification settings - Fork 14
/
numeric_generalization_v2.py
260 lines (231 loc) · 10.7 KB
/
numeric_generalization_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
__author__ = 'olivia'
import sys, pickle, csv
import numpy as np
import pandas as pd
"""
Bin a set of numeric values so that at least n entities are within each bin. In particular,
this code will take the Year of Birth (YoB) and the number of forum posts (nforum_posts) values
and produce bins with a particular range and calculate the mean for that range. This code is
improved in order to bin values together in such a way that minimizes the distortion of the mean
of the post-binned values.
"""
# Working for year of birth and number of forum posts
def collapse(val,num,denom,maxbinsize):
    """
    Perform one greedy merge step: among all bins with fewer than maxbinsize
    members, find the merge with an adjacent bin that least distorts the bin
    mean, and perform exactly that one merge. The caller (see build_bins) is
    expected to invoke this repeatedly until no bin is under-populated.
    Note that this only works for integer values.
    :param val: a list of lists; each inner list holds the original, unbinned
        values currently assigned to that bin
    :param num: a numpy array of the sum of the values in each bin
    :param denom: a numpy array of the count of values in each bin
    :param maxbinsize: the minimum acceptable bin population; bins with fewer
        members are merge candidates
    :return: three lists of val, num, denom where val is a list of lists of the original values that now belong
    to each bin; num is the sum of all the values that belong in that bin; and denom is the count of the number
    of values that belong to that bin.
    """
    def left_merge(index,val=val,num=num,denom=denom):
        # Fold bin `index` into its left neighbour; all three parallel
        # structures shrink by one slot. The default arguments deliberately
        # bind the enclosing function's arrays at definition time.
        num[index-1] = num[index-1] + num[index]
        num = np.delete(num,index)
        denom[index-1] = denom[index-1] + denom[index]
        denom = np.delete(denom,index)
        val[index-1] = val[index-1]+val[index]
        del val[index]
        return val,num,denom
    def right_merge(index,val=val,num=num,denom=denom):
        # Mirror of left_merge: fold bin `index+1` into bin `index`.
        num[index] = num[index] + num[index+1]
        num = np.delete(num,index+1)
        denom[index] = denom[index] + denom[index+1]
        denom = np.delete(denom,index+1)
        val[index] = val[index]+val[index+1]
        del val[index+1]
        return val,num,denom
    # Indices of bins that are still too small, and their left neighbours.
    indices_less_than_k = np.where(denom<maxbinsize)[0]
    left_indices = indices_less_than_k-1
    # if left endpoint is in list, then don't calculate for that index
    # NOTE(review): num/denom below relies on true (float) division of the
    # numpy integer arrays; under Python 2 this would floor-divide — confirm
    # which interpreter this module targets.
    if -1 not in left_indices:
        # Every under-sized bin has a left neighbour; the candidate error is
        # |mean(bin) - mean(left neighbour)| for each such bin.
        left_errors = abs(num[indices_less_than_k]/denom[indices_less_than_k] - num[left_indices]/denom[left_indices])
        minerror_left = min(left_errors)
        minerror_left_index = indices_less_than_k[np.argmin(left_errors)]
    elif len(left_indices) > 1:
        # compare with left bucket
        # Bin 0 is under-sized and has no left neighbour; skip it ([1:]).
        left_errors = abs(num[indices_less_than_k[1:]]/denom[indices_less_than_k[1:]] - num[left_indices[1:]]/denom[left_indices[1:]])
        minerror_left = min(left_errors)
        minerror_left_index = indices_less_than_k[np.argmin(left_errors)+1]
    else:
        # Only bin 0 is under-sized: no left merge is possible.
        minerror_left = np.inf
        minerror_left_index = np.inf
    right_indices = indices_less_than_k+1
    # if right endpoint is in list, then don't calculate for that index
    if len(denom) not in right_indices:
        right_errors = abs(num[indices_less_than_k]/denom[indices_less_than_k] - num[right_indices]/denom[right_indices])
        minerror_right = min(right_errors)
        minerror_right_index = indices_less_than_k[np.argmin(right_errors)]
    elif len(right_indices)>1:
        # compare with right bucket
        # The last bin is under-sized and has no right neighbour; skip it ([:-1]).
        right_errors = abs(num[indices_less_than_k[:-1]]/denom[indices_less_than_k[:-1]] - num[right_indices[:-1]]/denom[right_indices[:-1]])
        minerror_right = min(right_errors)
        minerror_right_index = indices_less_than_k[np.argmin(right_errors)]
    else:
        # Only the final bin is under-sized: no right merge is possible.
        minerror_right = np.inf
        minerror_right_index = np.inf
    # overall smallest error
    # collapse left
    # NOTE(review): if neither side permits a merge (both errors are inf),
    # the first branch still fires with minerror_left_index == np.inf and
    # left_merge would raise on the float index. The while-guard in
    # build_bins presumably never lets that happen — confirm.
    if minerror_left <= minerror_right:
        return left_merge(minerror_left_index)
    elif minerror_right < minerror_left:
        return right_merge(minerror_right_index)
# Creates a dictionary that maps each unique value onto a corresponding range
def createConversionDict(val,num,denom,numdenom):
    """
    Take a list of bins as generated by collapse and create a dictionary whose keys are the unique
    values in the dataset and whose corresponding values are a triple of the bin label that each
    unique value should be mapped onto, the mean of that bin, and the bin's population.
    :param val: a list of lists of the original, unbinned values (one inner list per bin)
    :param num: unused; retained for interface compatibility with collapse's return
    :param denom: unused; retained for interface compatibility with collapse's return
    :param numdenom: a pandas dataframe of the original values, counts, and sums, indexed by value
    :return: a dictionary keyed by value binned with values the triple (bin range, mean, count).
    Bin range will be a string, while mean will be a floating point value
    """
    numDict = {}
    for v in val:
        # .loc replaces DataFrame.ix, which was deprecated in pandas 0.20
        # and removed in 1.0; v is a list of index labels, selecting rows.
        tm = numdenom.loc[v]
        bucket_count = tm['count'].sum()
        bucket_mean = tm['sum'].sum() / float(bucket_count)
        # The label is invariant per bin, so compute it once outside the
        # inner loop: a single value, or an inclusive "min-max" range.
        if len(v) == 1:
            label = str(v[0])
        else:
            label = str(min(v)) + '-' + str(max(v))
        for v2 in v:
            numDict[v2] = (label, bucket_mean, bucket_count)
    return numDict
def build_bins(val_l, bin_size):
    '''
    Construct the generalization bins for one quasi-identifier. Each distinct
    value starts in its own bin; the greedy collapse() step is then applied
    until every bin holds at least bin_size records, choosing merges that
    minimize the distortion of each bin's mean. Bins may therefore end up
    larger than bin_size, but the statistical bias of the generalization is
    kept small.
    :param val_l: A list of [value, count, sum] triples as built by dict_to_list
    :param bin_size: the minimum number of records per resulting bin
    :return: a generalization table mapping each value to (bin label, bin mean,
        bin count), as produced by createConversionDict
    '''
    frame = pd.DataFrame(val_l, columns=['item', 'count', 'sum'])
    frame.index = frame['item']
    # Start with one singleton bin per distinct value.
    bins = [[item] for item in frame['item'].values]
    counts = frame['count'].values
    sums = frame['sum'].values
    # Merge until no bin is under-populated.
    while (counts < bin_size).sum() > 0:
        bins, sums, counts = collapse(bins, sums, counts, bin_size)
    return createConversionDict(bins, sums, counts, frame)
def update_num_dict(val, dict):
    '''
    Tally one quasi-identifier observation into a running dictionary. Each
    entry is keyed by the integer value and holds [count, running total];
    an empty string is treated as the sentinel value -1.
    :param val: the value of the quasi-identifier, as a string ('' for missing)
    :param dict: the dictionary to be updated in place
    :return: The dictionary updated
    '''
    key = -1 if val == '' else int(val)
    entry = dict.get(key)
    if entry is None:
        dict[key] = [1, key]
    else:
        entry[0] += 1
        entry[1] += key
    return dict
def dict_to_list(d):
    '''
    Convert a dictionary keyed by quasi-identifier values with entries pairs of the number of items
    with that value and the product of the value and the count into a list of triples
    <value, count, count*value>. Return this list, sorted by value. The missing-value sentinel -1
    (produced by update_num_dict for '') is handled specially with a triple of <-1, count, 0>;
    this will be the last member of the returned list if it exists.
    :param d: A dictionary of quasi-identifier values
    :return: A sorted list of triples of the form <value, count, value*count>
    '''
    ret_list = sorted([k, v[0], v[1]] for k, v in d.items())
    # Bug fix: the sentinel key is the integer -1, which sorts FIRST; the old
    # code popped the LAST element and compared its key to the string '-1',
    # so the sentinel was never detected. Move it to the end with a zeroed
    # sum, per the documented contract.
    if ret_list and ret_list[0][0] == -1:
        missing = ret_list.pop(0)
        missing[2] = 0
        ret_list.append(missing)
    # An empty input now yields [] instead of raising IndexError on pop().
    return ret_list
def dump_map(m, f_name):
    '''
    Write a pickle of a map to an output file
    :param m: the map to be pickled
    :param f_name: the name of the file to contain the pickled map
    :return: None
    '''
    # Context manager guarantees the handle is closed even if pickling
    # raises; the old open/close pair leaked the handle on error.
    with open(f_name, 'wb') as f_out:
        pickle.dump(m, f_out)
    return None
def create_value_maps(cin, fname_ps, bin_size):
    '''
    Create the generalization tables, and save them in pickled form. This function takes a .csv file of quasi-identifiers,
    the name to use as the base of the output file, and the size of the bins to be produced. It will create a mapping
    file for each of the numeric quasi-identifiers, grouping the numeric values so that there are at least bin_size
    records within each bin.
    :param cin: A .csv file to be used as input. We assume the first line is made up of header information, so it is
    skipped
    :param fname_ps: The base name for the output files
    :param bin_size: The minimum size of the bins that are to be create
    :return: None
    '''
    # (csv column index, output file prefix) for each numeric quasi-identifier
    columns = [
        (8, 'yob_map_'),
        (10, 'f_post_map_'),
        (11, 'f_votes_map_'),
        (12, 'f_endorsed_map_'),
        (13, 'f_threads_map_'),
        (14, 'f_comments_map_'),
    ]
    tallies = [{} for _ in columns]
    # read (and discard) the header line
    next(cin)
    # tally count and running sum per distinct value, column by column
    for row in cin:
        for tally, (col, _) in zip(tallies, columns):
            update_num_dict(row[col], tally)
    # bin each column's values and pickle the resulting generalization map
    for tally, (_, prefix) in zip(tallies, columns):
        value_map = build_bins(dict_to_list(tally), bin_size)
        dump_map(value_map, ''.join([prefix, fname_ps, '.pkl']))
    return None
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print ('Usage: python numeric_generalization_v2.py in_file bin_size')
        sys.exit(1)
    fname_in = sys.argv[1]
    # Output base name: drop the 4-character extension (e.g. '.csv') and any
    # leading directory components. split('/')[-1] is equivalent to the old
    # character-by-character stripping loop.
    fname_out = fname_in[:-4].split('/')[-1]
    bin_size = int(sys.argv[2])
    # The input file was previously opened and never closed; the context
    # manager closes it once the maps have been written.
    with open(fname_in, 'r') as f_in:
        create_value_maps(csv.reader(f_in), fname_out, bin_size)