In [15]:
import pandas as pd
import numpy as np

### FUNCTION INVENTORY ###

In [16]:
def verticalMask(df, drop_cols = ['Program', 'Level', 'Number']):
    '''Asssuming the dataframe only contains 1 geography,
    iterrate through each column to perform vertical masking
    :param df: Dataframe
    :param drop_cols: list of columns to NOT be masked/ignored
    '''
    d = df.copy()
    #Iteratate through every report column
    cols = list(set(d.columns.tolist()) - set(drop_cols))
    
    for c in cols:
        print("...Processing",c)
        #Only mask if a single number in that column is -999
#         print(np.sum(d.loc[d[c] < 0, c]))
        if sum(d.loc[d[c] < 0, c]) == -999: 
            #Find the next smallest number and change it to 20
            next_min_idx = np.argmin(d.loc[d[c] >= 0 , c])
            next_min = np.min(d.loc[d[c] >= 0 , c])
            print("The next smallest value for column {} is at index {} of value {}".format(c,next_min_idx,next_min))
            #Secondary mask 
            d.loc[next_min_idx, c] = -100
    return d

In [17]:
def secondMask(row):
    '''
    Pass a vector in, mask if only one -999 value
    :param row: a numpy vector
    '''
    if len(row[row ==  -999]) == 1:
#         print("Row before:\n", row)
        next_min = np.min(row[row >= 0])
        next_min_idx = np.where(row==next_min)[0][0]

        print("Next smallest at index {} of value {}".format(next_min_idx, next_min))
        #Mask
        row[next_min_idx] = -100
#         print("Row after:\n", row)
    return row

In [18]:
def horizontalMask(df, grp:dict):
    '''
    Leveraging the secondMask function, change the 
    group of related column names, then perform masking 
    within the parameters for each row in group. 
    :param df: Pandas dataframe
    :grp: dictionary of gorup names
    '''
    d = df.copy()
    for k,v in grp.items():
        d.loc[:,v] = d.loc[:,v].apply(secondMask, axis=1)
    return d

In [19]:
def splitReport(df:pd.DataFrame, geo=["State Assembly Total","State Senate Total"
                                      ,"U.S. Congress Total"
                                     ,"County Total"]):
    '''
    Accept a DF that is the original Excel report,
    split into N sub-reports based on geographies as 
    lited by the geo parameters
    '''
    #We can identify each section of the report here...
    indices = np.zeros(len(geo),dtype='int')
    split_dfs = []
    for idx in range(len(geo)): 
        indices[idx] = df.index[df.Number == geo[idx]].tolist()[0]
        if idx == 0:
            split_dfs.append(df.iloc[:indices[idx]+1])
        else:
            split_dfs.append(df.iloc[indices[idx-1]+1:indices[idx]+1])
    return split_dfs

In [49]:
def batchMask(df, grp):
    masked_df = splitReport(df)
    final_df = pd.DataFrame()
    for i in range(len(masked_df)):
        #Horizontal masking
        masked_df[i] = horizontalMask(masked_df[i], grp)
        #Vertical masking
        masked_df[i] = verticalMask(masked_df[i])
        #Concatenate
        final_df = pd.concat([final_df, masked_df[i]])
    
    if ! assert df.shape == final_df.shape: 
        print("Final dataframe does NOT preserve original shape!!")
    return final_df

SyntaxError: invalid syntax (<ipython-input-49-30d06e9310e1>, line 12)

In [50]:
group_dict = dict()
group_dict["race"] = ['Other/Unknown','Black', 'White', 'Hispanic', 'Asian_PI', 'Native_American']
group_dict["gender"] = ['Female','Male', 'Unknown/Other Gender']
group_dict["prog"] = ['1 program', '2 programs', '3 programs','4 programs', '5+ programs']
group_dict["agy"] = ['1 department', '2 departments','3 departments', '4 departments']
group_dict["age1"] = ['17 and Under', '18 and Over', '18-64','65 and Over']
group_dict["age2"] = ['age 0', '1 to 2', '3 to 5', '6 to 10', '11 to 15','16 to 17', '18 to 21']


## PRODUCTION TEST

In [51]:
d = pd.read_excel("CWS_CMS_Dashboard_16.xlsx")

In [52]:
r = batchMask(d, group_dict)

Next smallest at index 0 of value 74
Next smallest at index 0 of value 43
Next smallest at index 0 of value 74
Next smallest at index 0 of value 26
Next smallest at index 0 of value 20
Next smallest at index 0 of value 51
Next smallest at index 4 of value 43
Next smallest at index 0 of value 176
Next smallest at index 0 of value 130
Next smallest at index 0 of value 102
Next smallest at index 4 of value 45
Next smallest at index 0 of value 137
Next smallest at index 0 of value 176
Next smallest at index 0 of value 120
Next smallest at index 0 of value 57
Next smallest at index 4 of value 109
Next smallest at index 0 of value 44
Next smallest at index 4 of value 139
Next smallest at index 0 of value 36
Next smallest at index 0 of value 25
Next smallest at index 0 of value 51
Next smallest at index 0 of value 38
Next smallest at index 0 of value 24
Next smallest at index 1 of value 2996
Next smallest at index 1 of value 1699
Next smallest at index 1 of value 2760
Next smallest at index 0

Next smallest at index 4 of value 77
Next smallest at index 1 of value 17
Next smallest at index 4 of value 13
Next smallest at index 4 of value 12
Next smallest at index 1 of value 0
Next smallest at index 4 of value 100
Next smallest at index 0 of value 227
Next smallest at index 0 of value 1281
Next smallest at index 0 of value 270
Next smallest at index 0 of value 810
Next smallest at index 0 of value 262
Next smallest at index 0 of value 1301
Next smallest at index 0 of value 983
Next smallest at index 0 of value 1109
Next smallest at index 0 of value 371
Next smallest at index 0 of value 225
Next smallest at index 0 of value 938
Next smallest at index 0 of value 380
Next smallest at index 0 of value 99
Next smallest at index 0 of value 711
Next smallest at index 0 of value 82
Next smallest at index 0 of value 460
Next smallest at index 0 of value 250
Next smallest at index 0 of value 286
Next smallest at index 1 of value 133
Next smallest at index 0 of value 908
Next smallest at 

In [54]:
r.to_excel("Masked_CWS_CMS_Dashboard_16.xlsx")