# DSC 530 Code Snippets


In [None]:
# //**** Mapping Values
# //*** Give every distinct user a sequential integer id, starting at 1
id_map = {}
n = 0
for n, user in enumerate(people.unique(), start=1):
    id_map[user] = n
# //*** Leave n at the next unused id, as a hand-maintained counter would
n += 1

In [None]:
# //********************************************************************
# //*** Pandas Frequency from dataframe
# //*** value_counts() tallies how often each distinct value (or distinct
# //*** row, for a DataFrame) occurs; sort_index() then orders the tallies
# //*** by value instead of by count.
# //********************************************************************
df.value_counts().sort_index()



In [None]:
# //*****************************************
# //*** Build a probability mass function
# //*****************************************
# //*** Returns Series as a PMF
# //*****************************************
def build_pmf(input_series):
    """Normalize a frequency Series into a probability mass function.

    Parameters
    ----------
    input_series : pandas.Series
        Frequencies (counts) indexed by value. It is not modified.

    Returns
    -------
    pandas.Series
        A new Series with the same index in which every entry is its
        frequency divided by the sum of all frequencies. If the
        frequencies sum to zero the entries are NaN/inf rather than an
        exception.
    """
    # //*** One vectorized division works position by position, so it stays
    # //*** correct when index labels repeat and never writes floats into an
    # //*** integer Series one label at a time.
    return input_series / input_series.sum()

In [None]:
# //*** Build a Cumulative Distribution Function from a Probability Mass Function
# //*** Returns a Series
def build_cdf(input_series):
    """Turn a PMF (or any sequence of frequencies) into running totals.

    Parameters
    ----------
    input_series : pandas.Series or array-like
        Probabilities/frequencies in the order they should accumulate.
        Anything that is not already a Series is converted with
        ``pd.Series``.

    Returns
    -------
    pandas.Series
        A new Series with the same index whose entry at each position is
        the sum of all entries up to and including that position.
    """
    # //*** If input is not a pandas Series, try to convert it
    if not isinstance(input_series, pd.Series):
        input_series = pd.Series(input_series)

    # //*** cumsum accumulates by position, so repeated index labels are
    # //*** handled correctly. skipna=False keeps the running-sum behaviour
    # //*** where a NaN makes every later total NaN as well.
    return input_series.cumsum(skipna=False)

In [None]:
# //*** Get a Percentile Rank from a CDF Frequency Series.
def PercentileRank(scores, input_score):
    """Return the percentage of entries in *scores* whose label is <= *input_score*.

    Only the labels (keys) of ``scores.items()`` are compared; the stored
    values are not used. The result is rounded to zero decimal places.
    """
    # //*** Tally the labels that sit at or below the requested score
    at_or_below = sum(1 for label, _ in scores.items() if label <= input_score)

    # //*** Express the tally as a share of all entries, in percent
    return round(100.0 * at_or_below / len(scores), 0)


In [None]:
# //*** Retrieve a percentile value from a CDF.
# //*** Returns the last index value whose cumulative value does not exceed
# //*** the input parameter percentile.
def get_cdf_percentile(input_cdf,percentile):
    """Look up the index label that corresponds to *percentile* in a CDF.

    Parameters
    ----------
    input_cdf : pandas.Series
        Cumulative values indexed by the underlying variable. The scan
        stops at the first value above *percentile*, so the values are
        expected to be non-decreasing.
    percentile : float
        Threshold on the same scale as the values of *input_cdf*.

    Returns
    -------
    The last index label whose cumulative value is <= *percentile*. If even
    the first value is above *percentile*, the first label is returned. If
    no value is above it (e.g. ``percentile=1.0``), the last label is
    returned.

    Raises
    ------
    IndexError
        If *input_cdf* is empty (there is no first label to start from).
    """
    #//*** Initialize output to first value
    output = input_cdf.index[0]

    #//*** Walk the CDF until a value exceeds the percentile, remembering
    #//*** the most recent label that was still within it
    for index,value in input_cdf.items():
        if value > percentile:
            break
        output = index

    #//*** Also reached when the loop runs to the end without exceeding the
    #//*** percentile, so the function never returns None by accident
    return output


In [2]:
# //********************************************************************
# //*** Function - Build a frequency Dictionary from an input Series
# //********************************************************************
def build_frequency_dict(input_series):
    """Count how many times each value occurs in *input_series*.

    The values are visited in sorted order, so the returned dict's keys
    appear in ascending order. Each key maps to its number of occurrences.
    """
    tally = {}

    for item in input_series.sort_values():
        # //*** First sighting starts from 0, later sightings add one more
        tally[item] = tally.get(item, 0) + 1

    return tally

In [1]:
#//*********************************************************************
#//*** Quick regression model results using patsy formula.
#//*********************************************************************
#//*** input_dict kwargs:
#//*** summary (boolean) - Display a summary evaluation of the model
#//*** pvalue (boolean) - Display the p-values of the model ('pval' is accepted as an alias)
#//*** getrsquared (boolean) - returns the r**2 value of the model
#//***       (the pseudo r**2 for poisson and logit)
#//*** getpvalue (boolean) - returns the p values of the model
#//*** getmodel (boolean) - returns the fitted model
#//*** method (string) - model type to run
#//***       ols (default) - Ordinary Least Squares (linear regression)
#//***       poisson - Poisson regression
#//***       logit - logistic regression.
#//*** A single output is returned as a single object
#//*** Multiple outputs are returned as a list in the following order: rsquared, pvalue, model
#//*** When no output is requested the function returns None
#//*****************************************************************************************
def qmodel_patsy_ols(df,formula,**input_dict):
    """Fit a formula-based statsmodels regression and print/return its results.

    Parameters
    ----------
    df : pandas.DataFrame
        Data the formula is evaluated against.
    formula : str
        Patsy formula, e.g. ``"y ~ x1 + x2"``.
    **input_dict
        Options described in the comment banner above this function.

    Returns
    -------
    None, a single object, or a list
        ``None`` when statsmodels cannot be imported, when ``method`` is
        not recognised, or when no output was requested. Otherwise the
        requested outputs in the order rsquared, pvalue, model: a bare
        object for one output, a list for several.

        If a logit model cannot be built or fitted, or fewer than half of
        the rows of ``df`` make it into the model, the result is always a
        list. It holds ``999`` for a requested rsquared and
        ``pd.Series([999])`` for requested p-values; the model is never
        included.
    """
    #//*** statsmodels is imported on demand so the rest of the notebook
    #//*** does not depend on it. Missing library -> message and None.
    try:
        import statsmodels.formula.api as smf
    except ImportError:
        print(f"This Function requires the 'statsmodel' library to be installed")
        return

    #//*** Read the options; anything not supplied falls back to its default
    display_summary = input_dict.get('summary', False)
    display_pval = input_dict.get('pvalue', input_dict.get('pval', False))
    getpvalue = input_dict.get('getpvalue', False)
    getrsquared = input_dict.get('getrsquared', False)
    getmodel = input_dict.get('getmodel', False)
    method = input_dict.get('method', "ols")

    output = []
    generalError = False

    if method == "ols":
        model = smf.ols(formula=formula, data=df).fit()
    elif method == "poisson":
        model = smf.poisson(formula, data=df).fit()
    elif method == "logit":
        try:
            model = smf.logit(formula, data=df)

            #//*** model.endog holds the response values the model will be fitted on.
            #//*** If fewer than half of df's rows made it into the model
            #//*** (presumably because rows with missing data were dropped), reject the model.
            nobs = len(model.endog)

            if nobs < len(df)/2:
                generalError = True
            else:
                model = model.fit()
        except Exception:
            #//*** Deliberate best effort: any failure to build or fit the
            #//*** logit model is reported through the 999 sentinel values below
            generalError = True
    else:
        print(f"Provide a valid Method:\n method='ols' [default]\nmethod='poisson'\nmethod='logit'")
        return

    # //*** On an error return 999 for all requested values
    if generalError:
        if getrsquared:
            output.append(999)

        if getpvalue:
            output.append(pd.Series(data=[999]))

        return output

    if display_summary:
        print("==========================")
        print("Q model Quick Display")
        print("==========================")
        print_model = model.summary()
        print(f"{print_model}")

    if display_pval:
        print("==========================")
        print("Q model P Values")
        print("==========================")
        #//*** Skip the intercept while printing; formulas without an
        #//*** intercept simply have nothing to skip
        for term,p_value in model.pvalues.items():
            if term != 'Intercept':
                print(f"{term} : {p_value}")

    if getrsquared:
        #//*** OLS results expose rsquared; the discrete models (poisson,
        #//*** logit) expose a pseudo r**2 as prsquared instead
        if method == "ols":
            output.append(model.rsquared)
        else:
            output.append(model.prsquared)

    if getpvalue:
        try:
            output.append(model.pvalues.drop(index='Intercept') )
        except KeyError:
            #//*** No 'Intercept' term in the model: hand back the raw p-value array
            output.append(model.pvalues.values)

    if getmodel:
        output.append(model)

    #//*** If more than one output variable, output a list
    if len(output) > 1:
        return output
    elif len(output) == 1:
        #//*** Single elements, just return the element
        return output[0]
        
        
        

In [None]:
# //*****************************************
# //*** Loop through a series
# //*** items() yields one (index label, value) pair per entry
# //*****************************************
for pair in s.items():
    index, value = pair
    print(f"{index} {value}")

In [None]:
# //*** Bin a continuous variable pandas Series
# //*** The Series index holds the continuous variable; the values are the
# //*** amounts (e.g. counts) that get summed within each bin.
def bin_value_counts(input_series, bin_size):
    """Sum the values of *input_series* into fixed-width bins of its index.

    Parameters
    ----------
    input_series : pandas.Series
        Values to aggregate, indexed by a numeric variable.
    bin_size : number
        Width of each bin, in index units.

    Returns
    -------
    pandas.Series
        One entry per bin, indexed by the bin's lower edge (float). Bin
        ``i`` covers ``[edge[i], edge[i+1])``; the last bin collects
        everything from its lower edge upward, so no input value is
        dropped. Bins without any input hold 0. All bins are returned.
        An empty input gives an empty Series.
    """
    if len(input_series) == 0:
        return pd.Series(dtype=float)

    positions = input_series.index.to_numpy()

    # //*** build New Index: lower edges from the floor of the smallest
    # //*** index value up to the largest. floor (not truncation) keeps
    # //*** negative fractional index values inside the first bin.
    bin_index = np.arange(int(np.floor(positions.min())), int(positions.max() + 1), bin_size, float)

    # //*** digitize gives i with bin_index[i-1] <= x < bin_index[i];
    # //*** subtracting 1 turns that into the slot of the lower edge.
    # //*** Values at or above the last edge land in the last slot.
    slots = np.digitize(positions, bin_index) - 1

    # //*** Sum per slot, then insert the empty bins as 0
    totals = input_series.groupby(slots).sum().reindex(np.arange(len(bin_index)), fill_value=0)

    return pd.Series(index=bin_index, data=totals.to_numpy())


In [None]:
# //*** Matplotlib example side by side bar chart
# //*** Three series are drawn as neighbouring bars at each of n x positions
n = 5
var1 = total_employee_hours_distribution_pmf
var2 = billable_employee_hours_distribution_pmf
var3 = overhead_employee_hours_distribution_pmf
bar_width = 0.2
opacity = 0.9
index = np.arange(n)
fig, ax = plt.subplots()

# //*** One entry per bar series: (heights, bar color, legend label)
bar_series = [
    (var1, 'navy', 'Total Productivity'),
    (var2, 'darkcyan', 'Billable'),
    (var3, 'skyblue', 'Overhead'),
]
# //*** Each series is shifted right by one more bar width than the previous one
for slot, (heights, bar_color, bar_label) in enumerate(bar_series):
    ax.bar(index + (bar_width * slot), heights, bar_width, alpha=opacity, color=bar_color, label=bar_label)

# //*** Y limits hug the billable series with 5% headroom on both sides
y_low = billable_employee_hours_distribution_pmf.min() * .95
y_high = billable_employee_hours_distribution_pmf.max() * 1.05
plt.ylim(y_low, y_high)
#ax.set_xlabel('Seasons')
ax.set_ylabel('Frequencies')
plt.title(f"All Employee: Productivity Distribution")
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(g['days_of_week'])
ax.legend()
plt.show()

In [None]:
# //**** Legends automatically generate too many labels.
def deduplicate_legend(input_ax):
    """Return the legend handles and labels of *input_ax* without repeated labels.

    For every label only the first handle that carries it is kept; later
    handles with the same label are discarded. The order of first
    appearance is preserved.

    Returns a tuple ``(handles, labels)`` of two equally long lists.
    """
    # //**** Get handle and label list for the current legend
    handles, labels = input_ax.get_legend_handles_labels()

    # //*** label -> first handle seen for that label
    first_handle_for = {}
    for label, handle in zip(labels, handles):
        first_handle_for.setdefault(label, handle)

    # //*** Split the mapping back into two parallel lists
    return list(first_handle_for.values()), list(first_handle_for.keys())

   
# //*** Example: plot two error curves and build the legend from unique labels
fig,ax = plt.subplots()
# //*** ax.plot reads its positional arguments as (x, y, [format string]).
# //*** A trailing positional number is taken as a second data set and draws
# //*** an extra one-point line carrying the same label, so only x and y are
# //*** passed positionally here.
ax.plot(sample_sizes,mean_standard_errors, color='r', label='Mean Error')
ax.plot(sample_sizes,median_standard_errors, color='g', label='Median Error')
# //*** Y axis from 0 to 5% above the largest mean error
plt.ylim(0,pd.Series(mean_standard_errors).max()*1.05)
# //**** DeDuplicate handles and labels
handles,labels = deduplicate_legend(ax)
plt.legend(handles,labels )
plt.show()

In [None]:
# //*** Better example in: Week07_StoneburnerKurt_DSC530
# //*** dataframe binning from CDF
# //*** Displays a percentile plot of independent variable vs dependent variables
# //*** Dependent variable - dataframe column name - X axis, rows are binned on it and each bin's mean is plotted
# //*** Independent variable - dataframe column name - Y axis, used for CDF percentiles
# //*** input_bin_percentages = a list [.1,.3,.5] of percentages to use for percentiles.
# //*** plot_now = Boolean. True draws a basic plot with no labels. False, doesn't draw plot and allows more plt
# //***            values to be assigned after such as adding labels.
def plot_cdf_percentiles(input_df,dependent_variable,independent_variable,input_bin_percentages,plot_now=True):
    """Plot percentile curves of one column against binned means of another.

    Rows of *input_df* are grouped into bins of *dependent_variable*. For
    every bin a CDF of *independent_variable* is built with the file's
    ``build_pmf``/``build_cdf`` helpers, and for every entry of
    *input_bin_percentages* one line is drawn: x = mean of
    *dependent_variable* per bin, y = the value ``get_cdf_percentile``
    returns for that bin's CDF. Each line is labelled ``"<percent>th"``.

    Nothing is returned; the lines are drawn on the current matplotlib
    axes and shown immediately when *plot_now* is true.
    """
    # //*** Get min and Max values to set the limits of the group by bins
    # //*** Converting to integers adds headroom (extra space) to the floor
    # //*** Max + 1 gives a little extra room for Max
    minVal = int(input_df[dependent_variable].min())
    maxVal = int(input_df[dependent_variable].max())+1

    # //*** Build bin edges from minVal up to maxVal.
    # //*** NOTE(review): the step between edges is the NUMBER of requested
    # //*** percentiles, not a bin count - confirm this is the intended width.
    bins = np.arange(minVal, maxVal, len(input_bin_percentages))

    # //*** generate indices based on bins
    indices = np.digitize(input_df[dependent_variable], bins)

    # //*** One pass over the groups collects both per-bin results:
    # //***   - the mean of the dependent variable (x coordinate)
    # //***   - the CDF of the independent variable
    # //***     (histogram -> PMF -> CDF)
    dependent_variable_mean = []
    CDFs = []
    for _, group in input_df.groupby(indices):
        dependent_variable_mean.append(group[dependent_variable].mean())
        histogram = group[independent_variable].value_counts().sort_index()
        CDFs.append( build_cdf( build_pmf( histogram ) ) )

    for x in input_bin_percentages:
        # //*** Value at this percentile in every bin's CDF
        weight_percentiles = [get_cdf_percentile(cdf,x) for cdf in CDFs]
        # //*** plt.plot reads positional arguments as (x, y, [format string]);
        # //*** an extra positional number would be drawn as a second,
        # //*** one-point data set, so only x and y are passed positionally.
        plt.plot(dependent_variable_mean,weight_percentiles, label=f"{int(x*100)}th")

    # //*** Every line carries its own label, so the legend can use those
    # //*** directly and each label stays attached to the right line.
    plt.legend()
    plt.xlim(minVal,maxVal)

    if plot_now:
        plt.show()