# 8.1.1 Math Functions with Numpy/SciPy: Speed-Test

######

#### Create a program that does the following:

##### a. Create an ndarray containing 1,000,000 random numbers:

In [1]:
import numpy as np
# from Tools_Package.timer import timer, Timer
# import random

# Set Seed:
np.random.seed(0)

# Create the array
ndarray = np.random.randint(100000000, size = 1000000) 

# Measure the array length
print(f'Number of elements = {len(ndarray)}')

Number of elements = 1000000


##### b. Calculate the mean, median, mode, and standard deviation of the array. Compare the time spent running mean and standard deviation, 
##### vs. the previously implemented versions (from level 1). Is there a significant speedup? Why?

In [2]:
from Tools_Package.timer import timer, Timer # Set up for comparing time taken
import scipy.stats as spstats # for mode
import statistics as stats
import math as math

######################################## Mode Functions ########################################
# Mode (with ndArray)
def freqMapNDA(values):
    '''
    Returns a list of 2 arrays, where array 1 = unique items and array 2 = frequency.
    '''
    
    elts, frq = np.unique(values, return_counts=True) # return_counts=True: returns number of times element is repeated in the list.
    return elts, frq

def modeNDA(values):
    '''
    Calculates Mode (Modified for numpy)
    '''
    
    elts, frq = freqMapNDA(values)

    index = np.argmax((elts, frq), axis = 1) # Indices of largest values.
    # criteria = dictFreqmap[dictFreqmap in dictFreqmapIndex]

    a = np.extract(frq == max(frq), elts)
    b = np.extract(frq == max(frq), frq)
    
    result = list(zip(a, b))
    return result


# Mode (Original Code)
def freqMap(values):
    map={}
    for v in values:
            if not map.get(v):
                map[v] = 1
            else:
                map[v]=map[v]+1
                
    return map

def mode(values):
    dictFreqmap = freqMap(values)

    # Initial Code: Does not account for Multi-modal
    # sorted_dictFreqmap = dict(sorted(dictFreqmap.items(), key = itemgetter(1), reverse = True))

    # Final Code: Accounts for multi-modal
    maxval = max(dictFreqmap.values())
    dictFreqmap_Final = [(a, b) for a, b in dictFreqmap.items() if b == maxval]

    return dictFreqmap_Final
######################################## Mode Functions ########################################

#################################### Other Level 1 Functions ###################################
def myAveFunction(numberList):
    '''
    Calculates mean
    '''
    
    if not(all(type(value) != str for value in numberList)):
        print('This function only accepts lists containing numbers.')
    else:
        ttl = 0.0
        for num in numberList:
            ttl = ttl + num

        avg = ttl / len(numberList)
        # print('Average is:' + str(avg)) -- Up to caller of the function to decide what to do with the result.
        return avg

def mystddev(numberList, dof = 1):
    '''
    Calculates std dev
    '''
    print('\nThis function calculates the variance of a passed-in list:')

    if not(all(type(value) != str for value in numberList)):
        print('This function only accepts lists containing numbers.')
    else:
        average = myAveFunction(numberList)
        numerator = 0
        
        for num in numberList:
            numerator = (num - average)**2 + numerator

        var = numerator / (len(numberList) - dof)
        return math.sqrt(var)
#################################### Other Level 1 Functions ###################################

# NUMPY:
print(f'\nNUMPY Summary Statistics:')
# (1) Calculate mean
with Timer('myTimer'): # Compare time spent running mean and SD
    print(f'Calculated Mean: {ndarray.mean()}')
print(f'Calculated Mean (Method 2): {np.mean(ndarray)}')

# (2) Calculate median
print(f'Calculated Median: {np.median(ndarray)}') # No Method 1

# (3) Calculate mode (remember an array can be multi-modal especially given the number of elts)
## ORIGINAL APPROACH: print(f'\nCalculated Mode: {print(spstats.mode(ndarray))}') # Uses SciPy
## PREFERRED APPROACH: MODIFY MODE FUNCTION to optimize for numpy arrays
print(f'Calculated Mode: {modeNDA(ndarray)}')


# (4) Calculate standard deviation
with Timer('myTimer'): # Compare time spent running mean and SD
    print(f'Calculated SD: {np.std(ndarray)}') # No Method 1


# FROM LEVEL 1:
print(f'\nSummary Statistics (From Level 1):')
# (1) Calculate mean
with Timer('myTimer'): # Compare time spent running mean and SD
    print(f'Calculated Mean: {myAveFunction(ndarray)}')

# (2) Calculate median
print(f'Calculated Median: {stats.median(ndarray)}') # No Method 1

# (3) Calculate mode (remember an array can be multi-modal especially given the number of elts)
## ORIGINAL APPROACH: print(f'\nCalculated Mode: {print(spstats.mode(ndarray))}') # Uses SciPy
## PREFERRED APPROACH: MODIFY MODE FUNCTION to optimize for numpy arrays
print(f'Calculated Mode: {mode(ndarray)}')

# (4) Calculate standard deviation
with Timer('myTimer'): # Compare time spent running mean and SD
    print(f'\nCalculated SD: {mystddev(ndarray)}') # No Method 1

# Q. Is there a significant speedup? Why?
# A. Yes. This is because numpy uses vectorized calculations for the math formulas which rely on C, which is much faster than native python formulas/using other packages.




NUMPY Summary Statistics:
Calculated Mean: 50011868.328851
Calculated Mean (Method 2): 50011868.328851
Calculated Median: 49993993.5
Calculated Mode: [(4093577, 3), (6053978, 3), (9618465, 3), (15119109, 3), (26583719, 3), (27243659, 3), (27810631, 3), (28341367, 3), (39040889, 3), (44038140, 3), (46770978, 3), (51927366, 3), (52713666, 3), (59854431, 3), (59877001, 3), (63219613, 3), (68288915, 3), (72867228, 3), (74331526, 3), (79858304, 3), (86989666, 3), (89078864, 3), (99963822, 3)]




Calculated SD: 28860092.49194118

Summary Statistics (From Level 1):




Calculated Mean: 50011868.328851
Calculated Median: 49993993.5
Calculated Mode: [(4093577, 3), (63219613, 3), (52713666, 3), (74331526, 3), (26583719, 3), (59877001, 3), (99963822, 3), (27810631, 3), (27243659, 3), (86989666, 3), (9618465, 3), (51927366, 3), (68288915, 3), (79858304, 3), (89078864, 3), (59854431, 3), (6053978, 3), (44038140, 3), (46770978, 3), (72867228, 3), (28341367, 3), (39040889, 3), (15119109, 3)]

This function calculates the variance of a passed-in list:





Calculated SD: 28860106.921998


##### c. Calculate the 10, 20, 30, ..., 100 quantiles of the array.

In [3]:
for item in list(range(10, 101, 10)):
    print(f'{item}th quantile of the array = {np.quantile(ndarray, item/100)}')

10th quantile of the array = 9999588.5
20th quantile of the array = 20031686.8
30th quantile of the array = 30040834.7
40th quantile of the array = 40026548.60000002
50th quantile of the array = 49993993.5
60th quantile of the array = 60002358.0
70th quantile of the array = 69996841.49999999
80th quantile of the array = 79992810.80000001
90th quantile of the array = 89978057.3
100th quantile of the array = 99999924.0
