In [24]:
import random as rand
from math import *
import numpy as np
import copy
import sys

Create a random dataset with some character to it - in this case the sum of n random numbers is similar to a normal distribution. The values range from 0 to 1, but the functions could work with whatever arbitrary range.

This corresponds to the set of observations in a given week - so it could be hourly temperature measurements... perhaps 7 * 24 = 168 of them, or if there are data gaps then fewer.

In [25]:
n = 168 #number of hours in a week
rand_reps = 20
test_floats = [sum([rand.random() for i in range(rand_reps)])/rand_reps for i in range(n)]
#print(test_floats)

Describe the way that the histogram is built. It is defined between a min and a max, and has a specific bin width. So if the minimum was 1, and the bin_width 0.5, any value between 1 and 1.5 would fall into the first bin, while values between 1.5 and 2 would fall into the second bin. If the maximum was 2.2, there would be three bins, the third could be thought of as ranging between 2 and 2.2, but effectively ranges from 2 to 2.5. Values exactly on a bin_boundary would be assigned to the upper bin, so 1.5 would be in the 1.5 to 2 bin.

bin_boundaries are not actually used, just included for clarity. max is also not used - it's not prescriptive, except when determining how many bins to generate, and as alluded to above those could go higher than the max if the bin boundaries don't align with the max.

In [26]:
bin_width = 0.03
minimum = 0.0
maximum = 1.0
n_bins = int(ceil((maximum-minimum)/bin_width))
bin_boundaries = [minimum+i*bin_width for i in range(n_bins+1)]
print(bin_width, minimum, maximum, bin_boundaries)

0.03 0.0 1.0 [0.0, 0.03, 0.06, 0.09, 0.12, 0.15, 0.18, 0.21, 0.24, 0.27, 0.3, 0.32999999999999996, 0.36, 0.39, 0.42, 0.44999999999999996, 0.48, 0.51, 0.54, 0.57, 0.6, 0.63, 0.6599999999999999, 0.69, 0.72, 0.75, 0.78, 0.8099999999999999, 0.84, 0.87, 0.8999999999999999, 0.9299999999999999, 0.96, 0.99, 1.02]


Build a histogram data structure as a dictionary. Could be built object-oriented, but this is succinct in this context.

In [27]:
histogram_list = [0 for i in range(n_bins)] #Could be a numpy int array or something like that.
hist_settings = {
    "bin_width":bin_width,
    "minimum":minimum,
    "maximum":maximum,
    "n_bins":n_bins,
    "bin_boundaries":bin_boundaries,
    "histogram_list":histogram_list
}

Take the dataset and put it into the histogram - this is the critical "pre-processing" step that could be conducted at the time of download. Once this is done, the raw data theoretically doesn't need to be revisited. However, if we changed something like the bin-width, then it would have to be rerun.

The number of bins we use is a critical decision with a tradeoff on accuracy vs. data. The raw data includes 168 floats. The histogram might include more like 20 char values (given there's only 168 hours in a week, we never have a count of more than 256 in a single histogram bin). So 32 * 168 = 5376 bits vs 8 * 20 = 160 bits means 1/30th the memory footprint. Note however that 20 bins may be a bit limiting, since the histogram probably needs to span all possible values for all station locations. So temperatures might run from -100 F to + 150 F. That's 12.5 F per bin, so if there's a week where the temperature waffles between 30 and 40, all the data would fall in one bin. Is that a problem? Maybe not - a simple triangle distribution may be enough detail. Also we could pick the minimum and bin-width so that a bin boundary lay exactly at freezing, since that's a sort of important number to caputre accurately. Put another way, if we had a bin-boundary at 32 F, and 12.5 F bins, we couldnt' tell the difference between 32 F and 44.4 F. We would in effect assume all temperatures in that range are 38.25 F.

I don't think there's a problem assigning different numbers of bins to different parameters. Also we could apply some sort of transformation to vary precision, such as resolving fine differences in horizontal visibility in thick fog, but lumping together huge ranges for clear days (5 mile visibility isn't very different from 20 mile visibility, but 100 feet visibility is quite different from 500 feet.) To capture this possibility I included the optional transform function that can be passed to the populate function below, but to really use it the function needs to be more aware of what's going on - see object oriented version below.

In [28]:
def identity(value):
    return value

def populate_histogram(data, bin_width, minimum, maximum, n_bins, bin_boundaries, histogram_list, transform = identity):
    for value in data:
        value = transform(value)
        if value < minimum: list_index = 0
        elif value > maximum: list_index = n_bins
        else: list_index = int(floor((value-minimum)/bin_width))
        histogram_list[list_index] += 1
    return {
        "bin_width":bin_width,
        "minimum":minimum,
        "maximum":maximum,
        "n_bins":n_bins,
        "bin_boundaries":bin_boundaries,
        "histogram_list":histogram_list
    }


def log_transform(value): #Resolve small numbers more precisely than large ones
    return log(value)

histogram = populate_histogram(test_floats, **hist_settings)
print(histogram)

{'bin_width': 0.03, 'minimum': 0.0, 'maximum': 1.0, 'n_bins': 34, 'bin_boundaries': [0.0, 0.03, 0.06, 0.09, 0.12, 0.15, 0.18, 0.21, 0.24, 0.27, 0.3, 0.32999999999999996, 0.36, 0.39, 0.42, 0.44999999999999996, 0.48, 0.51, 0.54, 0.57, 0.6, 0.63, 0.6599999999999999, 0.69, 0.72, 0.75, 0.78, 0.8099999999999999, 0.84, 0.87, 0.8999999999999999, 0.9299999999999999, 0.96, 0.99, 1.02], 'histogram_list': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 7, 5, 25, 31, 26, 32, 20, 8, 6, 3, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [29]:
def histogram_integral_approximate(threshold, bin_width, minimum, maximum, n_bins, bin_boundaries, histogram_list):
    #Integrates from the threshold to infinity, including all values that share a bin with the threshold
    if threshold < minimum: return sum(histogram_list)
    if threshold > maximum: return 0
    threshold_bin = int(floor((threshold-minimum)/bin_width))
    total = 0
    for i in range(threshold_bin, n_bins): total+=histogram_list[i]
    return total

def histogram_integral_precise(threshold, bin_width, minimum, maximum, n_bins, bin_boundaries, histogram_list):
    #Integrates from the threshold to infinity, with linear interpolation within the bin that includes the threshold
    if threshold < minimum: return 0
    if threshold > maximum: return sum(histogram_list)
    threshold_bin = int(floor((threshold-minimum)/bin_width))
    total = int(round(histogram_list[threshold_bin] * (((threshold_bin+1)*bin_width-threshold - minimum)))) #Not sure why I have to force this to int - shouldn't it stay an int since items in histogram_list are ints?
    for i in range(threshold_bin+1, n_bins): total += histogram_list[i]
    return total

threshold = 0.5
print(histogram_integral_approximate(threshold, **histogram))
print(histogram_integral_precise(threshold, **histogram))
#approximate will always overestimate relative to precise

96
70


## Object-oriented version

This lays out the basic functionality, but to really see how this works it's better to see the data in the context provided by object-oriented code. Starting to lay that out using compact numpy arrays, with smart handling of transformations.

 First, bring in some code for unit handling:

In [30]:
# Function smart_units takes a string (or number - which will be retruned unchanged as a float) and converts it into a float in internal units
# internal units: meters, seconds, meters / second, cubic meters, radians for angles, celcius for temperatures, proportion (0 to 1) for proportions,
# cubic meters per second for volumetric rate, I think that's it.

#All these should be lower case
m_names = ['m', 'meters', 'meter']
km_names = ['km', 'kilometers', 'kilometer']
mi_names = ['mi', 'miles', 'mile']
NMi_names = ['nmi', 'nautical miles']
ft_names = ['ft', 'feet', 'foot']
in_names = ['in', 'inch', 'inches']
ms_names = ['m/s', 'meters per second']
mph_names = ['mph', 'miles per hour']
knots_names = ['knots', 'knot', 'nautical miles per hour', 'kts']
kph_names = ['kph', 'kilometers per hour', 'km/hr']
s_names = ['s', 'seconds', 'second']
min_names = ['min', 'minute', 'minutes']
hour_names = ['hr', 'hrs', 'hour', 'hours']
day_names = ['day','days']
year_names = ['year','yr','years']
c_names = ['c', 'celsius', '°c']
f_names = ['f', 'fahrenheit', '°f']
k_names = ['k', 'kelvin', '°k']
prop_names = ['proportion','p','prop']
percent_names = ['%','percent']
m3_names = ['m^3', 'm3', 'cubic meters', 'cubic meter']
l_names = ['l','liter','liters']
oil_bbl_names = ['bbl','barrel','bbls','barrels']
gal_names = ['gallons', 'gallon', 'gal']
m3_per_second_names = ['m^3/s', 'm^3 / s', 'm3ps', 'cubic meters per second']
liters_per_second_names = ['l/s','liters per second']
gallons_per_minute_names = ['gal/min', 'gallons per minute', 'gpm']
oil_barrels_per_hours_names = ['bbls/hr', 'bbl/hr', 'barrels per hour', 'bph']
degree_names = ['degrees', '°', 'deg']
radian_names = ['radians']

zero = [0,0.0,'0','zero','ZERO','Zero','0.0']

bad_chars = ' ,(){}[]\t\n' #To be stripped


def smart_units(input_string, verbose = False):
    try: return float(input_string) #If it's just a number, trust that it is what it's supposed to be.
    except ValueError: pass
    input_string.strip(bad_chars)
    if not verbose and input_string in zero: return 0.0
    elif input_string == 'infinite': return sys.float_info.max #maximum float value
    else:
        try: value, units = tuple(input_string.split(' ', 1))
        except ValueError: raise NameError('Not a usable string for verbose smart_units - must include units:',input_string)
        units.strip(bad_chars)
        units = units.lower()
        if verbose:
            if units in m_names + ft_names + in_names + km_names + NMi_names + mi_names: return length_m(value, units), 'length', "meters"
            elif units in ms_names + mph_names + knots_names + kph_names: return velocity_m_s(value, units), "velocity", "meters per second"
            elif units in s_names + min_names + hour_names + day_names + year_names: return time_s(value, units), "time", "seconds"
            elif units in m3_names + l_names + oil_bbl_names + gal_names: return volume_m3(value, units), "volume", "cubic meters"
            elif units in c_names + f_names + k_names: return temperature(value, units), "temperature", "Celsius"
            elif units in m3_per_second_names + liters_per_second_names + gallons_per_minute_names + oil_barrels_per_hours_names: return discharge_m3_s(value, units), "discharge", "m^3/s"
            elif units in prop_names + percent_names: return proportion(value, units), "proportion", "non-dimensional"
            elif units in degree_names + radian_names: return angle_rad(value, units), "angle", "radians"
            else: raise NameError('Unknown text string for smart_units: {0}\nNote that there has to be a space between the number and the unit, like "5 °F" not "5°F".'.format(input_string))
        else:
            if units in m_names + ft_names + in_names + km_names + NMi_names + mi_names: return length_m(value, units)
            elif units in ms_names + mph_names + knots_names + kph_names: return velocity_m_s(value, units)
            elif units in s_names + min_names + hour_names + day_names + year_names: return time_s(value, units)
            elif units in m3_names + l_names + oil_bbl_names + gal_names: return volume_m3(value, units)
            elif units in c_names + f_names + k_names: return temperature(value, units)
            elif units in m3_per_second_names + liters_per_second_names + gallons_per_minute_names + oil_barrels_per_hours_names: return discharge_m3_s(value, units)
            elif units in prop_names + percent_names: return proportion(value, units)
            elif units in degree_names + radian_names: return angle_rad(value, units)
            else: raise NameError('Unknown text string for smart_units: {0}\nNote that there has to be a space between the number and the unit, like "5 °F" not "5°F".'.format(input_string))

def length_m(value, units=None):
    units.strip(bad_chars)
    try: value = float(value)
    except TypeError:
        value.strip(bad_chars)
        if value == 'infinite': return sys.float_info.max #maximum float value
        elif value in zero: return 0.0
        value = float(value)
    if units.lower() in m_names: return value
    elif units.lower() in ft_names: return value * 0.3048
    elif units.lower() in in_names: return value * 0.0254
    elif units.lower() in km_names: return value * 1000
    elif units.lower() in mi_names: return value * 1609.34
    elif units.lower() in NMi_names: return value * 1852
    else: raise NameError('Unknown length units '+units)

def velocity_m_s(value, units=None):
    units.strip(bad_chars)
    try: value = float(value)
    except TypeError:
        value.strip(bad_chars)
        if value == 'infinite': return sys.float_info.max #maximum float value
        elif value in zero: return 0.0
        value = float(value)
    if units.lower() in ms_names: return value
    elif units.lower() in mph_names : return value * 0.44704
    elif units.lower() in knots_names: return value * 0.514444444
    elif units.lower() in kph_names: return value * 0.277778
    else: raise NameError('Unknown velocity units '+units)

def time_s(value, units=None): #returns a value in seconds
    units.strip(bad_chars)
    try: value = float(value)
    except TypeError:
        value.strip(bad_chars)
        if value == 'infinite': return sys.float_info.max #maximum float value
        elif value in zero: return 0.0
        value = float(value)
    if units.lower() in s_names: return value
    elif units.lower() in min_names: return 60.0 * value
    elif units.lower() in hour_names: return 3600.0 * value
    elif units.lower() in day_names: return 86400.0 * value
    elif units.lower() in year_names: return 366.0*86400*value #assumes a leap-year, so this is maximum year-length rather than true year length
    else: raise NameError('Unknown period units: '+units)

def volume_m3(value, units=None):
    units.strip(bad_chars)
    try: value = float(value)
    except TypeError:
        value.strip(bad_chars)
        if value == 'infinite': return sys.float_info.max #maximum float value
        elif value in zero: return 0.0
        value = float(value)
    if units.lower() in m3_names: return value
    elif units.lower() in l_names: return value / 1000
    elif units.lower() in oil_bbl_names: return value * .158987
    elif units.lower() in gal_names: return value * .00378541
    else: raise NameError('Unknown volume units: '+units)

def discharge_m3_s(value, units=None):
    units.strip(bad_chars)
    try: value = float(value)
    except TypeError:
        value.strip(bad_chars)
        if value == 'infinite': return sys.float_info.max #maximum float value
        elif value in zero: return 0.0
        value = float(value)
    if units.lower() in m3_per_second_names: return value
    elif units.lower() in liters_per_second_names: return value / 1000
    elif units.lower() in gallons_per_minute_names: return value * 0.0000630902
    elif units.lower() in oil_barrels_per_hours_names: return value * 0.0000441631
    else: raise NameError('Unknown discharge units: '+units)


def temperature(value, units=None): #retunrs a value in celsius
    units.strip(bad_chars)
    try: value = float(value)
    except TypeError:
        value.strip(bad_chars)
        if value == 'infinite': return sys.float_info.max #maximum float value
        value = float(value)
    if units.lower() in c_names: return value
    elif units.lower() in f_names: return (value - 32) / 1.8
    elif units.lower() in k_names: return value - 273.15
    else: raise NameError('Unknown temperature units: '+units)

def proportion(value, units=None): #returns a non-dimensional value from zero to one
    units.strip(bad_chars)
    try: value = float(value)
    except TypeError:
        value.strip(bad_chars)
        if value == 'infinite': return sys.float_info.max #maximum float value
        elif value in zero: return 0.0
        value = float(value)
    if units.lower() in prop_names: return value
    elif units.lower() in percent_names: return value/100
    else: raise NameError('Unknown proportion units: '+units)

def angle_rad(value, units=None):
    units.strip(bad_chars)
    try: value = float(value)
    except TypeError:
        value.strip(bad_chars)
        if value == 'infinite': return sys.float_info.max #maximum float value
        elif value in zero: return 0.0
        value = float(value)
    if units.lower() in degree_names: return radians(value)
    elif units.lower() in radian_names: return value
    else: raise NameError('Unknown angle units: '+units)


First a function to generate some random test data for a week. Would be good to make this so it would generate different random distributions for testing.

In [31]:
def random_central_variable_1_week(n = 168, how_extreme = 5):
    return [sum([rand.random() for i in range(how_extreme)])/rand_reps for i in range(n)]

def random_exponential_variable_1_week(n = 168, how_extreme = 5):
    return [rand.random()**how_extreme for i in range(n)]

def random_exponential_zero_common_1_week(n = 168, how_extreme = 5):
    value_list = []
    p_zero = 0.9
    for i in range(n):
        if rand.random() < p_zero: value_list.append(0.0)
        else: value_list.append(rand.random()**how_extreme)
    return value_list

def rescale_random_0_1_to_range(value_list, minimum, maximum):
    total_range = maximum-minimum
    for i,value in enumerate(value_list):
        value_list[i] = value*total_range + minimum
    return value_list #Why do I have to return this?

def make_weeks(n_weeks, rand_function, minimum_text, maximum_text, how_extreme = 5):
    return [rescale_random_0_1_to_range(rand_function(168, how_extreme),smart_units(minimum_text),smart_units(maximum_text)) for i in range(n_weeks)]

Some functions to transform data that's not well partitioned on a histogram with linearly distributed bins. Each function contains its own inverse.

In [32]:
#Transforms
def identity(value, inverse=False): #Identity is its own inverse
    return value

def log_value(value, inverse=False):
    if inverse: return e**value #By default log is the natural log, base e, which is provided by math
    else: return log(value)

def sqrt_value(value, inverse=False):
    if inverse: return value*value
    else: return sqrt(value)

A place for utility functions

In [33]:
#Utility functions
def rescale_metric(metric_array):
    minimum = min(metric_array)
    maximum = max(metric_array)
    raw_range = maximum-minimum
    if raw_range > 0:
        for i,value in enumerate(metric_array):
            metric_array[i] = (value - minimum)/raw_range
    else:
        for i in range(len(metric_array)):
            metric_array[i] = 0.0
    #Array should be passed by reference - no return necessary

Now a set of functions that generate a metric from a specific histogram, with some given settings. All these are passed the same variables so that later we can call them knowing what they'll want, even if they don't use it.

In [34]:
#Metric functions
def count_is_above(histogram_list, minimum, maximum, n_bins, bin_width, threshold, transform = identity):
    #Integrates from the threshold to infinity, with linear interpolation within the bin that includes the threshold
    threshold = transform(threshold) #This means linear interpolation in the transformed space, which I think is good - maybe needs a little thinking
    if threshold < minimum: return sum(histogram_list)
    if threshold > maximum: return 0
    threshold_bin = int(floor((threshold-minimum)/bin_width))
    total = int(round(histogram_list[threshold_bin] * (((threshold_bin+1)*bin_width - threshold - minimum)/bin_width))) #Not sure why I have to force this to int - shouldn't it stay an int since items in histogram_list are ints?
    for i in range(threshold_bin+1, n_bins): total += histogram_list[i]
    return total

def count_is_below(histogram_list, minimum, maximum, n_bins, bin_width, threshold, transform = identity):
    #Integrates from the threshold to infinity, with linear interpolation within the bin that includes the threshold
    threshold = transform(threshold)
    if threshold < minimum: return 0
    if threshold > maximum: return sum(histogram_list)
    threshold_bin = int(floor((threshold-minimum)/bin_width))
    total = int(round(histogram_list[threshold_bin] * ((threshold - minimum - threshold_bin*bin_width)/bin_width))) #Not sure why I have to force this to int - shouldn't it stay an int since items in histogram_list are ints?
    for i in range(0, threshold_bin): total += histogram_list[i]
    return total

def median(histogram_list, minimum, maximum, n_bins, bin_width, threshold = None, transform = identity):
    #Do all the math in the transformed space, then un-transform it when returning the result
    total = sum(histogram_list)
    median_rank = float(total)/2
    partial_sum = 0
    prev_partial_sum = 0
    bin = 0
    while partial_sum < median_rank:
        prev_partial_sum = partial_sum
        partial_sum += histogram_list[bin]
        bin += 1
    untransformed_median = bin_width * ((float(partial_sum) - median_rank)/(partial_sum - prev_partial_sum)+ bin - 1) - minimum
    return transform(untransformed_median, inverse = True)

def sum_up(histogram_list, minimum, maximum, n_bins, bin_width, threshold = None, transform = identity):
    total = 0.0
    current = minimum + bin_width/2
    for i in range(n_bins):
        total += transform(current, inverse=True) * histogram_list[i] #A subtlety here is that we assume the transformation is more than a convenience - it's a meaningful property of the system, reflecting how values are distributed between bin boundaries
        i+=1
        current += bin_width
    return total

Now settings - the variable settings are probably a universal constant, whereas the metric settings reflect a single user's preferences. In this case, they match the example wireframe I sketched out.

In [35]:
#Settings
variable_settings = {
    'wind'       :{'minimum':0,'maximum':'150 mph','bin_width':'5 mph'},
    'gust'       :{'minimum':0,'maximum':'150 mph','bin_width':'5 mph'},
    'temperature':{'minimum':'-50 C', 'maximum':'70 C', 'bin_width':'5 C'},
    'dir'        :{'minimum':0, 'maximum':'360 degrees', 'bin_width':'10 degrees'},
    'hvis'       :{'minimum':'5 meters', 'maximum':'100 miles', 'transform':log_value, 'n_bins':20}, #With log transform it seems better to define it based on the number of bins rather than the bin_width
    'ceiling'    :{'minimum':'100 meters', 'maximum':'100 miles', 'transform':log_value, 'n_bins':20},
    'cover'      :{'minimum':0, 'maximum':'100 %', 'bin_width':'10 %'},
    'rain'       :{'minimum':0, 'maximum':'1 foot', 'transform':sqrt_value, 'n_bins':32}, #Ranges that extend to zero can't be transformed with log - use sqrt instead (but not for negative numbers)
    'humidity'   :{'minimum':0, 'maximum':'100 %', 'bin_width':'5 %'}
}

#Ultimately need some code to ensure that the weights add to 1
metric_settings = {
    'temperature':{'function':count_is_above, 'threshold':'62 F', 'weight': 0.42},
    'rain'       :{'function':sum_up, 'weight':0.2},
    'cover'      :{'function':median, 'weight':0.38}
}

vis_settings = { #These should be between 0 and 1
    'low_medium':0.08,
    'medium_high':0.15
}

Finally make some classes for a single variable, and for an entire dataset.

In [36]:
#Classes
class variable:
    
    def __init__(self, n_weeks=0, raw_values=None, minimum=0.0, maximum=sys.float_info.max, bin_width=None, transform=identity, n_bins=0):
        if raw_values: self.n_weeks = len(raw_values) #n_weeks should only be populated if the intention is to initialize with no data
        else: self.n_weeks = n_weeks
        self.transform = transform
        self.minimum = transform(smart_units(minimum))
        self.maximum = transform(smart_units(maximum))
        if bin_width:
            self.bin_width = transform(smart_units(bin_width)) #Generally you should only define bin width if you're using the identy transform, but just in case...
            self.n_bins = int(ceil((self.maximum-self.minimum)/self.bin_width))
        elif n_bins:
            self.n_bins = n_bins
            self.bin_width = (self.maximum-self.minimum)/n_bins
        else:
            raise Exception("No settings for either n_bins or bin_width, so the variable can't be initialized")
        self.bins = np.zeros((self.n_weeks,self.n_bins), dtype=np.uint8) #2D array, one row per week, one column per histogram bin
        if raw_values: self.process_raw_data(raw_values)
        
    def process_raw_data(self, raw_values):
        for i,week in enumerate(raw_values):
            for value in week:
                value = self.transform(value)
                if value < self.minimum: list_index = 0
                elif value > self.maximum: list_index = self.n_bins-1
                else: list_index = int(floor((value-self.minimum)/self.bin_width))
                self.bins[i][list_index] += 1

class station:
    
    def __init__(self, n_weeks, variable_settings, raw_data):
        self.n_weeks = n_weeks
        self.raw_data = raw_data
        self.processed_data = dict([(name, variable(raw_values = raw_data[name], **settings)) for name,settings in variable_settings.items()])
    
    def generate_metric(self, metric_settings):
        total_metric = np.zeros(self.n_weeks, dtype=np.float32)
        for variable_name, settings in metric_settings.items():
            data = self.processed_data[variable_name]
            metric = np.zeros(self.n_weeks, dtype=np.float32)
            for i,week in enumerate(self.processed_data[variable_name].bins):
                if 'threshold' in settings: threshold = smart_units(settings['threshold'])
                else: threshold = None
                metric[i] = settings['function'](week, data.minimum, data.maximum, data.n_bins, data.bin_width, threshold, data.transform)
            rescale_metric(metric) #Now the values range from 0.0 to 1.0. Scaled based on the whole station, but not applicable between stations.
            for i in range(self.n_weeks):
                total_metric[i] += metric[i] * settings['weight']
        return total_metric
    
    def metric_to_3_cats(self, vis_settings, metric_settings=None, raw_metric=None):
        if metric_settings: raw_metric = self.generate_metric(metric_settings)
        elif raw_metric is None: raise Exception('Must send metric_to_3_cats either raw_metric or metric_settings')
        vis_values = []
        for value in raw_metric:
            if value < vis_settings['low_medium']: vis_values.append('low')
            elif value < vis_settings['medium_high']: vis_values.append('medium')
            else: vis_values.append('high')
        return vis_values

Does it seem to work with some test-data?

In [37]:
#Test data
n_weeks = 260 #5 years
test_data = {
    'wind'       :make_weeks(n_weeks, random_exponential_variable_1_week, 0, '150 mph', 10),
    'gust'       :make_weeks(n_weeks, random_exponential_variable_1_week, 0, '150 mph', 7),
    'temperature':make_weeks(n_weeks, random_central_variable_1_week, '-50 C', '70 C', 5),
    'dir'        :make_weeks(n_weeks, random_central_variable_1_week, 0, '360 degrees', 1), 
    'hvis'       :make_weeks(n_weeks, random_central_variable_1_week, '5 m', '100 miles', 8),
    'ceiling'    :make_weeks(n_weeks, random_central_variable_1_week, '100 m', '100 miles', 8),
    'cover'      :make_weeks(n_weeks, random_central_variable_1_week, 0, '100 %', 1), 
    'rain'       :make_weeks(n_weeks, random_exponential_zero_common_1_week, 0, '1 foot', 10),
    'humidity'   :make_weeks(n_weeks, random_central_variable_1_week, 0, '100 %', 3),
}

test_station = station(n_weeks, variable_settings, test_data)
metric_test = test_station.generate_metric(metric_settings)
metric_vis_test = test_station.metric_to_3_cats(vis_settings, raw_metric=metric_test)
print(metric_test)
print(metric_vis_test)

[8.10917765e-02 7.80063272e-02 9.84968394e-02 3.69461998e-02
 1.19462023e-02 6.42405003e-02 7.38923997e-02 8.54430348e-02
 1.24208853e-02 8.31487328e-02 4.97626588e-02 3.98734175e-02
 1.05379738e-01 1.53481010e-02 9.35126543e-02 1.04588605e-01
 5.78322783e-02 5.91772124e-02 1.39715180e-01 1.18354425e-01
 4.09810096e-02 6.25000000e-02 2.91139241e-02 3.34651880e-02
 2.42088605e-02 6.62974641e-02 7.16772154e-02 3.53639238e-02
 2.87974682e-02 8.52848068e-02 2.45253164e-02 1.09493673e-01
 7.35759437e-02 1.66139249e-02 5.01582250e-02 7.34177157e-02
 4.98417718e-03 5.46677224e-02 6.89082295e-02 4.17721495e-02
 9.15348083e-02 4.38291132e-02 1.41613912e-02 2.22310126e-02
 3.23575959e-02 4.73892391e-02 6.01265766e-03 9.09810048e-03
 3.30696180e-02 7.59493634e-02 2.80854441e-02 2.47626584e-02
 4.65189852e-02 3.16455704e-03 1.00791141e-01 5.02373390e-02
 8.93196166e-02 4.99208830e-02 4.39082272e-02 4.10601273e-02
 3.13291140e-02 2.60284804e-02 8.84493589e-02 7.65031651e-02
 4.28797454e-02 8.868671