In [1]:
import os

import pandas as pd
import numpy as np
from base64 import b64decode

import datetime as dt
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

import os
from IPython.display import clear_output

###UMAP specific#####
import umap.plot
import umap

from random import choice
from string import ascii_lowercase

from itertools import cycle
#import panel as pn

from bokeh import events
from bokeh.models import (Div, CategoricalColorMapper, Range1d, ColorBar, LogTicker, DataTable,
                          TableColumn, Button, DataCube, GroupingInfo, SumAggregator, Legend,
                          LegendItem, NumberFormatter, LinearColorMapper, BasicTicker,
                         LogColorMapper, Label)
from bokeh.models.ranges import Range
from bokeh.models.sources import ColumnDataSource
from bokeh.models.callbacks import CustomJS
from bokeh.models.tools import BoxSelectTool, LassoSelectTool, HoverTool, BoxZoomTool, WheelZoomTool, SaveTool, ResetTool, PanTool
from bokeh.models.widgets import FileInput, AutocompleteInput, Panel, Tabs#, Button
#from bokeh.models.annotations import Legend, LegendItem

from bokeh.plotting import figure#, output_file, show 
from bokeh.io import push_notebook, output_notebook, show, save, curdoc #updated for notebook
from bokeh.layouts import row,column,layout

In [2]:
############
#Jupyter NB convenience functions
#import sys
#sys.path.insert(0, '../src/')
#from jupyter_nb_fxns import toggle_code #not working in Jupyter Lab...
############

### Misc code chunks (ignore)

In [3]:
#normalized_df=(df-df.mean())/df.std()
def std_norm(df):
    """Standardize ``df`` column-wise to zero mean and unit standard deviation.

    Computes ``(df - df.mean()) / df.std()``. The previous expression,
    ``df - df.mean()/df.std()``, divided only the mean by the standard
    deviation because of operator precedence and therefore did not standardize.

    Parameters:
        df (pandas.DataFrame or pandas.Series): numeric data.

    Returns:
        Same type as ``df``: the standardized values. Columns whose standard
        deviation is 0 yield NaN/inf, as with any division by zero in pandas.
    """
    return (df - df.mean()) / df.std()
#normalized_df=(df-df.min())/(df.max()-df.min())
def min_max(df):
    """Rescale ``df`` column-wise with min-max normalization.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.

    Parameters:
        df (pandas.DataFrame or pandas.Series): numeric data.

    Returns:
        Same type as ``df``: the rescaled values.
    """
    lowest = df.min()
    highest = df.max()
    shifted = df - lowest
    return shifted / (highest - lowest)

def min_max_std(val_list):
    """Min-max normalize a sequence of numbers.

    The minimum and maximum are computed once (the original recomputed both for
    every element), and the input is materialized first so one-shot iterables
    such as generators are supported.

    Parameters:
        val_list (iterable of numbers): values to rescale.

    Returns:
        list: ``(d - min) / (max - min)`` for each ``d``, in input order.
        An empty input returns an empty list.

    Raises:
        ZeroDivisionError: for plain Python numbers when all values are equal
            (``max - min == 0``).
    """
    values = list(val_list)
    if not values:
        return []
    lowest = min(values)
    span = max(values) - lowest
    return [(d - lowest) / span for d in values]

def scale_range(val_list, scale_range = (0,1)):
    """Linearly rescale a sequence of numbers into a target interval.

    The minimum and maximum are computed once (the original recomputed both for
    every element), and the input is materialized first so one-shot iterables
    such as generators are supported.

    Parameters:
        val_list (iterable of numbers): values to rescale.
        scale_range (tuple): ``(low, high)`` bounds of the target interval.
            The parameter name shadows the function name inside the body; it is
            kept for backward compatibility with keyword callers.

    Returns:
        list: ``(high - low) * ((d - min) / (max - min)) + low`` for each ``d``,
        in input order. An empty input returns an empty list.

    Raises:
        ZeroDivisionError: for plain Python numbers when all values are equal
            (``max - min == 0``).
    """
    values = list(val_list)
    if not values:
        return []
    low, high = scale_range[0], scale_range[1]
    lowest = min(values)
    span = max(values) - lowest
    return [(high - low) * ((d - lowest) / span) + low for d in values]

#b.groupby(by=[b.index.month, b.index.year]) #might be deprecated for version 0.21+
#b.groupby(pd.Grouper(freq='M')) 

def autocorr(x):
    """Return the non-negative-lag half of the raw autocorrelation of ``x``.

    ``np.correlate(x, x, mode='full')`` is computed and the slice starting at
    index ``size // 2`` (the zero-lag term) is returned. Values are not
    normalized.
    """
    full_corr = np.correlate(x, x, mode='full')
    zero_lag = full_corr.size // 2
    return full_corr[zero_lag:]

def acf(x, length=20):
    """Autocorrelation function of ``x`` for lags ``0 .. length - 1``.

    Lag 0 is fixed to 1; every other lag ``i`` is the Pearson correlation
    (``np.corrcoef``) between ``x[:-i]`` and ``x[i:]``.

    Parameters:
        x (np.ndarray): 1-D series.
        length (int): number of lags to return, including lag 0.

    Returns:
        np.ndarray: array of ``length`` coefficients.
    """
    coefficients = [1]
    for lag in range(1, length):
        head = x[:-lag]
        tail = x[lag:]
        coefficients.append(np.corrcoef(head, tail)[0, 1])
    return np.array(coefficients)

def calc_acf(df, threshold = 3, return_all = False):
    """Rank the columns of ``df`` by the sum of their autocorrelation coefficients.

    For every column, NaNs are dropped, ``acf`` is evaluated on the remaining
    values and its coefficients are summed.

    Parameters:
        df (pandas.DataFrame): one series per column.
        threshold (number): columns whose summed ACF is strictly below this
            value are reported.
        return_all (bool): when ``True`` return all sums instead of the
            below-threshold column names.

    Returns:
        dict: ``{column: acf_sum}`` sorted by descending sum, if ``return_all is True``.
        list: otherwise, the names of the columns with ``acf_sum < threshold``,
        in the same descending order.
    """
    totals = {}
    for col in df:
        totals[col] = acf(df[col].dropna().to_numpy()).sum()

    ranked = dict(sorted(totals.items(), key=lambda pair: pair[1], reverse = True))
    if return_all is True:
        return ranked
    return [name for name, total in ranked.items() if total < threshold]



def knn(df, nn=100):
    """Smooth the first column of ``df`` with a uniform k-nearest-neighbours regression.

    The regressor is fitted with the index values of ``df`` as the single
    feature and the first column as the target, then evaluated at those same
    index values.

    NOTE(review): ``KNeighborsRegressor`` is not imported anywhere in the
    visible notebook, so calling this raises ``NameError`` unless the name is
    provided elsewhere -- presumably ``sklearn.neighbors.KNeighborsRegressor``;
    confirm and add the import.

    Parameters:
        df (pandas.DataFrame): frame whose index is used as the feature.
        nn (int): number of neighbours.

    Returns:
        The predictions returned by ``predict`` for every index value.
    """
    regressor = KNeighborsRegressor(n_neighbors=nn, weights='uniform')
    # Index values reshaped to a single-feature column, as fit/predict receive them.
    regressor.fit(df.index.values[:, np.newaxis], df.iloc[:, 0])
    smoothed = regressor.predict(df.index.values[:, np.newaxis])
    return smoothed

################# 
##Hide the cell##
#toggle_code() ##
#################

### Read data

In [4]:
# Load the filtered per-string table and parse its 'datetime' column.
epis = pd.read_csv("../data/temp/park1/filtered/df_BDfilt_dayfilt_day.csv")
epis = epis.assign(datetime=pd.to_datetime(epis['datetime']))

In [5]:
###########
#Labels from other methods - e.g. clustering
#kmeans_labels = pd.read_csv("../data/processed/park1/20200417_final/park1_kmeans_labels.csv")['predicted_cluster']
#gmm_labels = pd.read_csv("../data/processed/park1/20200417_final/park1_gmm_labels.csv")['gmm_cluster']
############################
#### Continuous values to overlay
### OPTION 1 ###
# raw output #
# Semicolon-separated CSV; the 'datetime' column is parsed after loading.
raw_val = pd.read_csv("../data/raw/New_data/SolarPark1_Jun_2019_Jun2020_string_production.csv", sep = ";")
raw_val['datetime'] = pd.to_datetime(raw_val['datetime'])

#############
### OPTION 2 ###
# soiling estimates #
# NOTE(review): this path starts with "../data.bk/" while the other reads use "../data/" - confirm it is intended.
soiling_data = pd.read_csv("../data.bk/processed/park1/soiling/park1_soiling_rate_per_string_split_2.csv")
soiling_data['datetime'] = pd.to_datetime(soiling_data['datetime'])
# Replace every value v by (1 - v), keep rows from 2019-09-24 through 2019-12-20, and restore 'datetime' as a column.
soiling_data = (1 - soiling_data.set_index('datetime')).loc["2019-09-24":"2019-12-20"].reset_index() #get soiling loss

############################################
# optional transformation of continuous data

# soiling_data.set_index('datetime').mask(~(soiling_data.set_index('datetime') > 0.6)).dropna(how='all', axis = 1).dropna(
#     how='all', axis = 0).groupby(
#     soiling_data.set_index('datetime').columns.str[:-4],axis=1).mean().resample('M').mean().loc["2019-09-24":"2019-12-20"].mean().sort_values(ascending=False).pc()


# log(1 + x) of the soiling loss with NaNs replaced by 0, then the median over 10-day bins.
data_temp = np.log(1 + soiling_data.fillna(0).set_index('datetime')).reset_index()
data_temp['datetime'] = pd.to_datetime(data_temp['datetime'])
data_temp = data_temp.set_index('datetime').resample('10D').median().reset_index()

#########
#STRINGS WITH DATA: remove strings with 0  median soiling
# keep_strings = data_temp[epis.columns].drop(columns=["datetime"]).median()[data_temp[epis.columns].drop(columns=["datetime"]).median().sort_values() > 0].index.to_list()

# Keep only the columns of epis that also exist in soiling_data.
epis = epis.drop(columns = [i for i in epis.columns if i not in soiling_data.columns])

#### Selecting data

In [6]:
# Frame passed to cluster_umap below. This binds `data` to the same object as `epis`; no copy is made.
data = epis

## Clustering (UMAP)

In [7]:
%matplotlib inline

######################
# Read as a global inside cluster_umap, where it is passed to umap.UMAP(random_state=seed).
seed = 11
#################

def cluster_umap(df,
                 time_window = 'h',
                 cols_to_drop = None,
                 cols_to_keep = None,
                 cluster_labels = 'default',
                 continuous_data = None,
                 dark_mode = False,
                debug = False,
                 bg_color = None,
                 cmap = None,
                 default_cmap = plt.get_cmap('Blues'),
                 ckey = None,
                 log = False, 
                 ckey_cmap = None,
                 scale_colors = False,
                 log_palette = "Blues",
                 label = None,
                 table_csv_filename = "./umap_table_data.csv",
                 **kwargs):
    
    '''
    Builds off of the umap.interactive plot to add more features such as a legend, different values to colorize by, simple statistics on selections, and so on.
    (Does not actually extend the function or class directly but only modifies its output in-place.)

    The frame is transposed before fitting, so every COLUMN of ``df`` becomes one point of the embedding.
    The UMAP random state is read from the notebook-level global ``seed``.

            Parameters:
                    df (pandas.DataFrame): Numerical columns plus (optionally) date/time column(s). Any column whose name contains
                                           "date" or "time" is used as the index. The remaining column names must be strings: they are
                                           sliced ([3:4], [5:6], [7:8]) to build the 'First/Second/Third pos' identifiers.
                    cols_to_keep (list): Optional subset of columns to use. If cols_to_drop is also given, those of its entries that
                                         are present in the subset are dropped from it.
                    cols_to_drop (list): Optional columns to drop (used alone when cols_to_keep is None).
                    cluster_labels (str, array-like, None): 'default' resolves to 'id' when continuous_data is None and to None otherwise.
                                         'id' labels each point with the character at position 3 of its column name.
                                         An array-like (list, numpy array, pandas Series/Index) with one label per point is factorized
                                         and offered as the categorical coloring.
                    continuous_data (list): Numerical values, one per point, used for the continuous coloring and the color bar.
                                         Do not combine with cluster_labels='id' (labels and values would both be passed to
                                         umap.plot.interactive).
                    scale_colors (bool): If True, continuous_data is rescaled with scale_range() before plotting.
                    log (bool): If True, glyph colors are mapped through a LogColorMapper built from log_palette and continuous_data.
                    dark_mode (bool): If True, the "fire" theme is passed to umap.plot.interactive.
                    bg_color, cmap, ckey, ckey_cmap: Forwarded to umap.plot.interactive as background, cmap, color_key and
                                         color_key_cmap when not None.
                    label (str): Text of the annotation drawn on the main plot.
                    debug (bool): If True, prints intermediate values.
                    time_window, default_cmap, table_csv_filename, **kwargs: Accepted for backward compatibility; not used in the body.

            Returns:
                    list: [mapper.embedding_, final_layout] where
                          mapper.embedding_ (np.array) holds the UMAP embedding coordinates and
                          final_layout is the bokeh layout (bokeh.layouts.layout) composing the UMAP plot with the added elements.
    '''
    
    # cluster_labels is either a string sentinel, None, or an array-like of labels.
    # Only compare against the sentinels when it is a string: `Series == 'id'` is
    # element-wise and raises "truth value is ambiguous" when used in an `if`.
    if isinstance(cluster_labels, str) and cluster_labels == 'default':
        if continuous_data is not None:
            print("Using continuous data for color bar.")
            cluster_labels = None  # was `cluster_labels == None`, a no-op comparison
        else:
            cluster_labels = 'id'

    use_id_labels = isinstance(cluster_labels, str) and cluster_labels == 'id'
    use_custom_labels = cluster_labels is not None and not use_id_labels
    
    
    color_factors = None

    # Substring test instead of `find(...) > 0`, which missed names that START with "date"/"time".
    time_col = [i for i in df.columns if "date" in str(i) or "time" in str(i)]
    if len(time_col) == 0:
        print("WARNING - no time/date column detected. Did you forget to transpose the data, or are you trying to cluster by time?")
    
    if cols_to_keep is not None:
        temp_og = df[cols_to_keep]
        if cols_to_drop is not None:
            # Previously this list was computed but never applied, and a None cols_to_drop raised TypeError.
            cols_to_drop = [i for i in cols_to_drop if i in temp_og.columns.to_list()]
            temp_og = temp_og.drop(cols_to_drop, axis = 1)
    elif cols_to_drop is not None:
        temp_og = df.drop(cols_to_drop, axis = 1)
    else:
        temp_og = df
    
    
    ###### NOT IMPLEMENTED #######
    def upload_data(attr, old, new):
        # Local import: the notebook's import cell does not import `io`.
        import io

        decoded = b64decode(new)
        f = io.BytesIO(decoded)
        temp = pd.read_csv(f)
        push_notebook()

    file_input = FileInput(accept=".csv")
    file_input.on_change('value', upload_data)
    doc=curdoc()
    doc.add_root(file_input)    
    
    
    #####
    
    
    temp = temp_og.dropna()
    # Index by the detected time column(s) that survived column selection. When there are
    # none, the existing index is kept (the old fallback passed row positions to set_index).
    index_cols = [i for i in time_col if i in temp.columns]
    if index_cols:
        temp = temp.set_index(index_cols)

    # Range of the per-column means; used for the fallback color bar below.
    data_min = min(temp.mean())
    data_max = max(temp.mean())
    temp = temp.dropna()
    temp = temp.T  # one row per original column


    # Single characters of the column name at positions 3, 5 and 7.
    # NOTE(review): presumably inverter / combiner / string identifiers - confirm against the naming scheme.
    temp['label'] = temp.index
    temp['label_1'] = temp['label'].apply(lambda x: x[3:4])
    temp['label_2'] = temp['label'].apply(lambda x: x[5:6])
    temp['label_3'] = temp['label'].apply(lambda x: x[7:8])

    ######################
    div = Div(width=400)
    div2 = Div(width=400)
    
    color_switch_button_disable = True
    color_cycle_cds = umap_values_color = labels = None #None is a singleton object so this is allowed
        
    palette = sns.color_palette("tab10").as_hex()
    id_color_factors = temp['label_1'].unique()#temp_og.columns.unique().to_list()

    label_codes = None
    if use_id_labels:
        color_switch_button_disable = False
        labels = temp['label_1'].to_list()
        color_factors = [str(i) for i in set(labels)]
        # Stored in `color_map`; the original assigned this to `cmap`, overwriting the
        # `cmap` argument that is forwarded to umap.plot.interactive.
        color_map = CategoricalColorMapper(factors=color_factors,
                                   palette=palette)
    elif use_custom_labels:
        color_switch_button_disable = False
        # pd.factorize accepts lists and arrays as well as Series/Index.
        label_codes = pd.factorize(cluster_labels)[0]
        color_factors = [str(i) for i in set(label_codes)]
        color_map = CategoricalColorMapper(factors=color_factors,
                                   palette=palette)
        color_map.tags.append("categorical")
        color_map.tags.append(False)
    
    id_color_map = CategoricalColorMapper(factors=id_color_factors,palette=palette[:len(id_color_factors)])

    ################
    # Mean of every series; default continuous value shown per point.
    default_values = temp.drop(['label','label_1','label_2','label_3'], axis = 1).mean(axis = 1).values
    
    mapper = umap.UMAP(random_state=seed).fit(temp.drop(['label','label_1','label_2','label_3'], axis = 1))

    hover_data = pd.DataFrame({'Full name':temp.index.to_list(),
                              'First pos':temp['label_1'].to_list(),
                              'Second pos':temp['label_2'].to_list(),
                              'Third pos':temp['label_3'].to_list()
                              })
    
    
    log_cmap = False
    if continuous_data is not None:
        umap_values_color = continuous_data
        if scale_colors is True:
            umap_values_color = scale_range(umap_values_color, )#min_max_std(umap_values_color)
        if log is True:
            # `low` is the smallest strictly positive value.
            log_cmap = LogColorMapper(palette= sns.color_palette(log_palette).as_hex(), low=min([i for i in umap_values_color if i > 0]), high=max(umap_values_color))
    else:
        if cluster_labels is None:
            umap_values_color = default_values #change this earlier in code and use this as the default

    theme = "fire" if dark_mode is True else None

    # Only forward the options that were actually set.
    umap_format_dict = {
        k:v for k,v in zip(["theme", "background", "cmap", "color_key", "color_key_cmap", "values", "labels"],
                           [theme, bg_color, cmap, ckey, ckey_cmap, umap_values_color, labels]) if v is not None}
        
    if debug is True:
        print(
            umap_values_color,
            umap_format_dict.items(),
            sep = "\n\n"
        )
        
    p = umap.plot.interactive(mapper,
                              interactive_text_search = True,
                              interactive_text_search_alpha_contrast=0.9,
                              #labels=cluster_labels,
                              hover_data=hover_data,
                              point_size=7,

                              **umap_format_dict
                             )
        
    # When a column (search box + plot) comes back, split it and swap the search box for an
    # AutocompleteInput carrying the same property values.
    if type(p) == type(column()):
        text_s, p = p.children
        text_search = AutocompleteInput()
        text_search.update(**text_s.properties_with_values())
        #text_search.update(completions = List(String))
        text_search.update(completions = temp['label'].to_list(),
                           restrict = False,
                           min_characters = 1,
                           case_sensitive = False
                          )

    source = p.renderers[0].data_source
    
    if source.data.get('value') is None:
        try:
            # The 'value' column is assigned first so it is available to the hover tool and
            # the selection statistics even if the recoloring below fails.
            source.data['value'] = default_values 
            p.renderers[0].glyph.fill_color = {"field":'value', "transform":source.data['cmap']}
            p.renderers[0].glyph.line_color = {"field":'value', "transform":source.data['cmap']}
        except Exception:
            pass #NOT IMPLEMENTED
    else:
        color_cycle_cds = dict(clist1 = dict(fill = p.renderers[0].glyph.fill_color['transform'],
                                             line = p.renderers[0].glyph.line_color))
        
        
    if debug is True:
        print(source.data)
    
    if log is True:
        p.renderers[0].glyph.fill_color = {"field":'value', "transform":log_cmap};
        p.renderers[0].glyph.line_color = {"field":'value', "transform":log_cmap};

    if use_custom_labels:
        source.data['label'] = [str(i) for i in label_codes]

    
    ########### TO DO - clean up ##########
    # Best effort: tag the continuous mapper and register the categorical alternative. Any
    # failure (e.g. fill_color has no 'transform', or color_cycle_cds is still None) falls
    # back to offering only the id-based mapper.
    try:
        p.renderers[0].glyph.fill_color['transform'].tags.append('continuous')
        p.renderers[0].glyph.fill_color['transform'].tags.append(True)

        if use_custom_labels:
            color_cycle_cds['clist2'] = color_map
        else:
            color_cycle_cds['clist2'] = id_color_map
    except Exception:
        try:
            color_cycle_cds = dict(clist2 = id_color_map)
        except Exception:
            pass
    ###############################

    p.add_layout(Legend(), 'right')
    if use_custom_labels:
        # Invisible renderer whose only purpose is to populate the legend with the custom labels.
        c = p.circle('x', 'y', source=source, alpha=0.8, legend_field="label", color = dict(field='label', transform=color_map))
        c.visible = False

    p.legend.location = 'left'

    
    ######### FIX #######
    try:
        color_bar = ColorBar(color_mapper=color_cycle_cds['clist1']['fill'],
                             border_line_color=None, location=(0,0))
    except Exception:
        # No continuous mapper available: show a generic bar spanning the per-column means.
        color_bar = ColorBar(color_mapper=LinearColorMapper(
            palette="Viridis256",
            low=data_min,
            high=data_max),
                             ticker= BasicTicker(),
                             location=(0,0))
    ##########################
    
    p.add_layout(color_bar, "left")
    
    ############
    
    color_button_callback = CustomJS(args=dict(leg=p.legend, p=p, source=source, colors=color_cycle_cds, div=div), code="""
    div.text = "<p> Renderers: " + [p.renderers] //+ "<br>Main glyph: " + JSON.stringify(p.renderers[0].glyph);
    if (p.renderers[0].glyph.fill_color['transform'].tags.includes("continuous")) {
        p.renderers[0].glyph.fill_color = {"field":'label', "transform":colors['clist2']};
        p.renderers[0].glyph.line_color = {"field":'label', "transform":colors['clist2']};
        
        p.renderers[0].nonselection_glyph.fill_color = {"field":'label', "transform":colors['clist2']};
        p.renderers[0].nonselection_glyph.line_color = {"field":'label', "transform":colors['clist2']};
        
        //p.renderers[1].glyph.fill_color = {"field":'label', "transform":colors['clist2']};
        //p.renderers[1].glyph.line_color = {"field":'label', "transform":colors['clist2']};

    } else {
        p.renderers[0].glyph.fill_color = {"field":'value', "transform":colors['clist1']['fill']}
        p.renderers[0].glyph.line_color = colors['clist1']['line'];
        
        p.renderers[0].nonselection_glyph.fill_color = {"field":'value', "transform":colors['clist1']['fill']};
        p.renderers[0].nonselection_glyph.line_color = colors['clist1']['line'];

        //p.renderers[1].glyph.fill_color = {"field":'value', "transform":colors['clist1']['fill']};
        //p.renderers[1].glyph.line_color = colors['clist1']['line'];
    }
    source.change.emit();
    p.change.emit();
    //div.text = "<p>Current labels: " + JSON.stringify(p.glyph.fill_color['transform'].tags) +"<br><br>"+ p.glyph.fill_color['transform'].tags.includes("continuous").toString() + leg[0];


""")
    
    ##################
    
    # NOTE(review): this deletes from p.tools while indexing over its original length, so
    # elements are skipped and the IndexError guard is relied upon; each pass also removes
    # tools that do not match the CURRENT name, including ones matching other names in the
    # list. Left unchanged because the tools re-added below look like they depend on the
    # resulting toolbar - confirm the intended whitelist before rewriting.
    tools = ["box_zoom","wheel_zoom_tool","reset", "save_tool"]
    for tool in tools:
        for i in range(len(p.tools)):
            try:
                if not str(p.tools[i]).lower().startswith(tool.replace('_',"")):
                    del p.tools[i]
            except IndexError:
                pass


    box = BoxSelectTool()
    p.add_tools(box)
    p.add_tools(ResetTool())
    p.add_tools(LassoSelectTool(select_every_mousemove = False))
    p.add_tools(WheelZoomTool())
    p.add_tools(PanTool())

    ########
    hover = HoverTool()
    hover.tooltips = [
        ("String", "@{Full name}"),
        # ("Inverter", "@{First pos}"),
        #("Combiner", "@{Second pos}"),
        ("Value", "@{value}")
        #("(x,y)", "($x, $y)")
    ]
    p.add_tools(hover)
    ########
    
    ##########CDS 2 & STATS############
    # s2 mirrors the current selection; stats holds the per-label counts shown in the table.
    s2 = ColumnDataSource(data=dict(x=[], y=[], label_fn=[], label_1=[], label_2=[], label_3=[], value=[]))
    
    stats = ColumnDataSource(data=dict(labels=[],counts=[]))
    #############################

    # Secondary plot sharing the ranges of the main plot; shows only the selected points.
    p2 = figure(plot_width=400, plot_height=400,hidpi = True,
                x_range=p.x_range,
                y_range=p.y_range,
                tools="",
                title="Selected view",
                align='center'
                )
    
    p2.add_layout(Legend(), 'right') # Necessary to render legend out of plot area
    r = p2.circle('x', 'y', source=s2, alpha=0.6, legend_field="label_1", color = dict(field='label_1', transform=id_color_map))
    ######

    p.js_on_event(events.SelectionGeometry, CustomJS(
        args=dict(div=div, xrange = p2.xaxis[0], yrange = p2.yaxis[0]),
        code="""

    //div.text = "Selection <p> <p>" + JSON.stringify(cb_obj.geometry, undefined, 2);
    var x0 = cb_obj.geometry['x0'];
    var x1 = cb_obj.geometry['x1'];
    var y0 = cb_obj.geometry['y0'];
    var y1 = cb_obj.geometry['y1'];

    //div2.text = "x0 <p>" + JSON.stringify(x0,undefined, 2)+"<br><p><p>"+"x1<br>" +JSON.stringify(x1,undefined, 2)+"<br><p><p>"+"y0 <p>" + JSON.stringify(y0,undefined, 2)+"<br><p><p>"+"y1<br>" +JSON.stringify(y1,undefined, 2);

    //xrange.bounds = [x0, x1];
    //yrange.bounds = [y0, y1];

    //xrange.start = x0;
    //xrange.end = x1;
    //yrange.start = y0;
    //yrange.end = y1;

    """))


    # On every selection change: copy the selected rows into s2, count them per 'First pos'
    # label, and report mean/median of their 'value'.
    source.selected.js_on_change('indices',
                                 CustomJS(
                                     args=dict(s1=source,
                                               s2=s2,
                                               p=p,
                                               s3=stats,
                                               #r=r,
                                               #smapper=color_map,
                                               div=div,
                                               div2=div2,
                                               cfactors = color_factors
                                              ),
                                     code="""
            var inds = cb_obj.indices;
            var d1 = s1.data;
            var d2 = s2.data;
            var stats = s3.data;
            
            d2['x'] = [];
            d2['y'] = [];
            d2['label_fn'] = [];
            d2['label_1'] = [];
            d2['label_2'] = [];
            d2['label_3'] = [];
            d2['value'] = [];

            for (var i = 0; i < inds.length; i++) {
                d2['x'].push(d1['x'][inds[i]]);
                d2['y'].push(d1['y'][inds[i]]);
                d2['label_fn'].push(d1['Full name'][inds[i]]);
                d2['label_1'].push(d1['First pos'][inds[i]]);
                d2['label_2'].push(d1['Second pos'][inds[i]]);
                d2['label_3'].push(d1['Third pos'][inds[i]]);
                try{
                d2['value'].push(d1['value'][inds[i]]); //no 'value' for colors used in original UMAP
                }
                catch (e){} 
            }
            
            //let unique = [...new Set(d2['label_1'])];
            
            var counts = {};

            for (var i = 0; i <  d2['label_1'].length; i++) {
              var num =  d2['label_1'][i];
              counts[num] = counts[num] ? counts[num] + 1 : 1;
            }
            
            try{
                let average = (array) => array.reduce((a, b) => a + b) / array.length;
                var mean = Math.round(average(d2['value'])*100)/100;

                //const arrSort = average(d2['value']).sort();
                //const mid = Math.ceil(len / 2);
                //const median = len % 2 == 0 ? (arrSort[mid] + arrSort[mid - 1]) / 2 : arrSort[mid - 1];

                let values = d2['value'];
                values.sort(function(a,b) {
                         return a - b;
                                });


                let lowMiddle = Math.floor( (values.length - 1) / 2);
                let highMiddle = Math.ceil( (values.length - 1) / 2);
                let median = Math.round(( values[lowMiddle] + values[highMiddle]) / 2*100)/100;
                //var median = 2;


                //div.text = "<p>" + JSON.stringify(d1['color']);
                div2.text = "<p><br><br><b>Inverters present:</b> " + [...new Set(d2['label_1'])] + "<br>Mean of selection: "+mean+ "<br>Median of selection: "+median
                //div2.text = "<p><br><br><b>Inverters present:</b> " + [...new Set(d2['label_1'])] //+"<p><p> <b>CBs present:</b> "+[...new Set(d2['label_2'])]//+"<p>Strings present: "+[...new Set(d2['label_3'])];
                //div2.text = "Inverters present: " + unique;
                //div2.text = "Strings present: " + JSON.stringify(counts);

                //s2.change.emit();

                stats['labels'] = [];
                stats['counts'] = [];
                //stats['means'] = [];
                //stats['medians'] = [];

                for (var key in counts){
                  stats['labels'].push(key);
                  stats['counts'].push(counts[key]);
                  //stats['means'].push(mean)
                }
            }
            catch(e){console.log("value attribute is empty.")}
            
            s2.change.emit();
            s3.change.emit();
        
        """)
    )

    
    ############################################

    p2.legend.click_policy = "mute"
    p2.legend.background_fill_alpha = 0.5
    p2.legend.padding = 1
    p2.legend.spacing = 1
        
    ################
    columns = [
        TableColumn(field="labels", title="Inverter #"),# formatter=NumberFormatter(format='0',text_align='center')),
        TableColumn(field="counts", title="Count"),#, formatter=NumberFormatter(format='0',text_align='center')),
    ]

    data_table = DataTable(source=stats, columns=columns, width=300, height=280, index_position=None)#, autosize_mode="fit_columns") #aspect_ratio = 'auto', sizing_mode = "scale_both")
    
    if debug is True:
        print(source.data, sep = "\n\n")
        
    # The switch needs both a continuous ('clist1') and a categorical ('clist2') mapper.
    if color_cycle_cds is None or len(color_cycle_cds.keys()) == 1:
        color_switch_button_disable = True
        
    button = Button(label="Switch colors", button_type = "primary", default_size = 50, align="end", disabled = color_switch_button_disable)
    button.js_on_click(color_button_callback)

    if color_switch_button_disable is True:
        button.label = "No alternate labels found"
    
    save_table_button = Button(label="Download data", button_type="success")
    # Read the helper script with a context manager so the file handle is closed.
    with open("../src/scripts/js_helper/download.js") as download_js:
        save_table_button.js_on_click(CustomJS(args=dict(source=s2),code=download_js.read()))
    
    p.toolbar_location = "right"
    ##########

    text_label = Label(x=300, y=20, x_units='screen', y_units='screen',
                 text=label, render_mode='css',
                 border_line_color='black', border_line_alpha=0.0,
                 background_fill_color='white', background_fill_alpha=0.0)

    p.add_layout(text_label)
    #########
    del temp, temp_og
    final_layout =  layout([[text_search, button],
                    [p, [p2, column(div2,data_table,save_table_button)]],
                    #[button],
                    [div]
                    #[file_input, div]
                   ])

    return [mapper.embedding_, final_layout]

########################################################################################

In [8]:
# Route bokeh output to the notebook, build the UMAP layout for `data` and display it.
output_notebook()

# Optional keyword arguments left disabled: cluster_labels = None, debug = True
umap_coords, new_layout = cluster_umap(
    data,
    dark_mode=False,
    label="Park 1",
)

# Trailing semicolon suppresses the repr of the returned notebook handle.
show(new_layout, notebook_handle=True);

TypeError: string indices must be integers