In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import pandas as pd

# from plotly_wordcloud import plotly_wordcloud
from geopy.geocoders import Nominatim
import folium
from wordcloud import WordCloud, STOPWORDS
from collections import defaultdict
from skimage import io
import numpy as np
import re

# from jupyter_dash import JupyterDash
stopwords = set(STOPWORDS)

# Setting the renderer to SVG ensures Plotly output works in multiple places:
# in VSCode and also when converting a Jupyter notebook to HTML with nbconvert.
# See https://plotly.com/python/renderers/#multiple-renderers
#
# A default SVG renderer is used to ensure data visualizations are loaded quickly without
# JS blocking. The default Plotly renderer produces visualizations in JSON, which is chunky
# and can block JS for an unreasonable period of time, creating a dissatisfying experience for users.
# NOTE: the default assigned below is currently "notebook", not "svg".
pio.renderers.default = "notebook"
# png_renderer = pio.renderers["png"]
# png_renderer.width = 1000
# png_renderer.height = 600

## Default Layout and Trace Configuration Functions

In [None]:
# Default dropdown menu button styling
# NOTE(review): not referenced in the visible part of this notebook; presumably
# unpacked into a Plotly `updatemenus` entry for bar charts — confirm against callers.
default_bar_dropdown_styling = dict(
    bgcolor="white",
    active=0,
    yanchor="top",
    xanchor="center",
    direction="up",
    y=-0.1,
    x=0.5,
)

# default_table_dropdown_styling = dict(
#     bgcolor="white",
#     active=0,
#     yanchor='top',
#     xanchor='center',
#     direction='down',
#     y=1.3,
#     x=0,
# )

# Constants used for table sizing and styling
TABLE_CELL_HEIGHT_DEFAULT = 50  # default `cell_height` of create_table()
TABLE_HEADER_HEIGHT_DEFAULT = 60  # default `header_height` of create_table()
TABLE_CELL_PADDING = 6  # NOTE(review): not used in the visible code — confirm it is still needed
BACKGROUND_COLOR_DEFAULT = ""  # passed as background / line colour by create_table() and create_rating_distributions()

In [None]:
# Customize Plotly figure styles
def bnw_template():
    """
    Build the notebook's custom Plotly template.

    The template bundles a layout (axes, margins, fonts, colours, pan drag
    mode, hidden legend, D3 colourway, uniform text settings) with default
    hover templates for bar, scatter, pie and histogram traces.

    Returns the assembled ``go.layout.Template``.
    """
    x_axis = go.layout.XAxis(showgrid=False, zeroline=True)
    y_axis = go.layout.YAxis(showgrid=True, zeroline=True)
    title = go.layout.Title(
        font=go.layout.title.Font(family="Calibri Black, Arial Black", size=25),
    )
    legend = go.layout.Legend(font=go.layout.legend.Font(size=17))

    layout = go.Layout(
        xaxis=x_axis,
        yaxis=y_axis,
        margin=go.layout.Margin(l=100, b=100),
        font=go.layout.Font(family="Arial, Verdana", size=19),
        title=title,
        legend=legend,
        plot_bgcolor="#f0f0f0",
        paper_bgcolor="#f0f0f0",
        dragmode="pan",
        showlegend=False,
        colorway=px.colors.qualitative.D3,
        uniformtext_minsize=12,
        uniformtext_mode="hide",
    )

    # Per-trace-type defaults. The original author noted that the custom
    # hovertemplate configuration could not be made to work.
    value_hover = "<b>%{x}</b><br><i>Value</i>: %{y}"
    trace_defaults = {
        "bar": [go.Bar(hovertemplate=value_hover)],
        "scatter": [go.Scatter(hovertemplate="<b>%{x}</b><br><i>Y</i>: %{y}")],
        "pie": [
            go.Pie(
                hovertemplate="<b>%{label}</b><br><i>Count</i>: %{value}<br><i>Percent</i>: %{percent}"
            )
        ],
        "histogram": [go.Histogram(hovertemplate=value_hover)],
    }

    return go.layout.Template(layout=layout, data=trace_defaults)


# bnw.data.scatter = [go.Scatter(mode="lines+markers", )]?
# Add template
# Register the template built by bnw_template() under the name "bnw"
pio.templates["bnw"] = bnw_template()

# Make "bnw" the default template for Plotly Express figures.
# If there are multiple templates delimited by +, last template takes precedence
px.defaults.template = "bnw"

In [None]:
# Demo: a small bar chart with the "bnw" template applied explicitly
fig = go.Figure(go.Bar(x=[1, 2, 3], y=[2, 3, 4]))
fig.update_layout(template="bnw")
fig

## Helpful Colorscales

In [None]:
# Show the built-in qualitative colour swatches, forcing the "notebook" renderer
fig = px.colors.qualitative.swatches()
fig.show(renderer="notebook")

In [None]:
# Show the built-in continuous diverging colour scales with the default renderer
fig = px.colors.diverging.swatches_continuous()
fig.show()

In [None]:
# Demo: a Plotly Express line chart built while the px defaults are in effect
a = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
fig = px.line(a, x="a", y="b")
# NOTE(review): px.defaults.reset() presumably restores every Plotly Express default,
# which would also discard the `px.defaults.template = "bnw"` assignment made in an
# earlier cell — confirm this is intended for the cells that follow.
px.defaults.reset()
fig

## Helper Functions

In [None]:
def generate_value_count_stats(series: pd.Series, places=3, include_na=False):
    """
    Count the distinct values of ``series`` and add Plotly-friendly label columns.

    series: values to count; its ``name`` becomes the name of the count column
    places: number of decimal places kept in the percentages
    include_na: when True, missing values are counted as their own category

    Returns a DataFrame indexed by the distinct values (most frequent first,
    index left unnamed) with three columns:
      * ``series.name``  - the raw count of each value
      * ``"percentage"`` - the share of each value as text, e.g. ``"33.333%"``
      * ``"plotly_text"`` - ``"<value><br><percentage>"``, ready to be used as trace text

    The frame is built from the counts explicitly rather than through
    ``pd.DataFrame(value_counts, columns=[series.name])``: that form depends on
    the counts Series being named after the input, which is not the case on
    pandas >= 2.0 (it is named ``"count"``) and yields an empty frame there.
    """
    value_counts = series.value_counts(dropna=not include_na)

    # Drop the index name so reset_index() on the result cannot collide with
    # the count column, whichever pandas version produced the counts.
    counts = pd.DataFrame(
        {series.name: value_counts.to_numpy()},
        index=value_counts.index.rename(None),
    )

    count_column = counts.iloc[:, 0]
    percent = (count_column / count_column.sum() * 100).round(places)
    counts["percentage"] = percent.astype(str) + "%"
    counts["plotly_text"] = counts.index.astype(str) + "<br>" + counts["percentage"]
    return counts


# a = pd.Series([1, 1, 2, 2, 2, 3], name="hi")
# counts = generate_value_count_stats(a)
# counts

### Count Chart Helpers

In [None]:
class PlotlyChartUtilities:
    '''
        Applies the same in-place tweak to every Plotly figure it holds.
    '''

    def __init__(self, figures: list[go.Figure]):
        # A single figure is accepted too and wrapped into a one-element list.
        if isinstance(figures, list):
            self.figures = figures
        else:
            self.figures = [figures]

    def remove_px_attributes(self,
        remove_hovertemplate=True,
        remove_mode=True) -> None:
        '''
            Strip the trace attributes that Plotly Express sets explicitly and
            that therefore override the defaults of a template.

            Every figure is rebuilt in place: its traces are snapshotted,
            cleared, and re-added from their JSON form minus the selected keys.

            remove_hovertemplate: drop each trace's "hovertemplate"
            remove_mode: drop each trace's "mode"
        '''
        keys_to_drop = []
        if remove_hovertemplate:
            keys_to_drop.append("hovertemplate")
        if remove_mode:
            keys_to_drop.append("mode")

        for fig in self.figures:
            snapshot = go.Figure(fig)
            fig.data = []

            for trace in snapshot.data:
                # The JSON form carries the trace "type", so add_trace()
                # recreates a trace of the same kind.
                trace_json = trace.to_plotly_json()
                for key in keys_to_drop:
                    trace_json.pop(key, None)

                fig.add_trace(trace_json)

    def convert_figures_to_figurewidgets(self) -> list[go.FigureWidget]:
        '''
            Return a go.FigureWidget for every held figure, built from the
            figure's trace JSON and its layout.
        '''
        widgets = []
        for figure in self.figures:
            trace_payloads = [trace.to_plotly_json() for trace in figure["data"]]
            widgets.append(go.FigureWidget(data=trace_payloads, layout=figure["layout"]))
        return widgets

    def add_text(self, text_arr):
        '''
            Set `text_arr` as the text of the traces of every held figure and
            display it through the "%{text}" text template.
        '''
        for fig in self.figures:
            fig.update_traces(texttemplate="%{text}", text=text_arr)

    def add_annotation_for_figure(self, text, x_anchor="right", y_anchor="top", x=1, y=-0.1):
        '''
        Add the same paper-referenced annotation (no arrow offset) to every held figure.

        text: annotation text
        x_anchor: which side of the text is pinned to `x`
        y_anchor: which side of the text is pinned to `y`
        x: horizontal position in paper coordinates
        y: vertical position in paper coordinates
        '''
        for fig in self.figures:
            annotation = {
                "xref": "paper",
                "yref": "paper",
                "xanchor": x_anchor,
                "yanchor": y_anchor,
                "x": x,
                "y": y,
                "text": text,
                "ax": 0,
                "ay": 0,
            }
            fig.add_annotation(annotation)


    
    

In [None]:
class SingleDistributionUtility(PlotlyChartUtilities):
    '''
        Chart helpers for figures that visualise a single value-count distribution.

        distrib: DataFrame holding the distribution (counts plus label columns)
        figures: figure or list of figures to decorate
        custom_sample_size: sample size to display instead of the summed counts
        distrib_column: name of the count column in `distrib`
        percent_column: column with percentage labels (falls back to "percentage")
        text_column: column with full text labels (falls back to "plotly_text")
    '''

    def __init__(self, distrib: pd.DataFrame, figures: list[go.Figure], custom_sample_size=None, distrib_column=None, percent_column=None, text_column=None):
        super().__init__(figures)
        self.distrib = distrib
        self.custom_sample_size = custom_sample_size
        self.distrib_column = distrib_column
        self.percent_column = percent_column
        self.text_column = text_column

    def automate_plot(self, is_text=True, use_percentage=True):
        '''
            Strip Plotly Express overrides, add a "Sample Size" annotation and,
            when `is_text` is set, label the traces.

            use_percentage: label with the percentage column (cast to str)
            instead of the full text column.
        '''
        count_column = self.distrib_column

        # With neither a count column nor a custom sample size supplied, use
        # the first column that is not called "index".
        if count_column is None and self.custom_sample_size is None:
            count_column = next(
                (name for name in self.distrib.columns if not name == "index"),
                count_column,
            )

        self.remove_px_attributes()

        # add sample size annotation
        if self.custom_sample_size is not None:
            sample_size = self.custom_sample_size
        else:
            sample_size = sum(self.distrib[count_column])
        self.add_annotation_for_figure(f"Sample Size: {sample_size}")

        if not is_text:
            return

        # Percentages go in the text and counts stay in the hover; the original
        # author considers percentages more relevant in machine learning settings.
        if use_percentage:
            label_column = self.percent_column if self.percent_column is not None else "percentage"
            labels = self.distrib[label_column].astype(str)
        else:
            label_column = self.text_column if self.text_column is not None else "plotly_text"
            labels = self.distrib[label_column]
        self.add_text(labels)



        




        

In [None]:
def convert_figures_to_figurewidgets(figures : list[go.Figure]) -> list[go.FigureWidget]:
    '''
        Return a go.FigureWidget for each figure in `figures`, built from the
        figure's trace JSON and its layout.
    '''
    widgets = []
    for figure in figures:
        trace_payloads = [trace.to_plotly_json() for trace in figure["data"]]
        widgets.append(go.FigureWidget(data=trace_payloads, layout=figure["layout"]))
    return widgets


def remove_px_attributes(fig : go.Figure,
                         remove_hovertemplate=True,
                         remove_mode=True):
    '''
        Strip the trace attributes that Plotly Express sets explicitly and that
        therefore override the defaults of a template.

        The figure is rebuilt in place: its traces are snapshotted, cleared,
        and re-added from their JSON form minus the selected keys.

        remove_hovertemplate: drop each trace's "hovertemplate"
        remove_mode: drop each trace's "mode"
    '''
    keys_to_drop = []
    if remove_hovertemplate:
        keys_to_drop.append("hovertemplate")
    if remove_mode:
        keys_to_drop.append("mode")

    snapshot = go.Figure(fig)
    fig.data = []

    for trace in snapshot.data:
        # The JSON form carries the trace "type", so add_trace() recreates a
        # trace of the same kind.
        trace_json = trace.to_plotly_json()
        for key in keys_to_drop:
            trace_json.pop(key, None)

        fig.add_trace(trace_json)

In [None]:
from plotly.subplots import make_subplots

def generate_subplots(figures, custom_specs=None, desired_rows=None, desired_columns=None, titles=None):
    '''
        Combine several figures into one subplot figure, one source figure per
        grid cell, filling the grid row by row.

        figures: list of Plotly figures whose traces are copied
        custom_specs: optional `specs` grid for make_subplots (list of rows);
            cells set to None (e.g. the cells covered by a "colspan") are skipped
        desired_rows / desired_columns: grid size. Either one may be omitted:
            it is then taken from `custom_specs` when given, otherwise computed
            from the number of figures. With both omitted and no specs, the
            figures are laid out in a single row.
        titles: optional subplot titles, passed to make_subplots

        Returns the new figure with the "bnw" template applied.
        Raises ValueError when a figure with traces has no grid cell left.
    '''
    num_figures = len(figures)

    # Fill in whichever grid dimension the caller left out.
    if custom_specs is not None:
        if desired_rows is None:
            desired_rows = len(custom_specs)
        if desired_columns is None:
            desired_columns = max((len(spec_row) for spec_row in custom_specs), default=1)
    if desired_rows is None and desired_columns is None:
        desired_rows, desired_columns = 1, max(num_figures, 1)
    elif desired_columns is None:
        # Ceiling division: enough columns to hold every figure.
        desired_columns = max(-(-num_figures // desired_rows), 1)
    elif desired_rows is None:
        desired_rows = max(-(-num_figures // desired_columns), 1)

    if custom_specs is None:
        specs = [[{} for _ in range(desired_columns)] for _ in range(desired_rows)]
    else:
        specs = custom_specs

    fig = make_subplots(rows=desired_rows, cols=desired_columns,
                        specs=specs,
                        subplot_titles=titles)

    # Row-major list of usable cells; None entries in specs hold no subplot.
    free_cells = iter([
        (row, col)
        for row in range(1, desired_rows + 1)
        for col in range(1, desired_columns + 1)
        if specs[row - 1][col - 1] is not None
    ])

    for figure in figures:
        cell = next(free_cells, None)
        for trace in figure["data"]:
            if cell is None:
                raise ValueError(
                    f"{num_figures} figures do not fit in a "
                    f"{desired_rows}x{desired_columns} subplot grid"
                )
            # add_trace replaces the deprecated append_trace
            fig.add_trace(trace, row=cell[0], col=cell[1])

    fig.update_layout(template="bnw")

    return fig

# fig1 = go.Figure(go.Scatter(x=[1,2,3], y=[4,5,6], mode="lines+markers"))
# fig2 = px.line(x=[1,2,3], y=[7,8,9])
# fig2.update_traces(line=dict(color="firebrick"), mode="lines")
# remove_px_attributes(fig2, remove_hovertemplate=True, remove_mode=False)
# fig3 = px.bar(pd.DataFrame({"a" : [1,1,3], "b": [7,8,9], "c" : ["blue", "brown", "brown"]}),
#               x="a", y="b", color="c")
# remove_px_attributes(fig3)


# specs=[
#     [{}, {}],
#     [{"colspan": 2}, None]
# ]
# figs = [fig1, fig2, fig3]
# generate_subplots(figs, custom_specs=specs, desired_rows=2)

In [None]:
def add_annotation_for_figure(fig, text, x_anchor="center", y_anchor="top", x=1, y=-0.1):
    '''
    Add a paper-referenced annotation (no arrow offset) to a figure.

    fig: Plotly figure on which the annotation is added
    text: annotation text
    x_anchor: which side of the text is pinned to `x`
    y_anchor: which side of the text is pinned to `y`
    x: horizontal position in paper coordinates
    y: vertical position in paper coordinates
    '''
    annotation = {
        "xref": "paper",
        "yref": "paper",
        "xanchor": x_anchor,
        "yanchor": y_anchor,
        "x": x,
        "y": y,
        "text": text,
        "ax": 0,
        "ay": 0,
    }
    fig.add_annotation(annotation)

# a = pd.DataFrame({"a" : [1,2], "b" : [3,4]})
# fig = px.line(a, x="a", y="b")
# add_annotation_for_figure(fig, len(a))
# fig

In [None]:
def add_percent_labels_for_hist(fig : go.Figure, series : pd.Series=None, horizontal : bool=None, custom=False, custom_text_arr=None):
    '''
        Write a text label onto every trace of `fig`, in place.

        fig: Plotly figure whose traces are labelled
        series: the whole population; each trace's share is its number of
            points divided by the number of non-null entries of `series`
        horizontal: True when the values sit on the y axis, otherwise the x
            axis is used
        custom: when True, skip the computation and use `custom_text_arr`
        custom_text_arr: one label per trace, used only when `custom` is True

        Computed labels look like "<count> (<percent>%)" with two decimals.
        Only works for charts with one trace per category.
    '''
    if custom is True:
        for position, trace in enumerate(fig.data):
            trace["text"] = f"{custom_text_arr[position]}"
        return

    axis = "y" if horizontal is True else "x"

    # All percentages are computed before any trace is modified.
    shares = [
        format(len(trace[axis]) / series.notna().sum() * 100, ".2f") + "%"
        for trace in fig.data
    ]
    for trace, share in zip(fig.data, shares):
        trace["text"] = f"{len(trace[axis])} ({share})"

# a = pd.DataFrame({"a": ["a", "a", "b", "b", "b"]})
# fig = px.histogram(a, y="a", color="a") # Try changing between x and y
# fig.update_xaxes(categoryorder="category descending")
# # add_percent_labels_for_hist(a["a"], fig, horizontal=True) # Try changing between horizontal = True and horizontal = False
# fig

In [None]:
def change_to_dark_mode_plotly(fig):
    '''
        Restyle a Plotly figure in place with a dark background and light
        text. When the figure has update menus, the first one gets a light
        background with black text.
    '''
    dark_background = '#2d3035'
    fig.update_layout(
        paper_bgcolor=dark_background,
        font=dict(color="#dcdcdc"),
        plot_bgcolor=dark_background,
        title_font=dict(color="#dfdfdf"),
    )

    menus = fig.layout.updatemenus
    if not menus:
        return

    menus[0].bgcolor = "#ECECEC"
    menus[0].font = dict(color="#000000")


# a = pd.DataFrame({"a": [1,2,3]})
# fig = px.histogram(a, x="a")
# change_to_dark_mode_plotly(fig)
# fig.update_layout(title="Hi")

In [None]:
def break_text(texts, char_limit=20):
    '''
        Insert "<br>" line breaks into text so it wraps roughly every
        `char_limit` characters.

        For each multiple of `char_limit` (up to the length of the original
        text) the first space at or after that position is replaced by "<br>".
        Breaks only happen at spaces, so words are never split and a line can
        be longer than `char_limit`. Positions are measured in the text as it
        grows, i.e. already-inserted "<br>" markers count towards them.

        texts: a single string, or a list/tuple of strings
        char_limit: spacing between the positions at which a break is attempted

        Returns the broken string for a string input and a list of broken
        strings for a list/tuple input. Any other input type returns None.

        When no space is left at or after a break position the text is left
        alone from there on; no trailing "<br>" is appended.
    '''
    def _break_one(text):
        broken = text
        for start in range(char_limit, len(text) + 1, char_limit):
            space_idx = broken.find(" ", start)
            if space_idx == -1:
                # No space left: later (larger) positions cannot match either.
                break
            broken = broken[:space_idx] + "<br>" + broken[space_idx + 1:]
        return broken

    if isinstance(texts, str):
        return _break_one(texts)

    if isinstance(texts, (list, tuple)):
        return [_break_one(text) for text in texts]

    return None
    

# break_text("It was a fine spring evening. I ate a lot of potatoes. Ggz bro")

In [None]:
def create_table(a_new,
                 column=None,
                 label=None,
                 reindex=False,
                 reindex_order : list[str] = None, 
                 headers=["Answer", "Number of Students"],
                 cell_height=TABLE_CELL_HEIGHT_DEFAULT,
                 header_height=TABLE_HEADER_HEIGHT_DEFAULT ,
                 base_height=320,
                 include_percentage=True,
                 limit=None):
    '''
        Build a Plotly table figure of value counts.

        a_new: the data; counted directly when `column` is None, otherwise
            `a_new[column]` is counted
        column: optional column of `a_new` to count
        label: figure title (defaults to `column`)
        reindex / reindex_order: unless `reindex` equals False, the counts are
            reindexed to `reindex_order` before being tabulated
        headers: the table header texts
        cell_height / header_height: row heights; font sizes are derived from them
        base_height: base figure height passed to calc_table_height()
        include_percentage: render counts as "<count> (<percent>%)", where the
            percentage is relative to the non-null entries of the counted data
        limit: when truthy, keep only the first `limit` rows

        Returns the table as a go.Figure.
    ''' 
    if label is None:
        label = column

    counted = a_new if column is None else a_new[column]

    counts_frame = pd.DataFrame(counted.value_counts())
    if not (reindex == False):
        counts_frame = counts_frame.reindex(reindex_order)
    a_new_count = counts_frame.reset_index()

    if limit:
        a_new_count = a_new_count.head(limit)

    if include_percentage:
        count_column = a_new_count.columns[1]
        count_text = a_new_count[count_column].astype(str)
        percent_text = (a_new_count[count_column] / counted.notna().sum() * 100).round(2).astype(str)
        a_new_count[count_column] = count_text + " (" + percent_text + "%" + ")"

    header_style = dict(
        values=headers,
        height=header_height,
        fill_color=px.colors.qualitative.Pastel2[0],
        font=dict(color="white", size=header_height/4),
        line_color=BACKGROUND_COLOR_DEFAULT,
    )
    cell_style = dict(
        values=a_new_count.T.values,
        height=cell_height,
        font=dict(size=cell_height/3.5),
        fill_color=px.colors.qualitative.Pastel1[2],
        line_color=BACKGROUND_COLOR_DEFAULT,
    )
    fig = go.Figure(go.Table(header=header_style, cells=cell_style))

    table_height = calc_table_height(a_new_count, height_per_row=cell_height, base=base_height)
    fig.update_layout(margin=dict(l=2, r=2),
                      height=table_height,
                      title_text=label,
                      plot_bgcolor=BACKGROUND_COLOR_DEFAULT,
                      paper_bgcolor=BACKGROUND_COLOR_DEFAULT)
    return fig


# create_table(pd.Series([1, 1, 2, 2, 2], name="hi"))

In [None]:
def create_histogram(series : pd.Series, horizontal=False, color=True):
    '''
        Build a Plotly Express histogram of `series` with a
        "<count> (<percent>%)" label on every trace.

        series: values to plot; its name is used for the axis and legend titles
        horizontal: when not False, the axis titles are swapped (series name on
            the y axis, "count" on the x axis). The histogram itself is built
            the same way in both cases.
        color: unless False, traces are coloured by value (one trace per value)

        Returns the figure.
    '''
    if color is False:
        fig = px.histogram(series)
    else:
        fig = px.histogram(series, color="value")

    if horizontal is False:
        fig.update_xaxes(title_text = series.name)
        fig.update_yaxes(title_text = "count")
    else:
        fig.update_yaxes(title_text = series.name)
        fig.update_xaxes(title_text = "count")

    # The helper's signature is (fig, series, ...); the arguments used to be
    # passed the other way round, which made this call fail.
    add_percent_labels_for_hist(fig, series)
    fig.update_layout(legend_title=series.name)
    return fig


# create_histogram(pd.Series(["a", "a", "b", "b", "b"], name="hi"))
def create_bar(df, x_column, y_column, horizontal=False, color=True):
    '''
        Build a Plotly Express bar chart from pre-computed values.

        Bars are used for situations where the distribution does not reflect the population. 
        Ex. Multiple Choice answers

        df: DataFrame holding the data
        x_column: name of the category column
        y_column: name of the value column
        horizontal: when not False, draw horizontal bars (categories on the
            y axis, values on the x axis)
        color: when True, colour the bars by category

        The category axis and the legend are titled `x_column`; the value axis
        is titled "count". Returns the figure.
    '''
    if horizontal is False:
        plot_kwargs = dict(x=x_column, y=y_column)
    else:
        # Swap the axes so the bars match the swapped axis titles below.
        plot_kwargs = dict(x=y_column, y=x_column, orientation="h")

    if color is True:
        plot_kwargs["color"] = x_column

    fig = px.bar(df, **plot_kwargs)

    # The titles used to read `series.name`, but no `series` exists in this
    # function; the category column name is what was meant.
    if horizontal is False:
        fig.update_xaxes(title_text = x_column)
        fig.update_yaxes(title_text = "count")
    else:
        fig.update_yaxes(title_text = x_column)
        fig.update_xaxes(title_text = "count")

    fig.update_layout(legend_title=x_column)
    return fig

def create_pie(series: pd.Series):
    '''
        Build a pie chart whose slices are named by the values of `series`.
        Each slice is labelled "<label><br><value> (<percent>)" and the figure
        margins are tightened. Returns the figure.
    '''
    slice_label = "%{label}<br>%{value} (%{percent})"
    tight_margin = dict(l=5, r=2, t=0, b=0)

    fig = px.pie(series, names=series.name)
    fig.update_traces(texttemplate=slice_label)
    fig.update_layout(margin=tight_margin)
    return fig

In [None]:
def create_rating_distributions(score_suffix, title_text, star_distrib : pd.Series):
    '''
        Build the two figures of a rating summary.

        score_suffix: text appended to the average, e.g. a unit
        title_text: title of the distribution chart
        star_distrib: one rating per respondent

        Returns (fig_avg, fig):
          fig_avg - an axis-less card showing the mean rating (two decimals),
                    an "Average" caption, the sample size and a star icon
          fig     - a horizontal histogram of the ratings, ordered by rating,
                    with a "<count> (<percent>%)" label on every bar
    '''
    fig_avg = go.Figure()
    fig_avg.update_xaxes(visible=False)
    fig_avg.update_yaxes(visible=False)
    fig_avg.update_layout(paper_bgcolor=BACKGROUND_COLOR_DEFAULT,
                    height=300, margin=dict(l=0, r=0, t=60, b=60))
    fig_avg.add_annotation(
        dict(xref="paper", yref="paper", xanchor="center", yanchor="middle", x=0.5, y=0.5, ax=0, ay=0,
        text=f"{format(star_distrib.mean(), '.2f')} {score_suffix}", font=dict(size=22))
    )

    fig_avg.add_annotation(
        dict(xref="paper", yref="paper", xanchor="center", yanchor="middle", x=0.5, y=0.3, ax=0, ay=0,
        text="Average", font=dict(size=16))
    )

    fig_avg.add_annotation(
        dict(xref="paper", yref="paper", xanchor="right", yanchor="middle", x=0.98, y=0.1, ax=0, ay=0,
        text=f"Sample Size: {len(star_distrib)}", font=dict(size=12))
    ) 

    fig_avg.add_layout_image(
        dict(source="./icons/star.png", xref="paper", yref="paper", xanchor="center", yanchor="middle", x=0.35, y=0.525,
        sizex=0.6, sizey=0.6, sizing="contain")
    )

    # Ratings are plotted as strings so every rating is its own category.
    star_distrib_str = star_distrib.astype(str)

    fig = px.histogram(star_distrib_str, y=star_distrib_str.name, 
                  color=star_distrib_str.name)
    # Order the categories by rating value rather than by their string form.
    fig.update_yaxes(title="Rating", categoryorder="array", 
                    categoryarray=
                    star_distrib
                    .sort_values()
                    .astype(str).unique(),
                    showgrid=False)
    fig.update_traces(hovertemplate=less_cringe_hovertext_template["histogram_h"]["hovertemplate"],
                    marker=dict(color="rgb(253, 240, 54)", 
                                line=dict(color="rgb(0, 0, 0)", width=2)
                    )
    )

    # The helper's signature is (fig, series, ...); the arguments used to be
    # passed the other way round, which made this call fail.
    add_percent_labels_for_hist(fig, star_distrib, horizontal=True)

    fig.update_layout(
        bargap=0.5,
        title=dict(
            text=title_text,
            xanchor="left",
            yanchor="bottom",
            y=0.93,
            x=0.05,
            font=dict(size=18)
        ),
        margin=dict(
            t=100,
            pad=7
        )
    )

    return fig_avg, fig

## Wordmap Functions

In [None]:
# Saves a wordcloud into a specific file path and plot. Then plot the image
def analyze_word_map_frequencies(series: pd.Series, columns=["language", "num_people_speaking"],
                                 delimiter=r",\s*") -> pd.DataFrame:
    """
    Count how often each delimited item occurs across all entries of `series`.

    series: entries holding one or more items separated by `delimiter`
    columns: names of the two output columns (item, count)
    delimiter: regular expression used to split every entry

    Missing entries are skipped and every other entry is converted to text
    before splitting, so NaN or numeric values no longer raise.

    Returns a two-column DataFrame sorted by descending count; items with the
    same count keep the order in which they were first seen.
    """
    frequencies = defaultdict(int)
    for value in series.dropna():
        for word in re.split(delimiter, str(value)):
            frequencies[word] += 1

    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    return pd.DataFrame(ranked, columns=columns)

def generate_word_map(series : pd.Series, delimiter=", ", file_path=None, width=1000, height=600, max_words=200, is_frequency=False,
                      collocations=True, relative_scaling=0.75):
    '''
        Generate a word cloud from `series` and optionally save it as an image.

        series: the texts to visualise
        delimiter: regular expression separating the phrases of one entry
            (only used when `is_frequency` is True)
        file_path: where to save the image; nothing is written when it is None
        width / height: image size in pixels
        max_words: maximum number of words drawn
        is_frequency: when False the entries are joined into one text and
            tokenised by WordCloud; otherwise every delimited phrase is counted
            as a unit and the cloud is sized by those counts
        collocations / relative_scaling: passed through to WordCloud

        Returns the generated WordCloud object.
    '''
    cloud = WordCloud(max_words=max_words, mask=None, 
    stopwords=stopwords, width=width, height=height, colormap="Set2", mode="RGBA",
    background_color=None,
    min_font_size=8, max_font_size=125, relative_scaling=relative_scaling, collocations=collocations)

    if is_frequency is False:
        cloud.generate(" ".join(map(str, series)))
    else:
        phrase_counts = analyze_word_map_frequencies(series, columns=["phrase", "count"], 
                                                     delimiter=delimiter)
        frequencies = dict(zip(phrase_counts["phrase"], phrase_counts["count"].tolist()))

        cloud.generate_from_frequencies(frequencies)

    # file_path defaults to None; saving unconditionally made the default unusable.
    if file_path is not None:
        cloud.to_file(file_path)

    return cloud



def plot_word_map(file_path : str):
    '''
        Load a word cloud image and return it as a Plotly image figure with
        grid lines, tick labels, zero lines, margins and hover disabled.

        file_path : path to wordcloud image    
    '''
    hidden_axis = {'showgrid': False, 'showticklabels': False, 'zeroline': False}
    no_margin = dict(autoexpand=False, b=0, l=0, r=0, t=0)

    image = io.imread(file_path)
    fig = px.imshow(image, binary_compression_level=6)
    fig.update_layout(
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
        margin=no_margin,
        hovermode=False,
    )

    return fig


# generate_word_map(pd.Series(["ash", "ash", "bet", "bet", "bet"]), is_frequency=True)



## Map Functions

In [None]:
## Map setup, see Location Profile for Usage
def map_location_setup(location_series: pd.Series, data_coordinates: list[tuple]):
    """
    Pair every distinct location with its coordinates and its number of occurrences.

    location_series: one location name per record
    data_coordinates: one coordinate tuple per distinct location, in the order
        in which the locations first appear in `location_series`

    Returns a DataFrame indexed by location name with a "Coordinates" column
    followed by the value-count column.
    """
    distinct_locations = location_series.unique().tolist()
    coordinates_by_location = pd.DataFrame(
        {"Coordinates": data_coordinates},
        index=distinct_locations,
    )
    occurrences = location_series.value_counts()

    # Join on the location names held in both indexes.
    return pd.merge(
        coordinates_by_location,
        occurrences,
        left_index=True,
        right_index=True,
    )

################################################# CAN'T MODULARIZE FOLIUM FOR SOME REASON ############33
# def add_map_markers(a: pd.DataFrame, count_help_text,
#                     icon_object=None):
#     """
#         DataFrame format should look like 

#         | Coordinates | <Count_Column>       
#         -------------------------------

#         Index should be assigned to name of the location.
#         Output from map_location_setup is an acceptable input
#     """
#     m = create_map()
#     if icon_object is None:
#         icon_object = folium.Icon(icon="school", color='lightblue', prefix='fa')

#     for i in range(a.shape[0]):
#         current_data = a.iloc[i]
#         current_name = current_data.name
#         number_per_name = current_data[a.columns[1]]
#         html = f'''
#         <div style="display: flex; justify-content: left; flex-direction: column;">
#             <div style="padding:0 10px 10px 0;color:grey"><b>{current_name}</b></div>
#             <div style="padding:0 10px 10px 0;font-size:40;font-weight:100;text-align:center">{number_per_name}</div>
#             <div style="text-align:center;padding:0 10px 0 0;">{count_help_text}</div>
#         </div>
#         '''

#         iframe = folium.IFrame(html=html, width=170, height=170)
#         icon = icon_object
#         popup = folium.Popup(iframe)
#         folium.Marker(location=current_data["Coordinates"], popup=popup, icon=icon
#         ).add_to(m)

#     return m
    

def create_map():
    """
    Return a zoomed-out folium map on "cartodbpositron" tiles with a scale control.
    """
    # Use coordinates of Vancouver as dummy coordinates
    vancouver = (49.2827, -123.1207)
    return folium.Map(
        location=vancouver,
        tiles='cartodbpositron',
        control_scale=True,
        zoom_start=2,
    )

def get_figure_for_map(m: folium.Map, height=400):
    """
    Attach map `m` to a new folium Figure of the given height and return the Figure.
    """
    container = folium.Figure(height=height)
    m.add_to(container)
    return container

In [None]:
# Additional Plotly figure configurations

# Hover templates keyed by chart kind. create_rating_distributions() reads the
# "histogram_h" entry; the other entries are not referenced in the visible code.
less_cringe_hovertext_template = dict(
    bar=dict(hovertemplate="<b>%{x}</b><br><i>Count</i>: %{y}"),
    scatter=dict(hovertemplate="<b>%{x}</b><br><i>Count</i>: %{y}"),
    pie=dict(hovertemplate="<b>%{label}</b><br><i>Count</i>: %{value}"),
    histogram=dict(hovertemplate="<b>%{x}</b><br><i>Count</i>: %{y}"),
    histogram_h=dict(hovertemplate="<b>%{y}</b><br><i>Count</i>: %{x}") # histogram_horizontal
)

# NOTE(review): presumably passed as the `config` argument when showing a figure — confirm.
no_scroll_zoom_config = dict(
    scrollZoom=False
) 

# NOTE(review): presumably passed as the `config` argument when showing a figure — confirm.
static_plot_config = dict(
    staticPlot=True
) 

In [None]:
def data_checker(a: pd.Series, check_unique=False, check_null=True):
    '''
    Print a quick summary of one column of data.

    a: the column to inspect, as a Series
    check_unique: when True, print the sorted unique values. Numeric columns
        are sorted by value. Other columns are sorted case-insensitively when
        their values support the ``.str`` accessor, and by plain value otherwise.
    check_null: when True (the default), print the number of missing values.

    Returns None; all results are printed.

    Example:

        a = pd.DataFrame([[1], [2], [2]], columns=["A"])
        data_checker(a["A"], check_unique=True)

    Out:
        Unique values [1 2]
        Number of NA Values: 0
    '''
    if check_unique:
        if pd.api.types.is_numeric_dtype(a.dtype):
            # Every numeric dtype, not just float64, sorts by value.
            ordered = a.sort_values()
        else:
            try:
                ordered = a.sort_values(key=lambda x: x.str.lower())
            except AttributeError:
                # Values without string methods (e.g. datetimes) fall back to
                # a plain sort instead of crashing.
                ordered = a.sort_values()
        print("Unique values", ordered.unique())

    if check_null:
        print("Number of NA Values:", int(a.isnull().sum()))

# a = pd.DataFrame([[1], [2], [2]], columns=["A"])
# data_checker(a["A"], check_unique=True)

def calc_table_height(a, base=208, height_per_row=25, char_limit=30, height_padding=16.5) -> float:
    '''
    Used to automatically calculate the height of a table visualization

    a: The dataframe with only the columns you want to plot
    base: The base height of the table (header without any rows)
    height_per_row: The height that one row requires
    char_limit: If the length of a value crosses this limit, the row's height needs to be expanded to fit the value
    height_padding: Extra height in a row when a length of value exceeds char_limit

    Returns base + one height_per_row per row + one height_padding for every
    row that has at least one value longer than char_limit (measured on
    str(value)). The result is a float whenever a non-integer padding is added.

    Source: https://stackoverflow.com/questions/48223370/python-plotly-autosize-table-plot
    '''
    num_rows, num_columns = a.shape
    total_height = 0 + base
    for x in range(num_rows):
        total_height += height_per_row
        # Purely positional lookup: a.iloc[x][y] indexed the row Series with an
        # integer key, which pandas treats as a label for label-indexed columns.
        if any(len(str(a.iloc[x, y])) > char_limit for y in range(num_columns)):
            total_height += height_padding
    return total_height
    

## Dropdown Functions

TO-DO

In [None]:
def create_table_with_dropdown(a_new,
                               columns,
                               labels : list[str] = None,
                               reindex : list[bool] = None, # Example: [False, False]
                               reindex_orders : list[list[str]] = None, # Example: [None, ["A", "B"]]. Reindex only second column 
                               headers=["Answer", "Number of Students"],
                               limit=None,
                               base_height=320,
                               include_percentage=True,
                               header_height=TABLE_HEADER_HEIGHT_DEFAULT,
                               cell_height=TABLE_CELL_HEIGHT_DEFAULT
                               ):
    """
        Count unique values for every column and display those counts in a table.
        A dropdown menu switches the table between the columns.

        a_new: dataframe containing values to visualize
        columns: columns from the dataframe to visualize
        labels: dropdown label / figure title for each column (defaults to `columns`)
        reindex: one flag per column; when True, the rows of that column's count
                 table are ordered as listed in reindex_orders
        reindex_orders: one entry per column; the row order to use where the
                 matching reindex flag is True (ignored otherwise). Listed
                 answers that never occur are shown with a count of 0.
        headers: the two header cells of the table
        limit: keep only the first `limit` rows of every count table
        base_height: base height handed to calc_table_height()
        include_percentage: append each count's share of the displayed counts,
                 e.g. "3 (75.0%)"
        header_height: height of the header row; the header font is a quarter of it
        cell_height: height of a body row; the cell font is cell_height / 3.5

        Returns the plotly Figure.

        >>> a = pd.DataFrame({"a" : [1, 1, 2, 2], "b" : [2, 2, 3, 3]})
        >>> create_table_with_dropdown(a, ["a", "b"])

        # Reindex Example:

        >>> a = pd.DataFrame({"a" : ["Male", "Female", "Female", "Female"], "b" : ["A", "D", "B", "C"]})
        >>> create_table_with_dropdown(a, ["a", "b"], reindex=[False, True], reindex_orders=[None, ["A", "B", "C", "D"]])

    """
    if labels is None:
        labels = columns

    array_of_value_counts = []
    for i, column in enumerate(columns):
        column_value_count = pd.DataFrame(a_new[column].value_counts())
        if reindex is not None and reindex[i]:
            # fill_value=0 keeps the counts integral and shows answers nobody
            # gave as 0 instead of "nan (nan%)".
            column_value_count = column_value_count.reindex(reindex_orders[i], fill_value=0)
        array_of_value_counts.append(column_value_count.reset_index())

    if limit:
        # Copy so the percentage column below is not written into a slice.
        array_of_value_counts = [value_count.head(limit).copy() for value_count in array_of_value_counts]

    if include_percentage:
        for value_count in array_of_value_counts:
            count_column = value_count.columns[1]
            counts = value_count[count_column]
            value_count[count_column] = counts.astype(str) + " (" + \
                (counts / counts.sum() * 100).round(2).astype(str) + "%" + ")"

    a_new_count_zero = array_of_value_counts[0]
    fig = go.Figure(go.Table(
        header=dict(values=headers, height=header_height, fill_color=px.colors.qualitative.Pastel2[0], 
                    font=dict(color="white", size=header_height/4), line_color=BACKGROUND_COLOR_DEFAULT), 

        cells=dict(values=a_new_count_zero.T.values, height=cell_height, 
                    font=dict(size=cell_height/3.5), fill_color=px.colors.qualitative.Pastel1[2], 
                    line_color=BACKGROUND_COLOR_DEFAULT)
        )
    )

    # One dropdown button per column: swaps the cell values and resizes the figure.
    buttons = []
    for i, value_count in enumerate(array_of_value_counts):
        buttons.append({
            "label" : labels[i],
            "method": "update",
            "args" : [
                {
                    "cells" : {
                        "values" : value_count.T.values, 
                        # Use the caller's cell height so switching columns keeps
                        # the row height the table was created with.
                        "height": cell_height,
                    }
                },
                { 
                    "title" : labels[i],
                    "height": calc_table_height(value_count, base=base_height),
                    # NOTE(review): trace-style keys passed in the layout half of
                    # the update; kept exactly as before.
                    "cells_line_color": BACKGROUND_COLOR_DEFAULT,
                    "cells_fill_color": px.colors.qualitative.Pastel1[2]
                }
            ]
        })

    fig.update_layout(
        updatemenus=[
            {   
                **default_bar_dropdown_styling,
                "buttons" : buttons
            }
        ]
    )

    # Always set height to the dropdown option with the most data, because it has the max height
    fig.update_layout(title_text=labels[0], margin=dict(l=2, r=2),
                      height=calc_table_height(max(array_of_value_counts, key=lambda x: x.shape[0]), base=base_height),
                      plot_bgcolor=BACKGROUND_COLOR_DEFAULT,
                      paper_bgcolor=BACKGROUND_COLOR_DEFAULT
    )
    # The buttons replace `cells` with only values and height, so the cell
    # colours are also stored as template defaults.
    fig.layout.template["data"]["table"][0]["cells"]["fill"]["color"] = px.colors.qualitative.Pastel1[2]
    fig.layout.template["data"]["table"][0]["cells"]["line"]["color"] = BACKGROUND_COLOR_DEFAULT

    return fig

# Example:
# a = pd.DataFrame({"a" : ["Male", "Female", "Female", "Female"], "b" : ["A", "D", "B", "C"]})
# fig = create_table_with_dropdown(a, ["a", "b"], reindex=[False, True], reindex_orders=[None, ["A", "B", "C", "D"]], base_height=320)
# fig.show(renderer="notebook")


In [None]:
def create_figure_with_dropdown(a_new, 
                                options=["What is your gender?", "Are you a domestic or international student?"],
                                filters=None,
                                labels=None,
                                showticklabels=[True, True],
                                sort_traces=False,
                                is_order_manually=False,
                                manual_category_orders=None,
                                horizontal=False,
                                add_legend=True,
                                textposition="outside"):
    """
        Create a bar chart of answer counts with a dropdown that switches between columns.

        a_new: dataframe containing the answers
        options: columns of a_new to plot, one dropdown entry each
        filters: optional list with one boolean row mask (or None) per option;
                 only rows selected by the mask are plotted for that option
        labels: dropdown label / figure title per option (defaults to `options`)
        showticklabels: one flag per option, whether category tick labels are
                 shown; options beyond the end of the list default to True
        sort_traces: order the categories by their counts
        is_order_manually: use manual_category_orders for the category order
        manual_category_orders: one category order (or None) per option; where
                 an order is given it takes precedence over sort_traces
        horizontal: draw horizontal bars (categories on the y axis)
        add_legend: show the legend
        textposition: where the "count (percent%)" label of each bar is drawn

        One histogram trace is added per unique answer of every option. The
        percentage in a bar's label is relative to the number of non-null
        answers of that column in a_new.

        Returns the plotly Figure.

        # Create Bar Chart Dropdown
        a = pd.DataFrame({"a" : [1, 1, 2, 2], "b" : [2, 2, 3, 3]})
        create_figure_with_dropdown(a, ["a", "b"])

        # Create Bar Chart with Sorted Counts (sort_traces=True). 
        # In this example, sort_traces would cause Female to come first (placed left) in the count plot instead of Male.
        a = pd.DataFrame({"a" : ["Male", "Female", "Female"], "b" : ["A", "A", "B"]})
        create_figure_with_dropdown(a, ["a", "b"], sort_traces=True)
        
    """
    if labels is None:
        labels = options

    # Pad with True so the default two-element list also works for more options.
    tick_flags = list(showticklabels)
    tick_flags += [True] * (len(options) - len(tick_flags))

    def _option_values(i):
        """Answers of option i, restricted by filters[i] when a filter is given."""
        if filters is not None and filters[i] is not None:
            return a_new.loc[filters[i], options[i]]
        return a_new[options[i]]

    def _axis_args(i):
        """Category-axis settings used while option i is displayed."""
        manual_order = None
        if is_order_manually and manual_category_orders is not None:
            manual_order = manual_category_orders[i]

        if manual_order is not None:
            category_order, category_array = "array", manual_order
        elif sort_traces:
            counts = _option_values(i).value_counts()
            if horizontal:
                # Ascending counts for horizontal bars, descending for vertical.
                # NOTE(review): presumably because the y axis lists categories
                # bottom-to-top, so this puts the largest bar on top.
                counts = counts.sort_values()
            category_order, category_array = "array", counts.index.tolist()
        else:
            category_order, category_array = "trace", None

        return {
            "categoryorder" : category_order,
            "categoryarray" : category_array, # only has effect when categoryorder is array
            "showticklabels" : tick_flags[i]
        }

    fig = go.Figure()

    # Add one trace per unique answer, remembering how many traces each option
    # actually produced so the visibility masks line up with fig.data.
    traces_per_option = []
    for i, option in enumerate(options):
        option_values = _option_values(i)
        answered = a_new[option].notna().sum()
        n_traces = 0
        for unique_value in option_values.unique():
            unique_value_trace = option_values[option_values == unique_value]
            # NaN never equals itself, so it (like any answer with no rows)
            # yields an empty selection and gets no trace.
            if unique_value_trace.shape[0] == 0:
                continue

            bar_text = f'{len(unique_value_trace)}' + \
                f' ({format(len(unique_value_trace)/answered * 100, ".2f")}%)'
            data_axis = {"y": unique_value_trace} if horizontal else {"x": unique_value_trace}
            fig.add_trace(go.Histogram(
                name=unique_value,
                visible=(i == 0),
                text=bar_text,
                textposition=textposition,
                **data_axis
            ))
            n_traces += 1
        traces_per_option.append(n_traces)

    # visibilities[i] shows exactly the traces that belong to option i
    visibilities = [
        [i == j for j, n_traces in enumerate(traces_per_option) for _ in range(n_traces)]
        for i in range(len(options))
    ]

    ## Axis properties for each option
    args_arr = [_axis_args(i) for i in range(len(options))]
    axis_key = "yaxis" if horizontal else "xaxis"

    fig.update_traces(hovertemplate="<b>%{y}</b><br><i>Count:</i> %{x}" if horizontal
                                    else "<b>%{x}</b><br><i>Count:</i> %{y}", marker_autocolorscale=True)
    fig.update_layout(
        updatemenus=[
            dict(
                **default_bar_dropdown_styling,
                buttons=[
                    dict(label=labels[i],
                         method="update",
                         args=[
                            {"visible" : visibilities[i]},
                            {"title" : labels[i],
                             axis_key : args_arr[i]
                             },
                         ])
                    for i in range(len(options))]
            )
        ],
        colorway=px.colors.qualitative.Pastel2,
        paper_bgcolor=BACKGROUND_COLOR_DEFAULT,
        plot_bgcolor=BACKGROUND_COLOR_DEFAULT
    )

    # Start with the same axis settings that the first dropdown button applies.
    if horizontal:
        fig.update_yaxes(**args_arr[0])
    else:
        fig.update_xaxes(**args_arr[0])

    fig.update_layout(title_text=labels[0], margin=dict(l=2, r=2),
                      dragmode="pan", showlegend=False)
    
    fig.update_xaxes(fixedrange=True)
    fig.update_yaxes(fixedrange=True)
    
    if add_legend is True:            
        fig.update_layout(legend=dict(yanchor="bottom", xanchor="right",
                                x=1, y=0, orientation="v"),
                                showlegend=True)
    return fig

# a = pd.DataFrame({"a" : ["Male", "Female", "Female", "Female", "Female", "Female", "Female" "Female", "Female", 
#                           "Female", "Female", "Male", "Male", "Male", "Male"], 
#                     "b" : ["A", "A", "B", "B", "B", "C", "C", "C", "C", "D", "D", "D", "D", "D"]})
# fig = create_figure_with_dropdown(a, ["a", "b"],
#                             textposition="inside", showticklabels=[False, False], sort_traces=True, horizontal=True)
# fig.show(renderer="notebook")

# fig.update_layout(showlegend=True, 
#                   legend=dict(yanchor="top", xanchor="center",
#                               orientation="h",
#                               x=0.5, y=-0.1))



# a = pd.DataFrame({"a" : ["Male", "Female", "Female"], "b" : ["A", "A", "B"]})
# create_figure_with_dropdown(a, ["a", "b"], sort_traces=False) # Default to False


# a = pd.DataFrame({"a" : ["He", "He", "He", "Loves", "Eating", "Strawberry", "Strawberry", "Wee"],
#                    "b" : ["Sa", "Sa", "Sa", "Sa", "Sa", "Sa", "Ge", "Yo"]})
# create_figure_with_dropdown(a, options=["a", "b"], is_wordcloud=False)

# Clustering Functions

For finding the ground truths behind messy typos

In [None]:
def generate_sim_matrix(series: pd.Series) -> pd.DataFrame:
    """
    Build a pairwise similarity matrix between the unique, non-null values of `series`.

    series: the text inputs to compare (values must support len(), i.e. strings)

    Returns a square DataFrame whose index and columns are both the unique
    values. The diagonal is 0; every other entry lies in [-1, -0.001], where
    values closer to 0 mean the two inputs are more alike.
    """
    from scipy.spatial.distance import pdist, squareform
    from similarity.longest_common_subsequence import LongestCommonSubsequence

    LCS = LongestCommonSubsequence()
    def compare_lcs(u, v):
        # pdist passes rows of the (n, 1) array, so the text is element 0
        first, second = u[0], v[0]
        min_length = min(len(first), len(second))
        if min_length == 0:
            # An empty input shares nothing with any other input; avoids dividing by zero.
            return -1.0
        # NOTE(review): assumes LCS.distance(a, b) == len(a) + len(b) - 2 * len(LCS),
        # which makes this the length of the longest common subsequence - confirm
        # against the similarity package.
        common_length = abs((LCS.distance(first, second) - len(first) - len(second)) / 2)
        # Capped at -0.001 so that only the diagonal of the matrix is exactly 0.
        return min(-0.001, -(1 - (common_length / min_length)))
        # return -((1 - cosine.similarity_profiles(cosine.get_profile(u[0]), cosine.get_profile(v[0]))) + levenshtein.distance(u[0], v[0]))

    # Computed once so rows, columns and labels are guaranteed to agree.
    # Nulls are dropped because they have no length to compare.
    unique_values = series.dropna().unique()
    if len(unique_values) == 0:
        return pd.DataFrame(index=unique_values, columns=unique_values, dtype=float)

    precomputed_similarity_matrix = squareform(pdist(np.expand_dims(unique_values, axis=1), compare_lcs)) 
    df_precompute_sim_matrix = pd.DataFrame(
        precomputed_similarity_matrix,
        index=unique_values,
        columns=unique_values,
    )
    return df_precompute_sim_matrix

# Generate similarity matrix between every possible professor input
# df_precompute_sim_matrix

# Distribution of Similarity Matrix
# px.histogram(df_precompute_sim_matrix[df_precompute_sim_matrix != 0].max(axis=1), title="Distribution of Largest Common Subsequence Metric")

# ### Removing Unique Professor Names
# We apply this filter to try our best not to cluster professor names that were only entered in one way. 
# For example, if we all entered Hamid's name as Hamid, then there is no need to rewrite Hamid in any other way.

# I only included inputs that have an LCS distance > -0.3, because most inputs that have an LCS distance < -0.3 appeared to be unique professor names. I deduced this through trial and error. For each input, I found its most closely associated inputs (those with the LCS distance closest to 0) and put this information in a dataframe.
# def gather_closest_distances_for_each_word(row):
#     closest_distances = row[row == row[row != 0].max()]
#     best_matches_for_each_word = closest_distances.index.tolist()
#     return [best_matches_for_each_word, row[row != 0].max()]


# df_MAT = MAT.apply(gather_closest_distances_for_each_word, axis=1, result_type="expand")
# df_MAT

# Per the analysis above, we only include a word if it has at least one LCS distance > -0.3 (i.e. closer to 0) with another word.

def remove_unique(df_precompute_sim_matrix: pd.DataFrame, threshold=0.3): 
    """
    Drop the inputs that have no close match among the other inputs.

    df_precompute_sim_matrix: square similarity matrix with identical index and column labels
    threshold: an input is kept when at least one of its non-zero similarities is greater than -threshold

    Returns the matrix restricted, on both rows and columns, to the kept inputs.
    """
    is_close_match = (df_precompute_sim_matrix > -threshold) & (df_precompute_sim_matrix != 0)
    # Column-wise: does this input have at least one close match?
    has_close_match = is_close_match.any()
    kept_rows = df_precompute_sim_matrix.loc[has_close_match]
    return kept_rows[kept_rows.index]


# Fit data to AP Cluster model
def fit_data(df_precompute_sim_matrix: pd.DataFrame):
    """
    Fit an Affinity Propagation model on a precomputed similarity matrix.

    df_precompute_sim_matrix: square matrix of pairwise similarities

    Returns the fitted AffinityPropagation estimator.
    """
    from sklearn.cluster import AffinityPropagation

    similarities = df_precompute_sim_matrix.to_numpy()
    # Fixed random_state so repeated runs fit the same model
    model = AffinityPropagation(random_state=5, affinity="precomputed")
    return model.fit(similarities)
# ### Automated Text Cleaning Using AP Clustering

def generate_replacement_file(ap_cluster, df_precompute_sim_matrix: pd.DataFrame, output_file=None):
    """
    Interactively decide, cluster by cluster, which inputs to rewrite, and save the answers as JSON.

    ONLY RUN ONCE. The file memoizes the answers typed at the prompts so that
    they do not have to be entered again.

    ap_cluster: fitted clustering model; its `labels_` holds one cluster label
                per row of df_precompute_sim_matrix
    df_precompute_sim_matrix: matrix whose index holds the clustered inputs
    output_file: path of the JSON file to write (required)

    The file holds one entry per cluster, in the form
    [Should_I_Replace_Cluster_With_Text, Inputs_In_Cluster, Replacement_Text]:

        [["Y", ["Ashvin", "Ashvn"], "Ashvin"], ["N", ["Bruno"], null]]

    Raises TypeError when output_file is None.
    """
    import json

    # Fail before prompting, not after every answer has already been typed.
    if output_file is None:
        raise TypeError("output_file is required: pass the path of the replacement file to write")

    replacement_tracker = [] # Keep track of user prompt input
    for label in sorted(np.unique(ap_cluster.labels_)):
        indices = df_precompute_sim_matrix.index[ap_cluster.labels_ == label]
        text = input(f"Cluster: {indices}. Would you like to replace this cluster? (Y/N)")
        replacement_text = None
        if text in ("Y", "y"):
            replacement_text = input(f"Cluster: {indices}. Enter the text your would like to replace this cluster with:")

        replacement_tracker.append([text, indices.tolist(), replacement_text])

    json_obj = json.dumps(replacement_tracker, indent=4)
    with open(output_file, "w") as f:
        f.write(json_obj)


def execute_replacements(df, column=None, infile=None):
    """
    Apply the replacements saved by generate_replacement_file() to `df`, in place.

    df: dataframe to clean; it is modified directly and nothing is returned
    column: when given, only this column is rewritten; otherwise the
            replacements are applied to every column of df
    infile: path of the JSON replacement file, a list of
            [answer, inputs_in_cluster, replacement_text] entries

    Only clusters whose answer is "Y" or "y" are replaced.
    """
    import json

    with open(infile) as f:
        replacement_list = json.load(f)

    for text, indices, replacement_text in replacement_list:
        if text not in ("Y", "y"):
            continue

        mapping = {index: replacement_text for index in indices}
        if column is not None:
            # Assign the result back: an in-place replace on df[column] acts on
            # a temporary object and does not update df under Copy-on-Write.
            df[column] = df[column].replace(mapping)
        else:
            df.replace(mapping, inplace=True)