In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import json
import re
from collections import defaultdict

import folium
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from geopy.geocoders import Nominatim
from skimage import io
from wordcloud import WordCloud, STOPWORDS
# from plotly_wordcloud import plotly_wordcloud
# from jupyter_dash import JupyterDash

# Notebook-level copy of wordcloud's STOPWORDS; generate_word_map passes it to WordCloud.
stopwords = set(STOPWORDS)
# This ensures Plotly output works in multiple places:
# In VSCode and also nbconvert from jupyter notebook to HTML
# See https://plotly.com/python/renderers/#multiple-renderers
pio.renderers.default = "svg+notebook"


In [4]:
# Shared styling for the dropdown (updatemenu) button of the bar and table figures:
# white button, first entry active, anchored at x=0.5 / y=-0.1 and opening upwards.
default_bar_dropdown_styling = {
    "bgcolor": "white",
    "active": 0,
    "yanchor": "top",
    "xanchor": "center",
    "direction": "up",
    "y": -0.1,
    "x": 0.5,
}

# default_table_dropdown_styling = dict(
#     bgcolor="white",
#     active=0,
#     yanchor='top',
#     xanchor='center',
#     direction='down',
#     y=1.3,
#     x=0,
# )

# Constants used when sizing plotly tables
TABLE_CELL_HEIGHT = 25  # passed as the cell `height` in create_table_with_dropdown
TABLE_CELL_PADDING = 6  # NOTE(review): not referenced by any function visible in this notebook

In [5]:
# Plotly Custom Template
# "bnw" draws both axes with a 2px black line (mirrored), blanks the axis titles,
# uses narrow left/right margins, a light-blue paper background, pan as the
# default drag mode, and hides the legend unless a figure turns it back on.
bnw = go.layout.Template(
    layout=go.Layout(
        xaxis=go.layout.XAxis(
            showline=True,
            linecolor="black",
            linewidth=2,
            mirror=True,
            title=""
        ),
        margin=go.layout.Margin(
            l=2,
            r=2
        ),
        yaxis=go.layout.YAxis(
            showline=True,
            linecolor="black",
            linewidth=2,
            mirror=True,
            title=""
        ),
        font=dict(color="#4f4f4f"),
        title=go.layout.Title(
            font=go.layout.title.Font(
                # family="Old Standard TT",  # custom title typeface, currently disabled
                size=17
            ),
        ),
        legend=go.layout.Legend(
            font=go.layout.legend.Font(
                size=13
            )
        ),
        paper_bgcolor="#D0E3F1",
        dragmode="pan",
        autosize=True,
        showlegend=False,
    ),

    # Per-trace-type hover defaults were attempted here, but (per the original
    # author's note) this does not work for hovertemplate, so the templates are
    # applied per figure through less_cringe_hovertext_template instead.
    # data=dict(
    #     bar=[go.Bar(hovertemplate="<b>%{label}</b><br><i>Count</i>: %{value}")],
    #     scatter=[go.Scatter(hovertemplate="<b>%{label}</b><br><i>Count</i>: %{value}")],
    #     pie=[go.Pie(hovertemplate="<b>%{label}</b><br><i>Count</i>: %{value}")],
    #     histogram=[go.Histogram(hovertemplate="<b>%{label}</b><br><i>Count</i>: %{value}")]
    # )
)

# Register the template under the name "bnw" so it can be referenced by string.
pio.templates["bnw"] = bnw

# Combine custom template with seaborn (used as the default for plotly express figures)
px.defaults.template = "seaborn+bnw"

In [6]:
# Hover templates per trace type: bold category label, then the count.
# "histogram_h" is the horizontal-histogram variant (category on y, count on x).
less_cringe_hovertext_template = {
    "bar": {"hovertemplate": "<b>%{x}</b><br><i>Count</i>: %{y}"},
    "scatter": {"hovertemplate": "<b>%{x}</b><br><i>Count</i>: %{y}"},
    "pie": {"hovertemplate": "<b>%{label}</b><br><i>Count</i>: %{value}"},
    "histogram": {"hovertemplate": "<b>%{x}</b><br><i>Count</i>: %{y}"},
    "histogram_h": {"hovertemplate": "<b>%{y}</b><br><i>Count</i>: %{x}"},
}

# Figure `config` dict that disables zooming with the scroll wheel.
no_scroll_zoom_config = {"scrollZoom": False}

# Figure `config` dict that renders the plot as a static (non-interactive) image.
static_plot_config = {"staticPlot": True}

In [7]:
def data_checker(a: pd.Series, check_unique=False, check_null=True):
    '''
        Print quick sanity information about a single column.

        check_unique: when True, print the column's sorted unique values.
        check_null: when True (the default), print how many values are null.

        Example:

            a = pd.DataFrame([[1], [2], [2]], columns=["A"])
            data_checker(a["A"], check_unique=True)

        Out:
        Unique values [1 2]
        Number of NA Values: 0
    '''
    if check_unique:
        distinct_sorted = a.sort_values().unique()
        print("Unique values", distinct_sorted)

    if check_null:
        missing_count = int(a.isnull().sum())
        print("Number of NA Values:", missing_count)

# a = pd.DataFrame([[1], [2], [2]], columns=["A"])
# data_checker(a["A"], check_unique=True)

def calc_table_height(a, base=208, height_per_row=25, char_limit=30, height_padding=16.5):
    '''
    Estimate the pixel height a plotly table needs to show every row of `a`.

    a: The dataframe with only the columns you want to plot
    base: The base height of the table (header without any rows)
    height_per_row: The height that one row requires
    char_limit: If the length of a value crosses this limit, the row's height needs to be expanded to fit the value
    height_padding: Extra height in a row when a length of value exceeds char_limit

    Returns `base` plus `height_per_row` for every row, plus `height_padding`
    once for each row that contains at least one value whose string form is
    longer than `char_limit`.

    Source: https://stackoverflow.com/questions/48223370/python-plotly-autosize-table-plot
    '''
    total_height = base
    # Iterate over plain row tuples: cells are read by position without the
    # deprecated integer lookup on a label-indexed row (`a.iloc[x][y]`), and
    # no intermediate Series is built per row.
    for row in a.itertuples(index=False, name=None):
        total_height += height_per_row
        if any(len(str(value)) > char_limit for value in row):
            # A long value wraps, so the row needs extra room (counted once per row).
            total_height += height_padding
    return total_height
    

## Helper Functions

In [8]:
def add_percent_labels_for_hist(series : pd.Series, fig, horizontal=False):
    """
        Label every trace of `fig` with its share of the answered responses.

        For each trace, the number of values it holds (its `y` data when
        horizontal is True, otherwise its `x` data) is divided by the number
        of non-null entries in `series` and written to the trace's `text` as
        a percentage with two decimals, e.g. "33.33%". The figure is modified
        in place; nothing is returned.
    """
    axis_key = "y" if horizontal is True else "x"

    # Work out every label first, then attach them to the traces.
    labels = [
        format(len(trace[axis_key]) / series.notna().sum() * 100, ".2f") + "%"
        for trace in fig.data
    ]
    for trace, label in zip(fig.data, labels):
        trace["text"] = label
    

In [9]:
def change_to_dark_mode_plotly(fig):
    """
        Restyle `fig` in place with a dark colour scheme.

        Paper and plot backgrounds become '#2d3035' with light grey text. If
        the figure has dropdown menus, the first one gets a light background
        with black text.
    """
    dark_background = '#2d3035'
    fig.update_layout(
        paper_bgcolor=dark_background,
        font={"color": "#dcdcdc"},
        plot_bgcolor=dark_background,
        title_font={"color": "#dfdfdf"},
    )

    if not fig.layout.updatemenus:
        return

    # Only the first dropdown menu is restyled.
    first_menu = fig.layout.updatemenus[0]
    first_menu.bgcolor = "#ECECEC"
    first_menu.font = {"color": "#000000"}

In [11]:
def process_multiple_choice_survey(series: pd.Series, delimeter=r",\s+", column="substance"):
    """
        Process each answer of multiple choice answers as one row.
        For example: a respondent answering 'a, b, c' will have his or her
        answer processed as 'a' in one row, 'b' in the next, and 'c' in the next using
        the default delimeter.

        series: one (possibly multi-valued) answer string per respondent.
                Missing answers (NaN/None) are skipped.
        delimeter: regular expression used to split an answer; the default
                   splits on a comma followed by whitespace.
        column: name of the single column of the returned DataFrame.

        Returns a one-column DataFrame with one row per individual choice, in
        the order the choices appear in `series`.

        TO-DO: Remove stopwords
    """
    all_choices = []
    # Unanswered questions are NaN (a float), which has no .strip(); drop them.
    for answer in series.dropna():
        all_choices.extend(re.split(delimeter, answer.strip()))

    return pd.DataFrame({column: all_choices})

def get_percentages_for_multiple_choice_survey(series: pd.Series, total=None):
    """
        Count each distinct choice and express it as a share of the respondents.

        series: one row per individual choice, e.g. the single column produced
                by process_multiple_choice_survey.
        total: number of respondents used as the percentage denominator.
               When omitted, the length of a notebook-level `df` is used if
               one exists (the previous behaviour); otherwise the length of
               `series` is used.

        Returns a DataFrame with the columns "index" (the choice), "count" and
        "percentage" (a string such as "50.0%"), ordered by descending count.
    """
    if total is None:
        # Legacy fallback: older callers relied on a global survey frame named `df`.
        legacy_frame = globals().get("df")
        total = len(legacy_frame) if legacy_frame is not None else len(series)

    counts = series.value_counts()
    # Build the frame explicitly: the column names produced by
    # value_counts().reset_index() changed in pandas 2.0, so selecting the
    # counts through `series.name` picks the wrong column on newer versions.
    result = pd.DataFrame({"index": counts.index, "count": counts.to_numpy()})
    result["percentage"] = (result["count"] / total * 100).round(2).astype(str) + "%"
    return result


In [None]:
def remove_regex(series: pd.Series):
    """
        Lower-case every string in `series` and delete the punctuation
        characters - . / ? ! , " : ; ( ) and '.
    """
    punctuation_pattern = r'[-./?!,":;()\']'
    lowered = series.str.lower()
    return lowered.str.replace(punctuation_pattern, '', regex=True)

In [None]:
def remove_redundancy(series: pd.Series, delimeter=", "):
    """
        Put the choices of every multiple choice answer into sorted order, so that answers
        listing the same choices in a different order become identical. Ex. "Course 6, Course 5"
        and "Course 5, Course 6" both become "Course 5, Course 6".

        Each answer is split on `delimeter`, its parts are sorted, and the parts are joined
        back together with the same `delimeter`.

        remove_redundancy(pd.Series(["5,6", "6,5", "6,6", "7,8", "8,7"]), delimeter=",")

        Out:
        0    5,6
        1    5,6
        2    6,6
        3    7,8
        4    7,8
        dtype: object
    """
    def _sorted_join(parts):
        return delimeter.join(sorted(parts))

    split_answers = series.str.split(delimeter)
    return split_answers.map(_sorted_join)

# remove_redundancy(pd.Series(["5,6", "6,5", "6,6", "7,8", "8,7"]), delimeter=",")
    

In [None]:
def break_text(texts, char_limit=20):
    """
        Insert "<br>" line breaks into long text so it wraps in plotly labels.

        texts: a single string, or a list/tuple of strings.
        char_limit: approximate number of characters between breaks. For every
                    multiple of `char_limit` up to the original length, the
                    first space at or after that position is replaced by "<br>".

        Returns a string for a string input and a list of strings for a
        list/tuple input. Any other input type returns None.

        Breaks only ever replace a space: once no space is left after the
        current position the text is returned as is, so no "<br>" is appended
        to the end of the text.
    """
    def _break_one(text):
        broken = text
        for start in range(char_limit, len(text) + 1, char_limit):
            space_at = broken.find(" ", start)
            if space_at == -1:
                # No space left to break on; later positions cannot find one either.
                break
            broken = broken[:space_at] + "<br>" + broken[space_at + 1:]
        return broken

    if isinstance(texts, str):
        return _break_one(texts)

    if isinstance(texts, (list, tuple)):
        return [_break_one(text) for text in texts]

    return None

## Table and Bar Functions 

In [2]:
def create_table_with_dropdown(a_new,
                               columns,
                               labels = None,
                               reindex : list[bool] = None, # Example: [False, False]
                               reindex_orders : list[list[str]] = None, # Example: [None, ["A", "B"]]. Reindex only second column 
                               headers=["Answer", "Number of Students"],
                               limit=None,
                               base_height=208,
                               ):
    """
        Count unique values for every column and display those counts in a table
        whose dropdown switches between the columns.

        a_new: DataFrame holding the data.
        columns: names of the columns to count; one dropdown entry per column.
        labels: dropdown labels / titles, one per column (defaults to `columns`).
        reindex: per-column flags; where True the counts are re-ordered with the
                 matching entry of `reindex_orders` instead of by frequency.
        reindex_orders: per-column list of values giving the desired row order.
        headers: the two header cells of the table.
        limit: when set, keep only the first `limit` rows of every count table.
        base_height: height of the table without rows, forwarded to calc_table_height.

        Example:

            a = pd.DataFrame({"a" : [1, 1, 2, 2], "b" : [2, 2, 3, 3]})
            create_table_with_dropdown(a, ["a", "b"])

        Reindex example:

            a = pd.DataFrame({"a" : ["Male", "Female", "Female", "Female"], "b" : ["A", "D", "B", "C"]})
            create_table_with_dropdown(a, ["a", "b"], reindex=[False, True], reindex_orders=[None, ["A", "B", "C", "D"]])

        NOTE(review): each button also sends `cells_line_color` / `cells_fill_color`
        in the layout part of its update args -- confirm these have any effect there.
    """
    menu_labels = columns if labels is None else labels

    def _count_table(position, column):
        # Value counts for one column, optionally forced into a caller-given row order.
        counts = pd.DataFrame(a_new[column].value_counts())
        if reindex is not None and reindex[position] == True:
            counts = counts.reindex(reindex_orders[position])
        return counts.reset_index()

    count_tables = [_count_table(position, column) for position, column in enumerate(columns)]
    if limit:
        count_tables = [table.head(limit) for table in count_tables]

    # The figure starts out showing the first column's counts.
    header_style = dict(values=headers, fill_color="darkblue",
                        font=dict(color="white"), line_color="black")
    initial_cells = dict(values=count_tables[0].T.values, height=TABLE_CELL_HEIGHT)
    fig = go.Figure(go.Table(header=header_style, cells=initial_cells))

    # One dropdown button per column: swap the cell values and resize/retitle the figure.
    buttons = []
    for position, _ in enumerate(columns):
        table = count_tables[position]
        trace_changes = {
            "cells" : {
                "values" : table.T.values,
                "height": TABLE_CELL_HEIGHT,
            }
        }
        layout_changes = {
            "title" : menu_labels[position],
            "height": calc_table_height(table, base=base_height),
            "cells_line_color": "black",
            "cells_fill_color": "#f5f5f5"
        }
        buttons.append({
            "label" : menu_labels[position],
            "method": "update",
            "args" : [trace_changes, layout_changes]
        })

    fig.update_layout(
        updatemenus=[{**default_bar_dropdown_styling, "buttons" : buttons}],
        paper_bgcolor="#FFFFFF"
    )

    # Always set the height from the dropdown option with the most rows, because it has the max height
    tallest_table = max(count_tables, key=lambda table: table.shape[0])
    fig.update_layout(title_text=menu_labels[0], margin=dict(l=2, r=2),
                      height=calc_table_height(tallest_table, base=base_height))

    # Override the table cell colours of the figure's template.
    fig.layout.template["data"]["table"][0]["cells"]["fill"]["color"] = "white"
    fig.layout.template["data"]["table"][0]["cells"]["line"]["color"] = "darkslategray"

    return fig

# Example:
# a = pd.DataFrame({"a" : ["Male", "Female", "Female", "Female"], "b" : ["A", "D", "B", "C"]})
# create_table_with_dropdown(a, ["a", "b"], reindex=[False, True], reindex_orders=[None, ["A", "B", "C", "D"]])


In [68]:
def create_figure_with_dropdown(a_new, 
                                options=["What is your gender?", "Are you a domestic or international student?"],
                                filters=None,
                                labels=None,
                                showticklabels=[True, True],
                                sort_traces=False,
                                is_order_manually=False,
                                manual_category_orders=None,
                                horizontal=False,
                                add_legend=True,
                                textposition="outside"):
    """
        Create a count bar chart with a dropdown that switches between survey questions.

        Every distinct answer of every option becomes its own histogram trace,
        labelled with its percentage; the dropdown toggles which option's
        traces are visible and re-configures the category axis.

        a_new: DataFrame holding the survey data.
        options: column names to plot, one dropdown entry per column.
        filters: optional list with one boolean row mask (or None for "no
                 filter") per option.
        labels: dropdown labels / figure titles, one per option (defaults to `options`).
        showticklabels: per-option flag for showing category tick labels.
                        Options beyond the end of the list default to True.
        sort_traces: order the categories by count.
        is_order_manually / manual_category_orders: when enabled, the per-option
                        list (or None) gives an explicit category order, which
                        takes precedence over `sort_traces`.
        horizontal: draw horizontal bars (categories on the y axis).
        add_legend: show a legend in the bottom-right corner.
        textposition: where the percentage labels are placed on the bars.

        Returns the plotly Figure.

        NOTE(review): the percentage shown on a bar is its count divided by the
        non-null answers of the *unfiltered* column, even when a filter is
        given -- confirm this is the intended denominator.

        # Create Bar Chart Dropdown
        a = pd.DataFrame({"a" : [1, 1, 2, 2], "b" : [2, 2, 3, 3]})
        create_figure_with_dropdown(a, ["a", "b"])

        # Create Bar Chart with Sorted Counts (sort_traces=True). 
        # In this example, sort_traces would cause Female to come first (placed left) in the count plot instead of Male.
        a = pd.DataFrame({"a" : ["Male", "Female", "Female"], "b" : ["A", "A", "B"]})
        create_figure_with_dropdown(a, ["a", "b"], sort_traces=True)
    """
    if labels is None:
        labels = options

    vertical = horizontal is False

    # The default only covers two options; pad instead of indexing past the end.
    tick_flags = list(showticklabels) + [True] * (len(options) - len(showticklabels))

    def _answers(position):
        # Column of one option, restricted to its filter when one is given.
        row_filter = None if filters is None else filters[position]
        if row_filter is None:
            return a_new[options[position]]
        return a_new.loc[row_filter, options[position]]

    def _manual_order(position):
        # Explicit category order for one option, or None when not requested.
        if not is_order_manually or manual_category_orders is None:
            return None
        return manual_category_orders[position]

    def _axis_settings(position):
        # Category-axis configuration for one option (manual order wins over sorting).
        order = _manual_order(position)
        if order is None and sort_traces:
            counts = _answers(position).value_counts()
            # Horizontal bars are listed bottom-to-top, hence ascending counts.
            order = (counts if vertical else counts.sort_values()).index.tolist()
        return {
            "categoryorder" : "array" if order is not None else "trace",
            "categoryarray" : order,  # only has effect when categoryorder is array
            "showticklabels" : tick_flags[position],
        }

    fig = go.Figure()

    # One histogram trace per distinct answer; only the first option starts visible.
    traces_per_option = []
    for position, option in enumerate(options):
        answers = _answers(position)
        answered_total = a_new[option].notna().sum()
        added = 0
        for unique_value in answers.unique():
            matching = answers[answers == unique_value]
            if matching.shape[0] == 0:
                # NaN never equals itself, so it yields no rows; skip it rather
                # than adding an empty trace.
                continue

            axis_data = {"x": matching} if vertical else {"y": matching}
            fig.add_trace(go.Histogram(
                name=unique_value,
                visible=(position == 0),
                marker_autocolorscale=True,
                text=format(len(matching) / answered_total * 100, ".2f") + "%",
                textposition=textposition,
                **axis_data
            ))
            added += 1
        traces_per_option.append(added)

    # Visibility mask per dropdown entry, built from the traces actually added
    # so the mask always lines up with fig.data.
    visibilities = [
        [owner == position for owner, count in enumerate(traces_per_option) for _ in range(count)]
        for position, _ in enumerate(options)
    ]

    axis_settings = [_axis_settings(position) for position, _ in enumerate(options)]
    axis_key = "xaxis" if vertical else "yaxis"

    fig.update_traces(hovertemplate="<b>%{x}</b><br><i>Count:</i> %{y}" if vertical
                                    else "<b>%{y}</b><br><i>Count:</i> %{x}", marker_autocolorscale=True)
    fig.update_layout(
        updatemenus=[
            dict(
                **default_bar_dropdown_styling,
                buttons=[
                    dict(label=labels[position],
                         method="update",
                         args=[
                            {"visible" : visibilities[position]},
                            {"title" : labels[position],
                             axis_key : axis_settings[position]
                             },
                         ])
                    for position, _ in enumerate(options)]
            )
        ],
        paper_bgcolor="#D0E3F1"
    )

    # Initial state = first option, configured exactly like its dropdown entry.
    update_category_axis = fig.update_xaxes if vertical else fig.update_yaxes
    initial_settings = axis_settings[0]
    if initial_settings["categoryarray"] is not None:
        update_category_axis(categoryorder="array", categoryarray=initial_settings["categoryarray"])
    update_category_axis(showticklabels=initial_settings["showticklabels"])

    fig.update_layout(title_text=labels[0], margin=dict(l=2, r=2),
                      dragmode="pan", showlegend=False)

    if add_legend is True:
        fig.update_layout(legend=dict(yanchor="bottom", xanchor="right",
                                x=1, y=0, orientation="v"),
                                showlegend=True)
    return fig

# a = pd.DataFrame({"a" : ["Male", "Female", "Female", "Female", "Female", "Female", "Female" "Female", "Female", 
#                           "Female", "Female", "Male", "Male", "Male", "Male"], 
#                     "b" : ["A", "A", "B", "B", "B", "C", "C", "C", "C", "D", "D", "D", "D", "D"]})
# fig = create_figure_with_dropdown(a, ["a", "b"],
#                             textposition="inside", showticklabels=[False, False], sort_traces=True, horizontal=True)
# fig

# fig.update_layout(showlegend=True, 
#                   legend=dict(yanchor="top", xanchor="center",
#                               orientation="h",
#                               x=0.5, y=-0.1))



# a = pd.DataFrame({"a" : ["Male", "Female", "Female"], "b" : ["A", "A", "B"]})
# create_figure_with_dropdown(a, ["a", "b"], sort_traces=False) # Default to False


# a = pd.DataFrame({"a" : ["He", "He", "He", "Loves", "Eating", "Strawberry", "Strawberry", "Wee"],
#                    "b" : ["Sa", "Sa", "Sa", "Sa", "Sa", "Sa", "Ge", "Yo"]})
# create_figure_with_dropdown(a, options=["a", "b"], is_wordcloud=False)

In [13]:
def create_table(a_new,
                 column,
                 label=None,
                 reindex=False,
                 reindex_order : list[str] = None, 
                 headers=["Answer", "Number of Students"],
                 limit=None):
    '''
        Returns a plotly table of the value counts of one column.

        a_new: DataFrame holding the data.
        column: name of the column to count.
        label: figure title (defaults to `column`).
        reindex / reindex_order: when `reindex` is set, the counts are
            re-ordered with `reindex_order` instead of by frequency.
        headers: the two header cells of the table.
        limit: when set, keep only the first `limit` rows.
    '''
    title = column if label is None else label

    counts = pd.DataFrame(a_new[column].value_counts())
    if not (reindex == False):
        counts = counts.reindex(reindex_order)
    counts = counts.reset_index()

    if limit:
        counts = counts.head(limit)

    header_style = dict(values=headers, fill_color="black", font=dict(color="white"), line_color="black")
    cell_style = dict(values=counts.T.values, height=25, line_color="black", fill_color="white")
    fig = go.Figure(go.Table(header=header_style, cells=cell_style))

    # Size the figure so that every row is visible.
    fig.update_layout(margin=dict(l=2, r=2),
                      height=calc_table_height(counts), title_text=title)
    return fig

# a = pd.DataFrame({"a" : [1, 1, 2, 2, 3, 3, 3]})

# create_table(a, "a", reindex=True, reindex_order=a["a"].value_counts().index.sort_values().tolist())


In [None]:
def create_rating_distributions(score_suffix, title_text, star_distrib : pd.Series):
    """
        Build two figures summarising a series of ratings.

        score_suffix: text shown after the average, e.g. a unit.
        title_text: title of the distribution figure.
        star_distrib: the ratings; its name is used as the histogram column.

        Returns (fig_avg, fig):
        fig_avg -- a card with the mean rating (two decimals), the word
                   "Average", the sample size and the star icon loaded from
                   "./icons/star.png".
        fig     -- a horizontal histogram of the ratings, one bar per distinct
                   rating, labelled with percentages.
    """
    # --- Summary card: an empty figure that only carries annotations and an icon ---
    fig_avg = go.Figure()
    fig_avg.update_xaxes(visible=False)
    fig_avg.update_yaxes(visible=False)
    fig_avg.update_layout(paper_bgcolor="#D0E3F1",
                    height=300, margin=dict(l=0, r=0, t=60, b=60))

    card_annotations = [
        dict(xanchor="center", x=0.5, y=0.5,
             text=f"{star_distrib.mean():.2f} {score_suffix}", font=dict(size=22)),
        dict(xanchor="center", x=0.5, y=0.3,
             text="Average", font=dict(size=16)),
        dict(xanchor="right", x=0.98, y=0.1,
             text=f"Sample Size: {len(star_distrib)}", font=dict(size=12)),
    ]
    for annotation in card_annotations:
        # Every annotation is positioned in paper coordinates with no arrow offset.
        fig_avg.add_annotation(
            dict(xref="paper", yref="paper", yanchor="middle", ax=0, ay=0, **annotation)
        )

    star_icon = dict(source="./icons/star.png", xref="paper", yref="paper",
                     xanchor="center", yanchor="middle", x=0.35, y=0.525,
                     sizex=0.6, sizey=0.6, sizing="contain")
    fig_avg.add_layout_image(star_icon)

    # --- Distribution: ratings are plotted as strings so each one is its own category ---
    ratings_as_text = star_distrib.astype(str)
    rating_order = star_distrib.sort_values().astype(str).unique()

    fig = px.histogram(ratings_as_text, y=ratings_as_text.name,
                  color=ratings_as_text.name)
    fig.update_yaxes(title="Rating", categoryorder="array", categoryarray=rating_order)

    bar_style = dict(color="rgb(253, 240, 54)",
                     line=dict(color="rgb(0, 0, 0)", width=2))
    fig.update_traces(hovertemplate=less_cringe_hovertext_template["histogram_h"]["hovertemplate"],
                    marker=bar_style)

    add_percent_labels_for_hist(star_distrib, fig, horizontal=True)

    title_style = dict(text=title_text, xanchor="left", yanchor="bottom",
                       y=0.93, x=0.05, font=dict(size=18))
    fig.update_layout(bargap=0.5, title=title_style, margin=dict(t=100, pad=7))

    return fig_avg, fig

## Wordmap Functions

In [77]:
def generate_word_map(series : pd.Series, file_path=None, width=1000, height=600):
    '''
        Builds a wordcloud from the values of `series` and saves it to `file_path`.

        The values are converted to strings and joined with spaces; the
        notebook-level `stopwords` set is excluded from the cloud.
        `width` and `height` give the image size.
    '''
    cloud_options = dict(
        background_color="white", max_words=200, mask=None,
        stopwords=stopwords, width=width, height=height, colormap="tab20",
        min_font_size=8, max_font_size=125, relative_scaling=0.75,
    )
    cloud = WordCloud(**cloud_options)

    corpus = " ".join(str(entry) for entry in series)
    cloud.generate(corpus)
    cloud.to_file(file_path)

# Counts how often every whitespace-separated word occurs across a series of strings
def analyze_word_map_frequencies(series: pd.Series, columns=["language", "num_people_speaking"]):
    '''
        Returns a two-column DataFrame (named by `columns`) holding every word
        and its number of occurrences, most frequent first. Words with equal
        counts keep the order in which they were first seen.
    '''
    tally = {}
    for answer in series:
        for word in answer.split():
            tally[word] = tally.get(word, 0) + 1

    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(ranked, columns=columns)

def plot_word_map(file_path : str):
    '''
        Loads the wordcloud image at `file_path` and returns it as a plotly
        figure without grid lines, tick labels, zero lines, margins or hover.

        file_path : path to wordcloud image
    '''
    pixels = io.imread(file_path)
    fig = px.imshow(pixels, binary_compression_level=0)

    bare_axis = {'showgrid': False, 'showticklabels': False, 'zeroline': False}
    no_margin = dict(autoexpand=False, b=0, l=0, r=0, t=0)
    fig.update_layout(xaxis=dict(bare_axis),
                      yaxis=dict(bare_axis),
                      margin=no_margin,
                      hovermode=False)

    return fig


## Map Functions

In [78]:
## Map setup, see Location Profile for Usage
def map_location_setup(location_series: pd.Series, data_coordinates: list[tuple]):
    """
        Pair every distinct location with its coordinates and its number of occurrences.

        location_series: one location name per respondent.
        data_coordinates: one coordinate tuple per distinct location, given in
                          the order the locations first appear in `location_series`.

        Returns a DataFrame indexed by location name with a "Coordinates"
        column followed by the value-count column of `location_series`.
    """
    location_names = location_series.unique().tolist()
    coordinates_by_location = pd.DataFrame({"Coordinates" : data_coordinates},
                                           index=location_names)

    people_per_location = location_series.value_counts()
    return pd.merge(coordinates_by_location, people_per_location,
                    left_index=True, right_index=True)

################################################# CAN'T MODULARIZE FOLIUM FOR SOME REASON ############33
# def add_map_markers(a: pd.DataFrame, count_help_text,
#                     icon_object=None):
#     """
#         DataFrame format should look like 

#         | Coordinates | <Count_Column>       
#         -------------------------------

#         Index should be assigned to name of the location.
#         Output from map_location_setup is an acceptable input
#     """
#     m = create_map()
#     if icon_object is None:
#         icon_object = folium.Icon(icon="school", color='lightblue', prefix='fa')

#     for i in range(a.shape[0]):
#         current_data = a.iloc[i]
#         current_name = current_data.name
#         number_per_name = current_data[a.columns[1]]
#         html = f'''
#         <div style="display: flex; justify-content: left; flex-direction: column;">
#             <div style="padding:0 10px 10px 0;color:grey"><b>{current_name}</b></div>
#             <div style="padding:0 10px 10px 0;font-size:40;font-weight:100;text-align:center">{number_per_name}</div>
#             <div style="text-align:center;padding:0 10px 0 0;">{count_help_text}</div>
#         </div>
#         '''

#         iframe = folium.IFrame(html=html, width=170, height=170)
#         icon = icon_object
#         popup = folium.Popup(iframe)
#         folium.Marker(location=current_data["Coordinates"], popup=popup, icon=icon
#         ).add_to(m)

#     return m
    

def create_map():
    """
        Return a folium map centred on the geocoded position of "Vancouver, BC".

        The position is looked up through geopy's Nominatim geocoder; the map
        uses the 'cartodbpositron' tiles, shows a scale bar and starts at zoom
        level 2.
    """
    vancouver_location = Nominatim(user_agent="app").geocode("Vancouver, BC")
    map_center = (vancouver_location.latitude, vancouver_location.longitude)
    return folium.Map(location=map_center, tiles='cartodbpositron',
                      control_scale=True, zoom_start=2)

def get_figure_for_map(m: folium.Map, height=400):
    """
        Place the map `m` inside a new folium Figure of the given height and
        return that figure.
    """
    container = folium.Figure(height=height)
    m.add_to(container)
    return container

# Clustering Functions

In [20]:
def generate_sim_matrix(series: pd.Series) -> pd.DataFrame:
    """
        Build a square similarity matrix between all distinct strings of `series`.

        For two different strings the entry is
        -(1 - common_subsequence_length / length_of_shorter_string), capped at
        -0.001 so it is never zero: values close to 0 mean very similar, -1
        means nothing in common. The diagonal is 0. Rows and columns are
        labelled with the distinct strings.
    """
    from scipy.spatial.distance import pdist, squareform
    from similarity.longest_common_subsequence import LongestCommonSubsequence

    lcs_metric = LongestCommonSubsequence()

    def compare_lcs(u, v):
        # pdist hands over one-element rows; the string is the first entry.
        left, right = u[0], v[0]
        shorter_length = min(len(left), len(right))
        # (distance - len(left) - len(right)) / 2 is minus the common subsequence length.
        shared_length = abs((lcs_metric.distance(left, right) - len(left) - len(right)) / 2)
        return min(-0.001, -(1 - shared_length / shorter_length))

    distinct_values = series.unique()
    as_single_column = np.expand_dims(distinct_values, axis=1)
    pairwise = squareform(pdist(as_single_column, compare_lcs))

    return pd.DataFrame(pairwise, index=distinct_values, columns=distinct_values)

# Generate similarity matrix between every possible professor input
# df_precompute_sim_matrix

# Distribution of Similarity Matrix
# px.histogram(df_precompute_sim_matrix[df_precompute_sim_matrix != 0].max(axis=1), title="Distribution of Largest Common Subsequence Metric")

# ### Removing Unique Professor Names
# We apply this filter to try our best not to cluster professor names that were only entered in one way. 
# For example, if everyone entered Hamid's name as "Hamid", then there is no need to rewrite "Hamid" in any other way.

# I only included inputs that have an LCS distance > -0.3 to at least one other input, because most inputs whose LCS distances were all < -0.3 appeared to be unique professor names. I deduced this through trial and error. For each input, I found its most closely associated inputs (those with the LCS distance closest to 0) and put this information in a dataframe.
# def gather_closest_distances_for_each_word(row):
#     closest_distances = row[row == row[row != 0].max()]
#     best_matches_for_each_word = closest_distances.index.tolist()
#     return [best_matches_for_each_word, row[row != 0].max()]


# df_MAT = MAT.apply(gather_closest_distances_for_each_word, axis=1, result_type="expand")
# df_MAT

# Per the analysis above, we only include words iff they have at least one LCS distance > -0.3 (i.e. closer to 0) with another word.

def remove_unique(df_precompute_sim_matrix: pd.DataFrame, threshold=0.3): 
    """
        Drop entries that are not similar to any other entry.

        df_precompute_sim_matrix: square similarity matrix with identical row
            and column labels, 0 on the diagonal and negative values elsewhere
            (closer to 0 = more similar), as built by generate_sim_matrix.
        threshold: an entry is kept only if at least one of its non-zero
            similarities is greater than -threshold.

        Returns the matrix restricted (rows and columns) to the kept entries.
    """
    # Zeros mark the diagonal (an entry compared with itself) and never count as a match.
    is_close_pair = (df_precompute_sim_matrix > -threshold) & (df_precompute_sim_matrix != 0)
    has_close_match = is_close_pair.any()

    kept_rows = df_precompute_sim_matrix.loc[has_close_match]
    # Keep the matrix square: select the same labels as columns.
    return kept_rows[kept_rows.index]


# Fit data to AP Cluster model
def fit_data(df_precompute_sim_matrix: pd.DataFrame):
    """
        Fit scikit-learn's AffinityPropagation (precomputed affinity,
        random_state=5) on the similarity matrix and return the fitted model.
    """
    from sklearn.cluster import AffinityPropagation

    similarities = df_precompute_sim_matrix.to_numpy()
    model = AffinityPropagation(random_state=5, affinity="precomputed")
    return model.fit(similarities)

### Automated Text Cleaning Using AP Clustering 

def generate_replacement_file(ap_cluster, df_precompute_sim_matrix: pd.DataFrame, output_file=None):
    """
        Interactively decide, cluster by cluster, which text should replace
        the members of each cluster, and save the decisions as JSON.

        ONLY RUN ONCE. The saved file memoizes the user's prompt input so that
        execute_replacements can re-apply it without asking again.

        ap_cluster: fitted clustering model exposing `labels_` (one label per
            row of the similarity matrix).
        df_precompute_sim_matrix: the matrix the model was fitted on; its index
            holds the clustered texts.
        output_file: path of the JSON file to write. Required.

        The file holds one entry per cluster, in the form
        [answer, cluster_members, replacement_text], e.g.
        [["Y", ["Ashvin", "Ashvn"], "Ashvin"], ["N", ["Bruno"], null]]

        Raises TypeError when `output_file` is missing, before any prompt is
        shown, so no typed answers are lost.
    """
    if output_file is None:
        raise TypeError("output_file is required: pass the path the replacement file should be written to")

    labels = np.asarray(ap_cluster.labels_)
    replacement_tracker = [] # Keep track of user prompt input
    for label in sorted(np.unique(labels)):
        indices = df_precompute_sim_matrix.index[labels == label]
        text = input(f"Cluster: {indices}. Would you like to replace this cluster? (Y/N)")
        replacement_text = None
        if text == "Y" or text == "y":
            replacement_text = input(f"Cluster: {indices}. Enter the text your would like to replace this cluster with:")

        replacement_tracker.append([text, indices.tolist(), replacement_text])

    with open(output_file, "w") as f:
        json.dump(replacement_tracker, f)


def execute_replacements(df, column, infile=None):
    """
        Apply the replacements recorded by generate_replacement_file to `df[column]`.

        df: DataFrame to clean; it is modified in place.
        column: name of the column whose values are replaced.
        infile: path of the JSON replacement file. Every entry has the form
            [answer, cluster_members, replacement_text]; entries answered with
            "Y"/"y" replace each cluster member by the replacement text, all
            other entries are ignored.
    """
    # The context manager closes the file even if the JSON is malformed.
    with open(infile) as f:
        replacement_list = json.load(f)

    for text, indices, replacement_text in replacement_list:
        if text == "Y" or text == "y":
            # Assign the result back instead of calling replace(inplace=True) on
            # df[column]: mutating that intermediate object does not update
            # `df` under pandas Copy-on-Write.
            df[column] = df[column].replace({index: replacement_text for index in indices})