# template functions plotly
Template of reusable functions:
- transversal functions to transform data
- transversal functions to plot data

In [1]:
import numpy as np
import pandas as pd
import statsmodels
import seaborn as sns

import matplotlib.pyplot as plt
from statsmodels.graphics import tsaplots

# plotly
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

## Transversal functions to plot data

### 1. transform dataframe into plotly table

In [2]:
# auxiliary function to plot a dataframe as a plotly table
def plot_df_table_plotly(df_to_plotly):
    """
    Render a dataframe as a plotly table figure.

    Args
        df_to_plotly (dataframe): dataframe whose columns become the columns of the table

    Return
        table_fig (figure plotly): plotly figure holding a single go.Table trace
    """
    column_names = list(df_to_plotly.columns)

    # header row: the column names of the dataframe
    header_spec = dict(values=column_names,
                       fill_color='paleturquoise',
                       align='left')

    # table body: one sequence of values per dataframe column
    column_values = [df_to_plotly[name] for name in column_names]
    cells_spec = dict(values=column_values,
                      fill_color='lavender',
                      align='left')

    table_trace = go.Table(header=header_spec, cells=cells_spec)
    table_fig = go.Figure(data=[table_trace])
    return table_fig

### 2. given a dataframe generate an individual plot

In [3]:
def plot_individual_hist_segment(df, var_segment, feature_hist):
    """
    Build an overlaid histogram of one feature, colored by segment.

    Args
        df (dataframe): input dataframe
        var_segment (string): name of the column in the input dataframe used to color the histogram (one color per segment)
        feature_hist (string): name of the column in the input dataframe whose histogram is plotted

    Return
        fig (figure plotly): plotly figure with the generated histogram
    """
    # histograms of the segments are drawn on top of each other, semi-transparent
    histogram_options = dict(x=feature_hist,
                             color=var_segment,
                             barmode='overlay',
                             opacity=0.4)
    fig = px.histogram(df, **histogram_options)

    # title: centered (title_x = 0.5) and with a fixed font size
    title_options = dict(title_text=f'Histogram: {feature_hist}',
                         title_x=0.5,
                         title_font=dict(size=20))
    fig.update_layout(**title_options)

    return fig

### 3.1 given a dataframe generate multiple/subplots for each feature - use plotly graph object (go)
- Using go in subplots, each generated plot automatically gets a different color

In [4]:
def plot_multiple_hist(df, number_columns = 2):
    """
    Plot one histogram per column of the dataframe, arranged in a grid of subplots.

    Args
        df (dataframe): input dataframe; every column gets its own histogram
        number_columns (integer): number of subplot columns in the grid, by default 2

    Return
        fig (figure plotly): plotly figure with the generated subplots
    """
    list_features = df.columns.tolist()
    number_features = df.shape[1]

    # rows needed for a fixed number of columns: one extra row when the
    # features do not fill the last row completely
    extra_row = 1 if (number_features % number_columns) != 0 else 0
    number_rows = (number_features // number_columns) + extra_row

    # grid of subplots, each one titled with its feature name
    fig = make_subplots(rows=number_rows,
                        cols=number_columns,
                        shared_xaxes=False,
                        subplot_titles=list_features)

    for position, feature in enumerate(list_features):
        # plotly subplot indices start at 1, hence the +1 on both coordinates
        row_offset, column_offset = divmod(position, number_columns)
        histogram_trace = go.Histogram(x=df[feature], name=feature)
        fig.add_trace(histogram_trace, row=row_offset + 1, col=column_offset + 1)

    # figure size and centered title (title_x = 0.5)
    layout_options = dict(height=len(list_features) * 250,
                          width=1600,
                          title_text="Histograms",
                          title_x=0.5,
                          title_font=dict(size=28))
    fig.update_layout(**layout_options)

    return fig

### 3.2 given a dataframe generate multiple/subplots for each feature - use plotly express (px)
- Use the same functions as for the individual plots, now for subplots.
- In this case an individual figure is generated first, and then the data of the figure is extracted using "fig.data".
- Depending on the number of plots in the figure, "fig.data" can change its shape. For example, if you do a line plot of only one feature, there is a single data element and you can access it with "fig.data[0]".
- But when multiple plots are generated (multiple in the way plotly internally counts the plots), the list "fig.data" has more than one element. For example, when plotting a line plot of 3 features, the figure object has 3 different data elements, one for each plot: fig.data[0], fig.data[1], fig.data[2].
- In addition, you can modify the values of the data obtained this way, for example to change the color of the plot.
- "fig.data" contains all the information necessary to plot the plotly figure.
- When an individual plot generated with plotly express is used in subplots, the characteristics of the plot stay the same and only the values change. For example, with one subplot per feature, the values in each subplot change according to the feature, but the color of the plot is the same in every subplot.

In [5]:
def plot_multiple_boxplot_months(df, number_columns = 1):
    """
    Plot boxplots of each month and each year. See the monthly distribution of ALL features.

    Every column of the dataframe gets its own subplot. Inside a subplot the
    x-axis is the month and there is one box per year (grouped boxplots).

    Args
        df (dataframe): input dataframe. The month and the year are read from
            `df.index.month` and `df.index.year`, so the index must expose both
            attributes (e.g. a DatetimeIndex)
        number_columns (integer): number of subplot columns, by default ONE column

    Return
        fig (figure plotly): plotly figure with the generated subplots
    """

    # get list of features
    list_features = df.columns.tolist()

    # number of rows for a fixed number of columns: one extra row when the
    # features do not fill the last row completely
    extra_row = 1 if (len(list_features) % number_columns) != 0 else 0
    number_rows = (len(list_features) // number_columns) + extra_row

    ##############################
    # create subplots
    # the titles are passed as a plain list of names (same as plot_multiple_hist)
    # instead of the pandas Index
    # NOTE: `vertical_spacing` can be passed here to reduce the vertical gap between
    # subplots, but then the figure size needs to be adjusted because the subplots
    # become bigger
    fig = make_subplots(rows = number_rows,
                        cols = number_columns,
                        subplot_titles = list_features,
                        shared_xaxes = False
                       )

    # add subplot of boxplots for each month and year
    for index_feature, feature in enumerate(list_features):

        # plotly subplot indices start at 1, hence the +1 on both coordinates
        row = (index_feature // number_columns) + 1
        column = (index_feature % number_columns) + 1

        # build the plot with plotly express and move each of its traces
        # (available in box_fig.data) into the subplot of this feature
        box_fig = px.box(df, x=df.index.month, y=feature, color=df.index.year)
        for trace in box_fig.data:
            # every feature produces one trace per year; show the legend entry
            # only for the first feature so each year is listed once instead of
            # once per subplot
            trace.showlegend = (index_feature == 0)
            fig.add_trace(trace, row = row, col = column)

    # axis titles: `xaxis_title` / `yaxis_title` in update_layout only label the
    # first subplot, update_xaxes / update_yaxes label every subplot
    fig.update_xaxes(title_text='Month')
    fig.update_yaxes(title_text='Value')

    # adjust plot
    fig.update_layout(title = 'Boxplots for Month and Year',
                      legend_title='Year',
                      title_x=0.5,  # center the title
                      title_font=dict(size=20),
                      height=1450 * number_rows,  # figure height
                      width=1850 * number_columns, # figure width
                      showlegend=True,
                      boxmode='group',  # Group boxplots by month
                      boxgap=0.2)  # Adjust the gap between grouped boxplots
    ##############################

    return fig

### 4. Change attributes of the plot with fig.data
In this example the color of a trend line is modified

In [6]:
def plot_individual_scatter_plot(df, feature_x, feature_y, marginal_hist = False):
    """
    Create an individual scatter plot (with an OLS trendline) between two variables.

    Args
        df (dataframe): input dataframe with the features to plot
        feature_x (string): name of the feature in x-axis
        feature_y (string): name of the feature in y-axis
        marginal_hist (bool): when True, also plot the histograms of feature_x and feature_y
            as marginals. By default False

    Return
        fig (figure plotly): plotly figure with the generated plot
    """
    # options shared by both variants of the plot
    scatter_options = dict(x=feature_x, y=feature_y, trendline="ols")

    # the explicit comparison is kept on purpose: only values equal to True
    # enable the marginal histograms
    if marginal_hist == True:
        scatter_options.update(marginal_x="histogram", marginal_y="histogram")
        plot_title = f'scatter plot: {feature_x} vs {feature_y}. Marginal distributions'
    else:
        plot_title = f'scatter plot: {feature_x} vs {feature_y}'

    fig = px.scatter(df, **scatter_options)

    # change an attribute of a trace through fig.data: recolor the last trace
    # NOTE(review): the last trace is presumably the trendline - confirm
    last_trace = fig.data[-1]
    last_trace.marker.color = '#d62728'  # brick red

    # title: centered (title_x = 0.5) and with a fixed font size
    title_options = dict(title_text=plot_title,
                         title_x=0.5,
                         title_font=dict(size=20))
    fig.update_layout(**title_options)

    return fig

## Transversal functions to transform data

### 1. Segmentation data: custom segmentation - percentile segmentation

In [7]:
##### SEGMENTATION DATA CUSTOM SEGMENTATION - PERCENTILE SEGMENTATION
""""""""""""""""""""""""""""""""""""""""""""""""" SEGMENTATION DATA CUSTOM """""""""""""""""""""""""""""""""""""""""""""""""
def custom_segmentation(df, var_segment, intervals_segments, labels_segments):
    """
    Segment the data by a variable using custom thresholds.

    A categorical column named '<var_segment>_segments' is written into the
    input dataframe (the input is modified in place), and a copy of it sorted
    by `var_segment` is returned.

    Args
        df (dataframe): dataframe input
        var_segment (string): variable feature/target used to segment the data
        intervals_segments (list of numbers): list with the thresholds used to segment the data
        labels_segments (list of strings): list with the names of the different segments generated. Shape: len(intervals_segments) - 1

    Return
        df (dataframe): the dataframe with the new segment column, sorted by `var_segment`
    """
    segment_column = f'{var_segment}_segments'

    # pd.cut assigns each value to its interval; include_lowest closes the
    # first interval on the left so the lowest threshold itself is included
    segments = pd.cut(df[var_segment],
                      bins=intervals_segments,
                      labels=labels_segments,
                      include_lowest=True)
    df[segment_column] = segments

    # sort by the segmented variable - plots by segment need the data sorted.
    # plots that show a temporal relation (e.g. trends) need the data sorted
    # by index again
    return df.sort_values(by=[var_segment])
"""""""""""""""""""""""""""""""""""""""""""""""""  """""""""""""""""""""""""""""""""""""""""""""""""


""""""""""""""""""""""""""""""""""""""""""""""""" SEGMENTATION DATA PERCENTILES """""""""""""""""""""""""""""""""""""""""""""""""
def generate_labels_percentile_segmentation(df, var_segment, list_percentile, list_labels_percentile_base):
    """
    Build the labels of a percentile segmentation of one feature.

    Three kinds of labels are produced:
        labels_percentile: ['q1', 'q2', 'q3', 'q4']
        labels_values: ['(0.15-1.2)', '(1.2-1.8)', '(1.8-2.65)', '(2.65-5.0)']
        labels_percentile_values: ['q1 - (0.15-1.2)', 'q2 - (1.2-1.8)', 'q3 - (1.8-2.65)', 'q4 - (2.65-5.0)']

    Args
        df (dataframe): dataframe input
        var_segment (string): variable feature/target used to segment the data
        list_percentile (list): list of floats with the percentiles that delimit the segments
        list_labels_percentile_base (list): list of strings with the base label of each segment

    Return
        list_labels_percentile_base, list_labels_values_range, list_labels_percentile_values_range (lists):
            the base labels (returned unchanged), the value-range labels and the combined labels
    """
    # value of the feature at each percentile, rounded to 2 decimals
    percentile_values = [df[var_segment].quantile(q).round(2) for q in list_percentile]

    # 'start-end' string of every interval between two consecutive percentiles
    interval_ranges = [
        f'{start_value}-{end_value}'
        for start_value, end_value in zip(percentile_values, percentile_values[1:])
    ]

    # v1 - only the value range of each segment
    # (ranges are read by position so there is one label per base label)
    list_labels_values_range = [
        f'({interval_ranges[position]})'
        for position, _ in enumerate(list_labels_percentile_base)
    ]

    # v2 - base label followed by the value range
    list_labels_percentile_values_range = [
        f'{base_label} - ({interval_ranges[position]})'
        for position, base_label in enumerate(list_labels_percentile_base)
    ]

    return list_labels_percentile_base, list_labels_values_range, list_labels_percentile_values_range



def percentile_segmentation(df, var_segment, type_percentile):
    """
    Segment the data by a variable using percentiles (quartiles, quintiles or deciles).

    A categorical column named '<type_percentile>_<var_segment>' is written into
    the input dataframe. Its labels have the form 'q1 - (0.15-1.2)' (base label
    plus value range).

    Args
        df (dataframe): dataframe input that will be modified
        var_segment (string): variable feature/target used to segment the data
        type_percentile (string): type of percentile segmentation: 'quartile', 'quintile' or 'decile'

    Return
        df (dataframe): the input dataframe with the new segment column.
            When `type_percentile` is not a valid choice, a message is printed and 0 is returned

    TODO: THE LABELS GENERATED AND USED ARE ONLY ['q1 - (0.15-1.2)', 'q2 - (1.2-1.8)', 'q3 - (1.8-2.65)', 'q4 - (2.65-5.0)']
    ADD AN ARG TO SELECT THE KIND OF LABELS
    """
    # per segmentation type: (percentiles that delimit the segments, base labels)
    segmentation_specs = {
        'quartile': ([0, 0.25, 0.5, 0.75, 1],
                     ['q1', 'q2', 'q3', 'q4']),
        'quintile': ([0, 0.2, 0.4, 0.6, 0.8, 1],
                     ['q1', 'q2', 'q3', 'q4', 'q5']),
        'decile': ([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
                   ['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'd10']),
    }

    # validate input - TODO: create a decent unit test
    choices_segmentation = list(segmentation_specs)
    if type_percentile not in choices_segmentation:
        print('error in choices of segmentation')
        print(f'Possibles choices: {choices_segmentation}')
        return 0

    percentiles, base_labels = segmentation_specs[type_percentile]

    # only the combined labels ('base - (start-end)') are used
    _, _, segment_labels = generate_labels_percentile_segmentation(df, var_segment, percentiles, base_labels)

    segment_column = f'{type_percentile}_{var_segment}'
    df[segment_column] = pd.qcut(df[var_segment], q = percentiles, labels = segment_labels)

    return df
"""""""""""""""""""""""""""""""""""""""""""""""""  """""""""""""""""""""""""""""""""""""""""""""""""

'  '

### 2. map pair of features. generate a list of tuple of each combination of features (feature_x, feature_y)

In [8]:
# auxiliary function to map features in a pair (feature_x, feature_y)
def list_map_combinations_features(list_features, dim_combinations = 2):
    """
    Given a list of features of a dataframe, map all the combinations between the features.
    Combinations are without replacement and (a, b) is considered the same as (b, a),
    so only one of them is generated.
    IN PREVIOUS CODES THERE ARE OTHER WAYS TO MAP THE FEATURES, ACTUALLY THIS WAY IS BETTER

    Args:
        list_features (list): list of features used to generate the combinations
        dim_combinations (integer): size of each combination. default 2 -> generate pairs of features (feature_x, feature_y)

    Return
        list_tuple_combinations (list): list where each element is a tuple with one combination,
            in the order produced by itertools.combinations
    """
    # imported here because the import cell of this file does not bring
    # `combinations` into scope
    from itertools import combinations

    # all the possible combinations without repetition, materialized as a list
    list_tuple_combinations = list(combinations(list_features, dim_combinations))

    return list_tuple_combinations