In [1]:
"""
Work done by Anand Chauhan, for Cleantech Solar's Data Analyst Internship
assignment.
"""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap, BoundaryNorm

class CustomDataPlotter:
    """
    Object-oriented plotter for the "Performance Ratio Evolution" chart.

    The class reads an Excel sheet containing the columns `Date`, `GHI` and `PR`, derives a
    yearly decaying `Budget` column, and draws a scatter plot of PR coloured by GHI band together
    with a 30-day moving average, the budget line, a colorbar, a legend and summary statistics.
    """

    def __init__(
        self,
        path_to_data: str,
        list_of_colours: list[str] = ["navy", "lightblue", "orange", "brown"],
        initial_budget: float = 73.9,
        annual_budget_decay: float = 0.008,
        bounds: list = [0, 2, 4, 6, 8]
    ):
        """
        Store the plot configuration and load the data file immediately.

        Args:
            `path_to_data` (str): where the data file is located. Must contain the name of the file.

            `list_of_colours` (list): List of colours that are applied to the GHI bands. Each entry
            must be a colour specification that `matplotlib` understands.

            `initial_budget` (float): The initial value of the 'Budget' column.

            `annual_budget_decay` (float): The annual rate of decay for 'Budget', expressed as a
            fraction (for example `0.8%` is written as `0.008`). The budget is multiplied by
            `1 - annual_budget_decay` on every anniversary of the first date in the data.

            `bounds` (list): Boundaries at which the colour changes. The inner values are the
            inclusive upper limits of each band; the first and last values are padding needed by
            the colorbar, so this list is one element longer than `list_of_colours`. For example,
            for dividing points 2, 4 and 6, pass `[0, 2, 4, 6, 8]` along with 4 colours. The
            padding must be uniform for the colorbar tick labels to line up with the bands.
        """
        self.filepath = path_to_data
        # Copy the lists so that neither the caller's objects nor the shared default lists in the
        # signature can be mutated through (or behind the back of) this instance.
        self.list_of_colours = list(list_of_colours)
        self.init_budget = initial_budget
        self.decay = annual_budget_decay
        self.bounds = list(bounds)
        self.df = self.load_file()

    @staticmethod
    def _years_since_start(index: pd.DatetimeIndex) -> np.ndarray:
        """
        Return the number of whole calendar years between `index[0]` and every entry of `index`.

        A year is counted as complete on the anniversary (same month and day) of the first date.
        Dividing the day difference by 365.25 would flip one day late whenever the elapsed span
        contains no leap day, so the comparison is done on the calendar fields instead.
        """
        start = index[0]
        month = np.asarray(index.month)
        day = np.asarray(index.day)
        before_anniversary = (month < start.month) | ((month == start.month) & (day < start.day))
        return np.asarray(index.year) - start.year - before_anniversary.astype(int)

    def load_file(self) -> pd.DataFrame:
        """
        Load the data from `self.filepath` and append the 'Budget' column. Called at the end of
        `__init__`.

        The file is read with `pandas.read_excel`; its `Date` column is parsed to datetimes and
        becomes the index. The budget starts at `self.init_budget` and is multiplied by
        `1 - self.decay` once for every full year elapsed since the first row's date.

        Returns:
            pd.DataFrame: The data indexed by date, with the additional 'Budget' column.
        """
        df = pd.read_excel(self.filepath)
        df["Date"] = pd.to_datetime(df["Date"])
        df = df.set_index("Date")
        years_since_start = self._years_since_start(df.index)
        df["Budget"] = self.init_budget * (1 - self.decay) ** years_since_start
        return df

    def generate_colours(self) -> pd.Series:
        """
        Apply `colour_util` to every GHI value of the dataframe.

        Returns:
            pd.Series: A series whose index is the GHI values from the dataframe and whose values
            are the corresponding entries of `list_of_colours`.
        """
        ghi = self.df["GHI"]
        return ghi.apply(self.colour_util).set_axis(ghi)

    def colour_util(self, value: float) -> str:
        """
        Pick the colour of the band that `value` falls into.

        Only the inner entries of `bounds` are used; each one is the inclusive upper limit of a
        band. Anything above the last inner bound gets the last colour.

        Args:
            value (float): The GHI value for which the colour is looked up.

        Returns:
            str: The corresponding member of the `list_of_colours` attribute.
        """
        for i, bound in enumerate(self.bounds[1:-1]):
            if value <= bound:
                return self.list_of_colours[i]
        return self.list_of_colours[-1]

    def set_title(self):
        """
        Create the figure and axes, and set the title from the first and last date in the data.
        """
        self.fig, self.ax = plt.subplots(figsize=(16, 9))
        self.fig.set_facecolor('white')
        from_date = self.df.index[0].strftime('%Y-%m-%d')
        to_date = self.df.index[-1].strftime('%Y-%m-%d')
        title = f"Performance Ratio Evolution\nFrom {from_date} to {to_date}"
        self.ax.set_title(title, fontweight='bold')

    def plot_lines(self):
        """
        Plot the 30-day moving average of PR, the budget line and the PR scatter plot, and add the
        legend entry reporting how many points lie above the budget.
        """
        label_30d = "30-d Moving Average of PR"
        moving_average = self.df['PR'].rolling(window=30).mean()
        self.ax.plot(moving_average, color="red", linewidth=4, label=label_30d)

        label_bud = self.generate_budget_label()
        self.ax.plot(self.df["Budget"], color="darkgreen", linewidth=3, label=label_bud)

        # Invisible artist whose only purpose is to carry the statistics line into the legend.
        total_points = len(self.df)
        count_above_budget = int((self.df['PR'] > self.df['Budget']).sum())
        target_perc = count_above_budget * 100 / total_points
        label = f"Points above target budget PR = {count_above_budget}/{total_points} = {target_perc:.1f}%"
        self.ax.plot([], [], ' ', label=label)

        self.ax.set_ylim(0, 110)  # fine to hardcode since y axis is a percentage.
        self.ax.locator_params(axis='y', nbins=11)
        self.ax.set_ylabel("Performance Ratio (%)")
        self.ax.set_xlim(self.df.index[0], self.df.index[-1])

        # Scatter plot. The colours are already resolved per point, so no cmap/norm is passed:
        # matplotlib ignores them (with a warning) when `c` holds explicit colours.
        self.ax.scatter(self.df.index, self.df["PR"], c=self.generate_colours().tolist(), s=35)

    def add_colorbar(self):
        """
        Add the discrete colorbar built from `bounds` and `list_of_colours` next to the main axes.
        """
        colormap = ListedColormap(self.list_of_colours)
        norm = BoundaryNorm(self.bounds, colormap.N)
        # The mappable is not attached to any axes, so the axes to take space from must be named.
        colorbar = self.fig.colorbar(cm.ScalarMappable(norm=norm, cmap=colormap), ax=self.ax,
                                     spacing='uniform', label='Daily Irradiation [kWh/m2]',
                                     boundaries=self.bounds)
        # One tick on every bound and one in the middle of every band (assumes uniform bounds).
        tick_positions = np.linspace(self.bounds[0], self.bounds[-1], 2*len(self.bounds)-1)
        colorbar.set_ticks(tick_positions)
        colorbar.set_ticklabels(self.generate_ticklabels())

    def generate_budget_label(self) -> str:
        """
        Build the legend label listing the budget value of each year, e.g.
        `Target Budget Yield Performance Ratio [1Y-73.9%, 2Y-73.3%]`. One entry is produced per
        distinct value in the 'Budget' column, so the label grows with the time span of the data.

        Returns:
            str: the label for the budget line.
        """
        yearly = [f"{year}Y-{budget:.1f}%"
                  for year, budget in enumerate(self.df['Budget'].unique(), start=1)]
        return "Target Budget Yield Performance Ratio [" + ', '.join(yearly) + ']'

    def generate_ticklabels(self) -> list:
        """
        Create the tick labels for the colorbar. Called by `add_colorbar`.

        Ticks sitting on a bound get an empty label and ticks in the middle of a band get the
        band's description (`< first`, `low-high`, `> last`).

        Returns:
            list: `2*len(bounds) - 1` labels, alternating between empty strings and band labels.
        """
        last = len(self.bounds) - 1
        ticklabels = [""]
        for i, bound in enumerate(self.bounds):
            if i == 1:
                ticklabels.extend([f"< {bound}", ""])
            elif 1 < i < last:
                ticklabels.extend([f"{self.bounds[i-1]}-{bound}", ""])
            elif i == last:
                ticklabels.extend([f"> {self.bounds[i-1]}", ""])
        return ticklabels

    def configure_legend_and_text(self):
        """
        Add the legend, the grid and the box with the average-PR statistics. Positions and font
        settings are hard-coded for the layout of the assignment chart.
        """
        self.ax.legend(loc='center', bbox_to_anchor=(0.4, 0.38))
        self.ax.grid(True, which='major', alpha=0.5)
        # Anchor the text 300 rows from the end, or at the first row when the data is shorter.
        anchor = self.df.index[-min(300, len(self.df))]
        text = self.ax.text(anchor, 2, self.generate_mean_stats(), fontsize=14, linespacing=2)
        text.set_bbox(dict(facecolor='white', alpha=0.95, linewidth=0))

    def generate_mean_stats(self) -> str:
        """
        Generate the text with the average PR over the last 7, 30, 60, 90 and 365 rows and over
        the whole dataset, one statistic per line.
        """
        pr = self.df['PR']
        lines = [f"Average PR last {days}-d: {pr.iloc[-days:].mean():.1f}%"
                 for days in (7, 30, 60, 90, 365)]
        lines.append(f"Average PR Lifetime: {pr.mean():.1f}%")
        return "\n".join(lines)

    def save_plot(self, save_path: str = None, save_format='svg'):
        """
        Save the figure as `output_plot.<save_format>` using `Figure.savefig`.

        Args:
            `save_path` (str, optional): Directory to save into. If the directory does not exist,
            it is created. Defaults to None, which saves into the current working directory.
            `save_format` (str, optional): File extension / format passed to matplotlib. Defaults
            to 'svg'.
        """
        filename = 'output_plot.' + save_format
        if save_path:
            os.makedirs(save_path, exist_ok=True)
            target = os.path.join(save_path, filename)
        else:
            # No directory requested: `os.makedirs(None)` would raise, so write to the cwd.
            target = filename
        self.fig.savefig(target, format=save_format)

    def plot_data(self, save_to_disk=False, show=True, save_format='svg', save_path=None):
        """
        The main 'runner' method: builds the complete chart by calling the other methods of this
        class, then optionally saves and/or displays it.

        Args:
            `save_to_disk` (bool, optional): Whether to save the generated graph to disk. Defaults
            to False.

            `show` (bool, optional): Whether to display the graph with `plt.show()`. Defaults to
            True.

            `save_format` (str, optional): Desired format for saving the graph. Defaults to 'svg'.

            `save_path` (str, optional): Directory for saving the graph. If the directory does not
            exist, it is created. Defaults to None (current working directory).
        """
        self.set_title()
        self.plot_lines()
        self.add_colorbar()
        self.configure_legend_and_text()
        # Save first: some backends release the figure once `plt.show()` returns.
        if save_to_disk:
            self.save_plot(save_path, save_format)
        if show:
            plt.show()

In [2]:
# Load the custom arguments that were defined in the assignment brief.

# The file must be uploaded to the runtime first; the path can also be a URL.
path_to_data = "/content/Assignment_Dataset.xlsx"

# colours taken from the sample image
list_colours = ["#2628ec", "#68baf5", "#eea02d", "#9d6e42"]
initial_budget = 73.9
decay = 0.008
bounds = [0, 2, 4, 6, 8]  # please see docstrings to learn how to format this
plotter = CustomDataPlotter(path_to_data, list_colours, initial_budget, decay, bounds)
plotter.plot_data(show=True, save_to_disk=False)  # runner function

HTTPError: ignored