In [1]:
import pandas as pd
import numpy as np
import sys
from sqlalchemy import create_engine
from bokeh.plotting import figure, output_notebook, show

In [2]:
class Utility:
    """
    Utility class to store and visualize data

    1. This class creates a sqlite db file and in it creates 3 Tables namely
    Train, Ideal and Test.
    The outline of Train Table is as
        x y1 y2 y3 y4
    The Ideal Table looks like
        x y1 y2 y3 y4 --- y50
    The Test Table looks like
        x y Mapped Ideal Function Deviation

    2. It is also used to plot scatter plots of train dataset along with best
    fit ideal function

    Attributes:
        engine (sqlalchemy.engine.base.Engine): Engine Object Reference
        ideal_df (pandas.core.frame.DataFrame): Ideal dataset
        train_df (pandas.core.frame.DataFrame): Train dataset
    """
    def __init__(self):
        """
        Constructor of Utility class
        The engine for the sqlite database file 'assignment.db' is created
        as part of initialization, and the train and ideal datasets are
        fetched from the shared DataHandler instance.
        """
        # Create sqlite database engine
        self.engine = create_engine('sqlite:///assignment.db')
        dh = DataHandler.get_instance()
        self.train_df = dh.get_train_data()
        self.ideal_df = dh.get_ideal_data()

    @staticmethod
    def filename(pre):
        """
        Build an html file name from a prefix.

        Parameters:
            pre (str): File name without extension

        :return: str, the prefix followed by '.html'
        """
        return f'{pre}.html'

    def update_db(self, test_results_df):
        """
        This method is used to create and update Tables in the sqlite database
        created while class initialization.
        Train, Ideal and Test Tables

        Parameters:
            test_results_df (pandas.core.frame.DataFrame): Data for Test Table
        """
        tables = (
            ('Train', self.train_df),
            ('Ideal', self.ideal_df),
            ('Test', test_results_df),
        )
        # Each table is replaced if it already exists; the dataframe index
        # is not written as a column.
        for table_name, frame in tables:
            frame.to_sql(table_name, con=self.engine, if_exists='replace',
                         index=False)

    def plot(self, train_results):
        """
        This method is used to plot scatter plot for train data and chosen
        ideal data on single plot so as to check if they fit.
        Train points are drawn in blue, ideal points in yellow.

        Parameters:
            train_results (dict): Best fit results, keyed by train function
            name; the first element of each value is the name of the chosen
            ideal function.
        """
        # Configure notebook output once instead of once per figure
        output_notebook()
        # For each function in Train dataset (first column is x)
        for col in self.train_df.columns[1:]:
            # Get Chosen ideal function mapped to current train function
            ideal_fn = train_results[col][0]
            # Both scatters share x on the horizontal axis and function
            # values on the vertical axis, so label the axes accordingly.
            comp_plt = figure(title=f'Ideal Function {ideal_fn} VS  Train '
                                    f'Function {col}',
                              x_axis_label='x',
                              y_axis_label='y')
            comp_plt.scatter(self.train_df['x'], self.train_df[col],
                             size=5, color='blue', alpha=0.5)
            comp_plt.scatter(self.ideal_df['x'], self.ideal_df[ideal_fn],
                             size=3, color='yellow', alpha=0.5)
            show(comp_plt)


In [3]:
class DataHandler:
    """
    Singleton class for accessing dataset files

    This singleton class is used to share data, loaded from files into pandas
    Dataframe, among multiple classes. The file paths are taken from
    sys.argv[1] (train), sys.argv[2] (ideal) and sys.argv[3] (test).

    Attributes:
        train_data (pandas.core.frame.DataFrame): Training dataset
        ideal_data (pandas.core.frame.DataFrame): Ideal dataset
        test_data (pandas.core.frame.DataFrame): Test dataset
    """
    __instance = None
    train_data = pd.DataFrame()
    ideal_data = pd.DataFrame()
    test_data = pd.DataFrame()

    @staticmethod
    def get_instance():
        """
        Static method to get the instance of DataHandler
        The instance is created on first use.
        :return: DataHandler instance
        """
        # Static access method
        if DataHandler.__instance is None:
            return DataHandler()
        return DataHandler.__instance

    def __init__(self):
        """
        Constructor of singleton DataHandler class
        It loads training, ideal and test datasets in train_data,
        ideal_data and test_data dataframes.

        The process is terminated through sys.exit(1) when a file cannot be
        loaded or when any of the datasets is empty.

        Raises:
            Exception: If an instance has already been created
        """
        # Virtually private constructor
        if DataHandler.__instance is not None:
            raise Exception("This class is a singleton!")

        try:
            self.train_data = pd.read_csv(sys.argv[1])
            self.ideal_data = pd.read_csv(sys.argv[2])
            self.test_data = pd.read_csv(sys.argv[3])
        except Exception as e:
            print("Exception while loading data to dataFrame: ", e)
            sys.exit(1)

        # Checked in the order ideal, train, test
        datasets = (("Ideal", self.ideal_data),
                    ("Train", self.train_data),
                    ("Test", self.test_data))
        for label, frame in datasets:
            if frame.empty:
                print(f"Empty {label} Dataset")
                sys.exit(1)

        print("Data Loading to dataframe successful..")
        # Register the singleton only once the data is loaded and validated,
        # so that a failed load never leaves a half-initialised instance
        # behind for get_instance() to hand out.
        DataHandler.__instance = self

    def get_train_data(self):
        """
        Used to get Training data frame
        :return: pandas.core.frame.DataFrame
        """
        return self.train_data

    def get_ideal_data(self):
        """
        Used to get Ideal data frame
        :return: pandas.core.frame.DataFrame
        """
        return self.ideal_data

    def get_test_data(self):
        """
        Used to get Test data frame
        :return: pandas.core.frame.DataFrame
        """
        return self.test_data


# Create (or reuse) the shared DataHandler instance through its accessor,
# like every other caller does, so an already existing instance is returned
# instead of triggering the singleton constructor error.
s = DataHandler.get_instance()


Data Loading to dataframe successful..


In [4]:
class Train:
    """
    Model Training class
    This class uses training dataset to choose the four ideal functions
    which are the best fit out of the fifty provided in ideal dataset.
    How they minimize the sum of all y-deviations squared (Least-Square) is
    the criteria for choosing the ideal functions for the training function.

    Attributes:
        train_df (pandas.core.frame.DataFrame): Training dataset
        ideal_df (pandas.core.frame.DataFrame): Ideal dataset
        train_results (dict): Stores 4 chosen ideal functions along with
        Maximum deviation, minimum sum of square deviations for each training
        function.
        {'training_function': ('Ideal_function',  'Maximum_deviation',
        'minimum_sum_squared_deviations')}
    """

    def __init__(self):
        """
        Constructor of Train class
        Fetches the train and ideal datasets from the shared DataHandler
        instance and starts with empty training results.
        """

        # Load Train and Ideal datasets
        dh = DataHandler.get_instance()
        self.train_df = dh.get_train_data()
        self.ideal_df = dh.get_ideal_data()
        self.train_results = {}

    def train_model(self):
        """
        This method is used to map each training function to ideal function
        based on minimum sum of squares deviation criteria. It also stores
        the results in a dictionary with following structure
        {'training_function': ('Ideal_function', 'Maximum_deviation',
                                'minimum_sum_squared_deviations')}

        The first column of both datasets is skipped (it holds x). Values
        are compared row by row, so both datasets are expected to list the
        same x values in the same order.
        When several ideal functions tie, the first one wins.
        """

        # For each column in train dataset check best fit
        for t_col in self.train_df.columns[1:]:
            train_values = self.train_df[t_col].to_numpy()
            # Start from infinity so that any finite sum of squared
            # deviations, however large, can become the current minimum.
            least_square_error = np.inf
            best_fit = None
            # For each ideal function check
            # if best fit for current train column
            for i_col in self.ideal_df.columns[1:]:
                # Calculate absolute deviation per row
                div = np.absolute(train_values -
                                  self.ideal_df[i_col].to_numpy())
                # Calculate sum of squared deviations
                lse = np.sum(np.square(div))
                # Strict comparison keeps the first function on ties
                if lse < least_square_error:
                    least_square_error = lse
                    best_fit = (i_col, div.max(), least_square_error)
            # Store best fit ideal function and related parameters once
            if best_fit is not None:
                self.train_results[t_col] = best_fit


In [5]:
class Test:
    """
    Test class to map test data to ideal functions chosen while training.
    Determines for each and every x-y pair of values in test dataset whether
    or not they can be assigned to the four chosen ideal function.
    The criteria to check whether or not they can be assigned to the four chosen
    ideal function is that the existing maximum deviation of the calculated
    regression for test data does not exceed the largest deviation between
    training dataset and the ideal function chosen for it by more than factor
    sqrt(2).

    Attributes:
        test_df (pandas.core.frame.DataFrame): Test dataset
        ideal_df (pandas.core.frame.DataFrame): Ideal dataset
        test_results (list): Test results as
            [(x, y, mapped_ideal_function, deviation)]
    """

    def __init__(self):
        """
        Constructor for Test class
        Fetches the test and ideal datasets from the shared DataHandler
        instance and starts with an empty result list.
        """
        # Load Test and Ideal datasets
        dh = DataHandler.get_instance()
        self.test_df = dh.get_test_data()
        self.ideal_df = dh.get_ideal_data()
        self.test_results = []

    def test_model(self, train_results):
        """
        This function is used to map each (x,y) pair in test dataset to an
        ideal function chosen while training the model. The criteria to check
        whether or not they can be assigned to the four chosen ideal function
        is that the existing maximum deviation of the calculated regression
        for test data does not exceed the largest deviation between training
        dataset and the ideal function chosen for it by more than factor sqrt(2)
        It also Store test results in following format
        [(x, y, mapped_ideal_function, deviation)]

        A pair is mapped to the first chosen ideal function (in the order of
        train_results) that satisfies the criteria; pairs that satisfy it
        for none of them are not stored. Pairs whose x does not occur in the
        ideal dataset are skipped. The first two columns of the test dataset
        are read as x and y. Results are appended to test_results.

        Parameters:
            train_results (dict): Training result stored in following format
            {'training_function': ('Ideal_function', 'Maximum_deviation',
            'minimum_sum_squared_deviations')}
        """
        tolerance_factor = np.sqrt(2)
        ideal_x = self.ideal_df['x'].to_numpy()
        # Explicit positional column access; integer keys on a row Series
        # are not reliable positional lookups in pandas.
        test_x = self.test_df.iloc[:, 0].to_numpy()
        test_y = self.test_df.iloc[:, 1].to_numpy()

        # For each (x-y) pair in Test dataset
        for x_val, y_val in zip(test_x, test_y):
            # Locate x in ideal dataset (once per pair, not per function)
            matches = np.flatnonzero(ideal_x == x_val)
            if matches.size == 0:
                # No ideal value to compare against for this x
                continue
            row = matches[0]
            # For each row in train results
            for result in train_results.values():
                # Get the chosen ideal function
                ideal_fn = result[0]
                # Maximum deviation of train data from chosen ideal function
                max_deviation = result[1]
                # Calculate deviation of test y from chosen ideal function for x
                deviation = np.absolute(
                    y_val - self.ideal_df[ideal_fn].iloc[row])
                # "Does not exceed": a deviation equal to the limit passes
                if deviation <= tolerance_factor * max_deviation:
                    # Store the result as test data mapped pass and break
                    self.test_results.append((x_val, y_val, ideal_fn,
                                              deviation))
                    break


In [6]:
class CurveFitting(Train, Test):
    """
    Main class of the assignment, combining the Train and Test classes.

    1. Train
    The training dataset is used to pick, out of the fifty functions in the
    ideal dataset, the four that fit best. The criterion is the minimum sum
    of squared y-deviations (Least-Square) per training function.

    2. Test
    Each x-y pair of the test dataset is checked against the four chosen
    ideal functions. A pair can be assigned when its deviation does not
    exceed the largest deviation between the training dataset and the
    chosen ideal function by more than factor sqrt(2).

    3. Store Data in database
    Training data, ideal function data and tested data are written to a
    sqlite database in the following format.
    - Training dataset as
            x y1 y2 y3 y4
    - Ideal function dataset as
            x y1 y2 y3 y4 --- y50
    - Tested data as
            x y Mapped_Ideal_Function Deviation

    Parent Class:
        Train
        Test

    Attributes:
        utility (Class Utility): Utility object used for storage and plots
    """

    def __init__(self):
        """
        Constructor of CurveFitting class
        Runs the Train and Test initializers on this object, in that order,
        and then creates the Utility helper.
        """
        # Initialize both parent classes explicitly, Train first
        for parent in (Train, Test):
            parent.__init__(self)
        # Helper object used for the sqlite database and the plots
        self.utility = Utility()

    def start(self):
        """
        Entry point of the algorithm.
        The model is trained, then tested; the test results are stored in
        the database and finally the fits are plotted.
        """
        # Train the model, then test it against the training results
        self.train_model()
        self.test_model(self.train_results)

        # Turn the collected result tuples into a dataframe for storage
        result_columns = ['x', 'y', 'mapping', 'deviation']
        test_results_df = pd.DataFrame(self.test_results,
                                       columns=result_columns)

        # Create and Update Train, Ideal and Test Tables in sqlite
        self.utility.update_db(test_results_df)
        # Visualize results
        self.utility.plot(self.train_results)


if __name__ == '__main__':
    # The command-line usage check below is disabled:
    #
    #     if 4 != len(sys.argv):
    #         print("Usage-----\n curvefitting.py train_dataset_path "
    #               "ideal_dataset_path test_dataset_path")
    #         sys.exit(1)

    # Build the application object; a construction error is only reported,
    # and the algorithm is started solely when construction succeeded.
    try:
        ls = CurveFitting()
    except Exception as err:
        print(err)
    else:
        ls.start()
