In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the first example dataset. The plotting and fitting cells below read
# its 'x' and 'y' columns.
filename = '03d - non_param_1.csv'
df = pd.read_csv(filename)
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Line plot of the raw data, drawn through an explicit Figure/Axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(df['x'], df['y'], color='blue', label='data')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Bin-count setting. NOTE(review): no cell visible in this notebook reads this
# name; the StepModel instances below are built with literal bin counts
# (20, 25, 30). Confirm whether it is still needed.
number_of_bins = 4

In [None]:
class StepModel:
    """Piecewise-constant (step) regression model.

    The training points are sorted by x and split into ``num_bins``
    consecutive, nearly equal-sized bins. Each bin is summarised by its
    lowest x value (the bin's threshold) and the mean of its y values.
    A prediction returns the mean y of the bin the query x falls into.
    """

    def __init__(self, num_bins = 10):
        """Create an unfitted model.

        Args:
            num_bins: Number of bins the training data is split into.
        """
        self.num_bins = num_bins
        self.x_thresholds = []         # Lowest x value of each bin, ascending
        self.y_values = []             # Mean y of the corresponding bin
        self.x_min = 0                 # Smallest x seen by the last fit
        self.x_max = 0                 # Largest x seen by the last fit

    def fit(self, x, y):
        """Fit the model to paired samples, replacing any previous fit.

        Args:
            x: Sequence of x values; does not need to be sorted.
            y: Sequence of y values, paired with ``x`` by position.

        Raises:
            ValueError: If ``num_bins`` is less than 1, if there are fewer
                data points than bins, or if ``x`` and ``y`` differ in length.
        """
        x_len = len(x)
        y_len = len(y)

        # A little bit of sanity checking
        if self.num_bins < 1:
            raise ValueError("Error : Number of bins must be at least 1")

        # more bins than data .. error out
        if x_len < self.num_bins:
            raise ValueError("Error : More bins that data-points")

        # need to have equal amount of data in both x and y
        if x_len != y_len:
            raise ValueError("Error : x and y data has different lengths")

        # Sort the (x, y) pairs by x so every bin covers a contiguous x range
        # while each y stays attached to its x.
        sorted_pairs = sorted(zip(x, y), key=lambda pair: pair[0])
        sorted_x, sorted_y = zip(*sorted_pairs)

        # Split the sorted data into consecutive bins.
        x_bins = np.array_split(np.asarray(sorted_x), self.num_bins)
        y_bins = np.array_split(np.asarray(sorted_y), self.num_bins)

        # Assign fresh lists (rather than appending) so that calling fit()
        # again does not accumulate bins from an earlier fit.
        # Each bin is sorted, so its first element is its lower threshold.
        self.x_thresholds = [bin_x[0] for bin_x in x_bins]
        self.y_values = [float(np.mean(bin_y)) for bin_y in y_bins]

        # Remember the range of x over which the model was trained. Taken
        # from the sorted data so unsorted input still gives the true range.
        self.x_min = sorted_x[0]
        self.x_max = sorted_x[-1]

    def get_results(self, x):
        """Return the model's prediction for a single x value.

        Args:
            x: Scalar x value to evaluate.

        Returns:
            The mean y of the bin with the largest threshold that does not
            exceed ``x``. Values below the training range get the first
            bin's value; values above it get the last bin's value.

        Raises:
            IndexError: If the model has not been fitted yet.
        """
        # Index of the first threshold strictly greater than x; the bin that
        # contains x is the one just before it.
        upper = np.searchsorted(self.x_thresholds, x, side='right')

        # upper == 0 means x lies below every threshold: clamp to first bin.
        return self.y_values[max(upper - 1, 0)]

In [None]:
# Build a step model that splits the training data into 20 bins.
step_model_obj = StepModel(20)

In [None]:
# Train the model on the 'x' and 'y' columns, passed as plain Python lists.
step_model_obj.fit(df['x'].to_list(), df['y'].to_list())

In [None]:
# Evaluate the fitted model at every x value and store the predictions in a
# new 'step' column (get_results is called once per row).
df['step'] = df['x'].apply(step_model_obj.get_results)

In [None]:
# Overlay the step model's predictions (red) on the raw data (blue).
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(df['x'], df['y'], color='blue', label='data')
ax.plot(df['x'], df['step'], color='red', label='model')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Switch to the second example dataset; `filename` and `df` are rebound,
# replacing the previously loaded data.
filename = '03d - non_param_2.csv'
df = pd.read_csv(filename)

In [None]:
# Build a new 25-bin step model, train it on the current frame's 'x'/'y'
# columns, and store its per-row predictions in a 'step' column.
step_model_obj = StepModel(25)
step_model_obj.fit(df['x'].to_list(), df['y'].to_list())
df['step'] = df['x'].apply(step_model_obj.get_results)

In [None]:
# Overlay the step model's predictions (red) on the raw data (blue).
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(df['x'], df['y'], color='blue', label='data')
ax.plot(df['x'], df['step'], color='red', label='model')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Switch to the third example dataset; `filename` and `df` are rebound,
# replacing the previously loaded data.
filename = '03d - non_param_3.csv'
df = pd.read_csv(filename)

In [None]:
# Build a new 30-bin step model, train it on the current frame's 'x'/'y'
# columns, and store its per-row predictions in a 'step' column.
step_model_obj = StepModel(30)
step_model_obj.fit(df['x'].to_list(), df['y'].to_list())
df['step'] = df['x'].apply(step_model_obj.get_results)

In [None]:
# Overlay the step model's predictions (red) on the raw data (blue).
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(df['x'], df['y'], color='blue', label='data')
ax.plot(df['x'], df['step'], color='red', label='model')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.legend()
ax.grid(True)
plt.show()