In [1]:
# Installs the listed packages with pip from inside the notebook.
# NOTE(review): a bare `pip ...` line is not Python; it only works while IPython
# automagic resolves it to the %pip magic. No versions are pinned, so a re-run
# may install different releases than the ones this notebook was written against.
pip install pandas sqlalchemy bokeh numpy matplotlib scipy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

import pandas as pd

# Folder that holds train.csv, test.csv and ideal.csv.
# Set the DATASET_DIR environment variable to run the notebook on another
# machine; the fallback is the original local download folder, so the default
# behaviour is unchanged.
DATA_DIR = os.environ.get("DATASET_DIR", r"C:\Users\giras\Downloads\MASTERSAssgn\Dataset2")


def load_dataset(csv_path):
    """Read one CSV file and strip surrounding whitespace from its column names.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame: The file contents with cleaned column labels.
    """
    frame = pd.read_csv(csv_path)
    # Strip any extra spaces in column names
    frame.columns = frame.columns.str.strip()
    return frame


# Load the datasets
train_file = os.path.join(DATA_DIR, "train.csv")
test_file = os.path.join(DATA_DIR, "test.csv")
ideal_file = os.path.join(DATA_DIR, "ideal.csv")

train_data = load_dataset(train_file)
test_data = load_dataset(test_file)
ideal_functions = load_dataset(ideal_file)

# Step 2: Examine the column names and first few rows
dataset_previews = [
    ("Training Data", train_data),
    ("Test Data", test_data),
    ("Ideal Functions Data", ideal_functions),
]
for position, (label, frame) in enumerate(dataset_previews):
    # Every section except the first is preceded by a blank line.
    separator = "\n" if position else ""
    print(f"{separator}{label} - Columns and First Few Rows:")
    print(frame.columns)
    print(frame.head())


Training Data - Columns and First Few Rows:
Index(['x', 'y1', 'y2', 'y3', 'y4'], dtype='object')
      x         y1         y2         y3        y4
0 -20.0  39.778572 -40.078590 -20.214268 -0.324914
1 -19.9  39.604813 -39.784000 -20.070950 -0.058820
2 -19.8  40.099070 -40.018845 -19.906782 -0.451830
3 -19.7  40.151100 -39.518402 -19.389118 -0.612044
4 -19.6  39.795662 -39.360065 -19.815890 -0.306076

Test Data - Columns and First Few Rows:
Index(['x', 'y'], dtype='object')
      x          y
0  17.5  34.161040
1   0.3   1.215102
2  -8.7 -16.843908
3 -19.2 -37.170870
4 -11.0 -20.263054

Ideal Functions Data - Columns and First Few Rows:
Index(['x', 'y1', 'y2', 'y3', 'y4', 'y5', 'y6', 'y7', 'y8', 'y9', 'y10', 'y11',
       'y12', 'y13', 'y14', 'y15', 'y16', 'y17', 'y18', 'y19', 'y20', 'y21',
       'y22', 'y23', 'y24', 'y25', 'y26', 'y27', 'y28', 'y29', 'y30', 'y31',
       'y32', 'y33', 'y34', 'y35', 'y36', 'y37', 'y38', 'y39', 'y40', 'y41',
       'y42', 'y43', 'y44', 'y45', 'y46', 'y4

In [3]:
class DataProcessor:
    """Rank candidate "ideal" functions by their sum of squared errors.

    The ideal functions are columns of a data frame that also has an 'x'
    column. A candidate is scored by looking up, for every data point, the
    ideal y value at the same x and summing the squared differences.

    NOTE(review): ``choose_best_ideal_function`` scores candidates against
    ``test_data``; ``training_data`` is stored but not read by any method of
    this class. Confirm that selecting on the test set is intended.
    """

    def __init__(self, training_data, test_data, ideal_functions):
        """Keep references to the three data frames (no copy, no validation).

        Args:
            training_data: Frame with an 'x' column and y columns.
            test_data: Frame with 'x' and 'y' columns.
            ideal_functions: Frame with an 'x' column and one column per
                candidate ideal function.
        """
        self.training_data = training_data
        self.test_data = test_data
        self.ideal_functions = ideal_functions

    def least_squares_error(self, data, ideal_function_column, y_column=None):
        """Return the sum of squared errors between ``data`` and one ideal function.

        Args:
            data: Frame with an 'x' column and the observed y column.
            ideal_function_column: Two-column frame: 'x' plus the ideal
                function's values (the second column is used, whatever its
                name).
            y_column: Name of the observed y column in ``data``. When omitted,
                'y' is used if present, otherwise 'y1'.

        Returns:
            The accumulated squared error, or 0 when no x value of ``data``
            occurs in the ideal function. NaN values propagate into the result.

        Side effects:
            Prints one warning per data point whose x value has no exact
            match in the ideal function; such points are skipped.
        """
        if y_column is None:
            # Use 'y' for test data, 'y1' for training data
            y_column = 'y' if 'y' in data.columns else 'y1'

        ideal_y_name = ideal_function_column.columns[1]

        # Build the x -> ideal y lookup once instead of scanning the whole
        # ideal frame for every data row. setdefault keeps the first row for a
        # repeated x, matching a "first match wins" lookup.
        ideal_by_x = {}
        for x_value, ideal_y_value in zip(ideal_function_column['x'],
                                          ideal_function_column[ideal_y_name]):
            ideal_by_x.setdefault(x_value, ideal_y_value)

        total_error = 0
        for x_value, actual_y_value in zip(data['x'], data[y_column]):
            # Matching is by exact equality of x, so x grids must line up.
            if x_value not in ideal_by_x:
                print(f"Warning: x_value {x_value} not found in ideal function. Skipping this value.")
                continue

            total_error += (actual_y_value - ideal_by_x[x_value]) ** 2  # Calculate the squared error

        return total_error

    def choose_best_ideal_function(self, top_n=4):
        """Return the ``top_n`` ideal functions with the smallest error on the test data.

        Every column of ``self.ideal_functions`` other than 'x' is treated as
        a candidate, so the number of ideal functions is not hard-coded.

        Args:
            top_n: How many of the best candidates to return (default 4).
                ``None`` returns the full ranking.

        Returns:
            list[tuple]: ``(column_label, error)`` pairs sorted by ascending
            error; ties keep the column order of ``self.ideal_functions``.
        """
        candidate_columns = [label for label in self.ideal_functions.columns if label != 'x']

        chosen_functions = []
        for label in candidate_columns:
            ideal_function_column = self.ideal_functions[['x', label]]  # Get the x and candidate columns

            # Calculate the least squares error for this ideal function based on test data
            error = self.least_squares_error(self.test_data, ideal_function_column)
            chosen_functions.append((label, error))  # Store the function label and its error

        # Sort by error (stable) and keep the candidates with the least error
        chosen_functions.sort(key=lambda entry: entry[1])
        return chosen_functions[:top_n]


# Build the processor from the three frames loaded earlier in the notebook.
data_processor = DataProcessor(train_data, test_data, ideal_functions)

# Rank the ideal functions by squared error and keep the best four.
best_ideal_functions = data_processor.choose_best_ideal_function()

# Report each selected function label together with its error.
print("\nBest 4 Ideal Functions based on Least Squares Error:")
for ranked_entry in best_ideal_functions:
    func, err = ranked_entry
    print(f"{func}: {err}")


Best 4 Ideal Functions based on Least Squares Error:
y11: 39863.734507262954
y9: 41354.676560322
y2: 41937.21139680377
y50: 42000.62336956572
