# In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Read the three CSV inputs from the current working directory
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
ideal = pd.read_csv("ideal.csv")

# Preview the first rows of each table under its own heading
for heading, frame in (
    ("Train data:", train),
    ("\nTest data:", test),
    ("\nIdeal data:", ideal),
):
    print(heading)
    print(frame.head())

# Draw one scatter figure per target column, each against the shared x column
y_columns = ['y1', 'y2', 'y3', 'y4']
x_values = train['x']
for target_col in y_columns:
    _, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x_values, train[target_col])
    ax.set(
        xlabel='x',
        ylabel=target_col,
        title=f'Relationship between x and {target_col}',
    )
    plt.show()

# Separate the single feature column (X) from the four target columns (y)
target_columns = ['y1', 'y2', 'y3', 'y4']
X = train.loc[:, ['x']]
y = train.loc[:, target_columns]

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and fit the linear regression model on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
linear_model = LinearRegression().fit(X_train, y_train)

# Predict the held-out validation rows and score them; with several target
# columns the reported MSE is a single value averaged over all of them
y_pred_linear = linear_model.predict(X_val)
mse_linear = mean_squared_error(y_true=y_val, y_pred=y_pred_linear)
print("Validation Mean Squared Error for Linear Regression: ", mse_linear)

# Build and fit the random forest regressor on the training split
# (seeded so repeated runs grow the same forest)
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out validation rows and score them; with several target
# columns the reported MSE is a single value averaged over all of them
y_pred_rf = rf_model.predict(X_val)
mse_rf = mean_squared_error(y_true=y_val, y_pred=y_pred_rf)
print("Validation Mean Squared Error for Random Forest Regression: ", mse_rf)

# Predict every target for the test inputs with both fitted models,
# selecting the feature frame once and reusing it
X_test = test[['x']]
y_test_pred_linear = linear_model.predict(X_test)
y_test_pred_rf = rf_model.predict(X_test)

# Plot Actual vs Predicted values on Validation data, one figure per model.
# Only the first target (y1) is shown; column 0 of each prediction array is y1
# because the models were fit on targets ordered ['y1', 'y2', 'y3', 'y4'].
actual_y1 = y_val['y1']
for predicted, plot_title in (
    (y_pred_linear, 'Actual vs Predicted values for Linear Regression (Validation data)'),
    (y_pred_rf, 'Actual vs Predicted values for Random Forest Regression (Validation data)'),
):
    plt.figure(figsize=(8, 6))
    plt.scatter(actual_y1, predicted[:, 0], color='blue', label='Predicted')
    # Actual plotted against itself gives the reference diagonal
    plt.scatter(actual_y1, actual_y1, color='red', label='Actual')
    plt.xlabel('Actual values')
    plt.ylabel('Predicted values')
    plt.title(plot_title)
    plt.legend()
    plt.show()

# Get the corresponding ideal values for the test data.
# Rows are matched on the shared 'x' column rather than by row position:
# slicing the first len(test) rows of `ideal` only lines up when both files
# list identical x values in identical order, and fails outright when `ideal`
# has fewer rows than `test`.
target_columns = ['y1', 'y2', 'y3', 'y4']

# Fail loudly instead of scoring against rows that do not belong together
unmatched = ~test['x'].isin(ideal['x'])
if unmatched.any():
    raise ValueError(
        f"{int(unmatched.sum())} test x value(s) have no matching row in the ideal data"
    )

# A left merge keeps one output row per test row, in test order, so the result
# stays aligned with the prediction arrays; validate= rejects duplicate x
# values in `ideal`, which would otherwise multiply rows.
ideal_values = test[['x']].merge(
    ideal[['x'] + target_columns],
    on='x',
    how='left',
    validate='many_to_one',
)[target_columns]

# Calculate the mean squared error between predictions and ideal values
# (a single value averaged over all four target columns)
mse_linear_ideal = mean_squared_error(ideal_values, y_test_pred_linear)
mse_rf_ideal = mean_squared_error(ideal_values, y_test_pred_rf)

print("Mean Squared Error for Linear Regression with Ideal values: ", mse_linear_ideal)
print("Mean Squared Error for Random Forest Regression with Ideal values: ", mse_rf_ideal)

# Plot Ideal vs Predicted values for the test data, one figure per model.
# All four target columns are passed to each scatter call at once.
for test_predictions, plot_title in (
    (y_test_pred_linear, 'Ideal vs Predicted values for Linear Regression'),
    (y_test_pred_rf, 'Ideal vs Predicted values for Random Forest Regression'),
):
    plt.figure(figsize=(8, 6))
    plt.scatter(ideal_values, test_predictions, color='blue', label='Predicted')
    # Ideal plotted against itself gives the reference diagonal
    plt.scatter(ideal_values, ideal_values, color='red', label='Actual')
    plt.xlabel('Ideal values')
    plt.ylabel('Predicted values')
    plt.title(plot_title)
    plt.legend()
    plt.show()


# Notebook output: FileNotFoundError: ignored