In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import seaborn as sns

# Load the pre-cleaned dataset from the notebook's working directory.
cleaned_data = pd.read_csv('Cleaned_Plant_2_Data.csv')

# Separate the regression target from the remaining columns; every other
# column in the frame is used as a predictor.
target = 'AC_POWER'
y = cleaned_data[target]
X = cleaned_data.drop(columns=target)

# Two-stage split: hold out 20% of the rows, then halve that hold-out into
# validation and test sets (roughly 80/10/10 overall). Both stages use the
# same seed so the partition is reproducible across runs.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


# Fit a linear regression on the training split. fit() hands back the
# estimator, so construction and training are chained into one statement.
linear_model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split; the residual diagnostics and error
# metrics later in the notebook are all derived from these predictions.
y_pred = linear_model.predict(X_test)

# Test-set residuals (actual minus predicted); the diagnostic cells further
# down reuse this series.
residuals = y_test - y_pred

# Linearity check: draw one scatter plot per feature against the target.
# The points come from the full cleaned dataset, not only the test split.
for column in X.columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(cleaned_data[column], cleaned_data[target], alpha=0.5)
    ax.set_xlabel(column)
    ax.set_ylabel(target)
    ax.set_title(f'Scatter Plot of {column} vs {target}')
    plt.show()


In [None]:
# Homoscedasticity check: plot residuals against predicted values. The dashed
# red line marks zero error as a visual reference for the spread.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_pred, residuals, alpha=0.5)
ax.axhline(0, color='red', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Homoscedasticity Check: Residuals vs Predicted Values')
plt.show()

In [None]:
# Check randomness of the errors: report the average test-set residual,
# which is expected to sit near zero.
residuals_mean = np.mean(residuals)
print("Mean of Residuals (should be close to zero): {:.4f}".format(residuals_mean))

In [None]:
# Pairwise correlations across every column of the cleaned data (the target
# column is part of the frame, so it appears alongside the features),
# rendered as an annotated heatmap.
correlation_matrix = cleaned_data.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Error metrics for the test-split predictions.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line, rounded to four decimal places.
for label, value in (
    ("Mean Absolute Error", mae),
    ("Mean Squared Error", mse),
    ("R-squared", r2),
):
    print(f"{label}: {value:.4f}")