<h1>palAI & mAIs: Comparative Analysis of Machine
Learning Algorithms for Forecasting Palay and
Corn Production in Region VI (Western Visayas)</h1>

<h3>CMSC 197 Mini Project</h3>
<h4>AI Powered-Team (Manejo, Pajarilla, Vito)</h4>

<h3>Table of Contents</h3>

<ul>
    <li>Data Exploration</li>
    <li>Preprocessing</li>
    <li>Implementation of Machine Learning Algorithms
        <ul>
		<li>Linear Regression</li>
            <li>Random Forest Regression</li>
            <li>Support Vector Regression</li>
            <li> K-Nearest Neighbor (KNN)</li>
            <li>XGBoost</li>
            <li>Artificial Neural Network (ANN)</li>
        </ul>
    </li>
</ul>

<hr>
<h2> Preprocessing</h2>
<hr>

<h4> Importing Libraries </h4>

In [1]:
# standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### machine learning algorithms ####
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
# from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor

# for ANN
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.optimizers import Adam

# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

#### metrics of evaluation ####
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

### time series ###
from statsmodels.tsa.seasonal import seasonal_decompose

<h4> Load production volume and area harvested dataset </h4>

In [2]:
# Both CSV files are read from the same folder of the project's GitHub repository.
DATASET_BASE_URL = "https://raw.githubusercontent.com/kazeulo/Project_197/main/dataset"

production_data = pd.read_csv(f"{DATASET_BASE_URL}/Production_volume.csv")
area_data = pd.read_csv(f"{DATASET_BASE_URL}/Area_harvested.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/Production_volume.csv'

In [None]:
production_data.head(5)

In [None]:
area_data.head(5)

<h4>Transforming and merging data</h4>

In [None]:
def _to_long(wide, value_name):
    """Reshape a wide table to long form.

    Every column other than 'Croptype' and 'Province' becomes one row whose
    former column label is stored in 'Quarter and Year' and whose cell value
    is stored in `value_name`. Missing 'Croptype' entries are then
    forward-filled from the previous row.
    """
    long_frame = pd.melt(
        wide,
        id_vars=['Croptype', 'Province'],
        var_name='Quarter and Year',
        value_name=value_name,
    )
    # fill values for croptype
    long_frame['Croptype'] = long_frame['Croptype'].ffill()
    return long_frame


production_data = _to_long(production_data, 'Production')
area_data = _to_long(area_data, 'Area')

# Left join: keep every production row and attach the matching area value.
merge_keys = ['Croptype', 'Province', 'Quarter and Year']
dataset = production_data.merge(area_data, on=merge_keys, how='left')

In [None]:
dataset.head(5)

In [None]:
# split year and quarter into separate columns:
# the year is the last space-separated token of the label, the quarter is
# the last character of the second token
period_labels = dataset['Quarter and Year']
dataset['Year'] = [int(label.split(' ')[-1]) for label in period_labels]
dataset['Quarter'] = [int(label.split(' ')[1][-1]) for label in period_labels]

# keep the columns in their final order; 'Quarter and Year' is dropped by
# simply not selecting it
ordered_columns = ['Croptype', 'Province', 'Quarter', 'Year', 'Area', 'Production']
dataset = dataset[ordered_columns]

In [None]:
dataset.head(5)

In [None]:
rows, columns = dataset.shape

# Print the number of rows
print("Number of rows:", rows)

<h4>Create Date column</h4>

In [None]:
# Each quarter is dated by the first day of its first month.
quarter_to_month = {1: 1, 2: 4, 3: 7, 4: 10}

# Create the 'Date' column from explicit year/month/day components.
# Gluing the unpadded month into a '%Y%m%d' string is ambiguous: for Q1 it
# yields e.g. '1987101', which strptime-style parsing reads as month 10,
# day 1, so first-quarter rows would be dated October like fourth-quarter ones.
dataset['Date'] = pd.to_datetime(pd.DataFrame({
    'year': dataset['Year'],
    'month': dataset['Quarter'].map(quarter_to_month),
    'day': 1,
}))

# Reorder columns so 'Date' is the leftmost column
dataset = dataset[['Date'] + [col for col in dataset.columns if col != 'Date']]

In [None]:
dataset.head(5)

Since Guimaras was included in Iloilo prior to 1994, we'll drop the data for those years to ensure accuracy.

In [None]:
# Drop the Guimaras and Iloilo rows for years up to and including 1994.
# NOTE(review): the accompanying text says "prior to 1994", but this filter
# also removes 1994 itself -- confirm which cutoff is intended.
is_affected_province = dataset['Province'].isin(['Guimaras', 'Iloilo'])
is_early_year = dataset['Year'] <= 1994
dataset = dataset[~(is_affected_province & is_early_year)]

In [None]:
dataset.hist(figsize = (10, 10))

In [None]:
# descriptive statistics
desc_stats = dataset.describe()

print("Descriptive Statistics:")
desc_stats

In [None]:
# frequency distribution of categorical variables
print(dataset['Croptype'].value_counts())
print(dataset['Province'].value_counts())

In [None]:
# check for zero values in the entire DataFrame
print((dataset == 0).sum())

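Handling zero values. Zeros are treated as missing values and the affected rows are dropped. Median imputation (the median is less sensitive to outliers than the mean) is kept below, commented out, as an alternative.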

In [None]:
# replace 0 with NaN
# Assign the result instead of using inplace=True: `dataset` was produced by
# boolean filtering in an earlier cell, and modifying such a frame in place
# can trigger SettingWithCopyWarning.
dataset = dataset.replace(0, np.nan)

# imputer = SimpleImputer(strategy='median')
# dataset[['Production', 'Area']] = imputer.fit_transform(dataset[['Production', 'Area']])

In [None]:
# drop rows that contains Nan
dataset = dataset.dropna()

In [None]:
rows, columns = dataset.shape

# Print the number of rows
print("Number of rows:", rows)

<b>Remove outliers.</b>

In [None]:
# First and third quartile of Production, and the interquartile range
# between them (used by the outlier filter in the next cell).
production_quartiles = dataset['Production'].quantile([0.25, 0.75])
Q1 = production_quartiles.loc[0.25]
Q3 = production_quartiles.loc[0.75]
IQR = Q3 - Q1

In [None]:
# Keep rows whose Production lies within 1.5 * IQR of the quartiles
# (both fences inclusive).
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
dataset = dataset[dataset['Production'].between(lower_fence, upper_fence)]

In [None]:
rows, columns = dataset.shape

# Print the number of rows
print("Number of rows:", rows)

In [None]:
# visualizing the trend of 'Production' over time
# Plotting the raw column would use the row index as the x-axis even though
# the axis is labelled 'Date'. Total the production of all provinces and
# crop types per date so the x-axis really is time.
production_by_date = dataset.groupby('Date')['Production'].sum()

fig, ax = plt.subplots(figsize=(10, 4))
production_by_date.plot(ax=ax, title="Volume of Production (tons) over Time", linewidth=2)
ax.set_xlabel('Date')
ax.set_ylabel('Production')
ax.grid(True)
plt.show()

In [None]:
# Area (top) and Production (bottom) over the years, one line per province
fig, axes = plt.subplots(2, 1, figsize=(11, 6))

for ax, measure in zip(axes, ['Area', 'Production']):
    sns.lineplot(data=dataset, x='Year', y=measure, hue='Province', marker='o', ax=ax)
    ax.set_title(f'{measure} Over Time by Province')
    ax.set_xlabel('Year')
    ax.set_ylabel(measure)

fig.tight_layout()
plt.show()

In [None]:
# total Area and Production per (Year, Quarter) to expose seasonal patterns
quarterly_data = dataset.groupby(['Year', 'Quarter'], as_index=False)[['Area', 'Production']].sum()

# one panel per measure, one line per quarter across the years
fig, axes = plt.subplots(2, 1, figsize=(11, 6))

for ax, measure in zip(axes, ['Area', 'Production']):
    sns.lineplot(data=quarterly_data, x='Year', y=measure, hue='Quarter', marker='o', ax=ax)
    ax.set_title(f'{measure} by Quarter Across Years')
    ax.set_xlabel('Year')
    ax.set_ylabel(measure)

fig.tight_layout()
plt.show()

In [None]:
# visualizing production across Provinces with Croptype
fig, ax = plt.subplots(figsize=(10, 4))

sns.barplot(data=dataset, x='Province', y='Production', hue='Croptype', ax=ax)

# titles and labels
ax.set_title('Production by Province and Croptype')
ax.set_xlabel('Province')
ax.set_ylabel('Production (tons)')

fig.tight_layout()
plt.show()

In [None]:
# pairwise correlation between the numeric columns
numeric_columns = ['Area', 'Production', 'Quarter', 'Year']
corr = dataset[numeric_columns].corr()
print(corr)

# visualize the correlation matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix between Area, Production, Quarter, and Year')
plt.show()

In [None]:
# Boxplots of Production, grouped by crop type and by province.
# A single row of two panels is used: the previous 2x2 grid only filled its
# top row and left the bottom half of the figure empty.
fig, (ax_croptype, ax_province) = plt.subplots(1, 2, figsize=(14, 4))

# Production by Croptype
sns.boxplot(x='Croptype', y='Production', data=dataset, ax=ax_croptype)
ax_croptype.set_title('Boxplot of Production by Croptype')

# Production by Province
sns.boxplot(x='Province', y='Production', data=dataset, ax=ax_province)
ax_province.set_title('Boxplot of Production by Province')

fig.tight_layout()
plt.show()

In [None]:
# Total production and area for every (Croptype, Province, Quarter, Year) group.
group_keys = ['Croptype', 'Province', 'Quarter', 'Year']
aggregated_data = (
    dataset.groupby(group_keys)[['Production', 'Area']]
    .sum()
    .rename(columns={'Production': 'total_production', 'Area': 'total_area'})
    .reset_index()
)

<h4> Feature Engineering</h4>

In [None]:
dataset = dataset.sort_values(by='Date', ascending=True)

In [None]:
# # Seasonal features
# dataset['Quarter_sin'] = np.sin(2 * np.pi * dataset['Quarter'] / 4)
# dataset['Quarter_cos'] = np.cos(2 * np.pi * dataset['Quarter'] / 4)

In [None]:
# # Create lag features
# dataset['Lag_1'] = dataset.groupby(['Province', 'Croptype', 'Quarter'])['Production'].shift(1)
# dataset['Lag_2'] = dataset.groupby(['Province', 'Croptype', 'Quarter'])['Production'].shift(2)
# dataset['Lag_3'] = dataset.groupby(['Province', 'Croptype', 'Quarter'])['Production'].shift(3)
# dataset['Lag_4'] = dataset.groupby(['Province', 'Croptype', 'Quarter'])['Production'].shift(4)

In [None]:
# # Create rolling mean features
# dataset['Rolling_Mean_1'] = dataset['Production'].rolling(window=2).mean()
# dataset['Rolling_Mean_4'] = dataset['Production'].rolling(window=4).mean()
# dataset['Rolling_Mean_2'] = dataset['Production'].rolling(window=2).mean()

In [None]:
dataset.head(5)

In [None]:
# encoder = LabelEncoder()

# # Label encode Croptype and Province
# dataset['Croptype'] = encoder.fit_transform(dataset['Croptype'])
# dataset['Province'] = encoder.fit_transform(dataset['Province'])

# One-hot encode the two categorical columns; the encoder is configured to
# return a dense pandas DataFrame.
categorical_columns = ['Croptype', 'Province']
encoding = OneHotEncoder(sparse_output=False).set_output(transform='pandas')
encode_data = encoding.fit_transform(dataset[categorical_columns])

# Swap the raw categorical columns for their one-hot columns. Both frames get
# a fresh positional index so the column-wise concat pairs rows by position.
dataset = pd.concat(
    [
        dataset.drop(columns=categorical_columns).reset_index(drop=True),
        encode_data.reset_index(drop=True),
    ],
    axis=1,
)

In [None]:
# Log-transform the target as log(1 + Production). np.log1p computes this
# directly and stays accurate for small values.
dataset['Log_Production'] = np.log1p(dataset['Production'])

In [None]:
dataset.head(5)

<h4>Split train/test</h4>

In [None]:
# initialize the scaler
scaler = StandardScaler()

# features: everything except the raw target, the log target and the date
X = dataset.drop(['Production', 'Log_Production', 'Date'], axis=1)
y = dataset['Log_Production']

# split into train and test (70% train, 30% test) without shuffling, so the
# first rows form the training set and the last rows form the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# Take the boundary from the split that was actually made. train_test_split
# rounds the test share up, so int(0.7 * len(dataset)) can be off by one row
# because of floating-point truncation.
train_size = len(X_train)

# fit the scaler on the training data and transform X_train
X_train_scaled = scaler.fit_transform(X_train)

# transform X_test using the same scaler (do not fit again)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Show which part of the production series is used for training and testing
fig, ax = plt.subplots(figsize=(10, 6))

# first `train_size` rows are the training data, the remainder the testing data
segments = [
    ('Training Data', 'blue', slice(None, train_size)),
    ('Testing Data', 'red', slice(train_size, None)),
]
for segment_label, segment_color, row_slice in segments:
    ax.plot(
        dataset['Date'][row_slice],
        dataset['Production'][row_slice],
        label=segment_label,
        color=segment_color,
    )

# labels, title and legend
ax.set_xlabel('Date')
ax.set_ylabel('Production')
ax.set_title('Time Series Data Split: Training vs Testing')
ax.legend()
plt.xticks(rotation=45)  # Rotate x-axis labels for readability

fig.tight_layout()
plt.show()

<hr>
<h2> Implementing Machine Learning Algorithms</h2>
<hr>

<h4>Extreme Gradient Boosting (XGBoost)</h4>

In [None]:
# perform grid search over the XGBoost hyperparameters

param_grid = {
     'learning_rate': [0.01, 0.1, 0.2],
     'max_depth': [3, 4, 5],
     'n_estimators': [100, 200, 500],
     'subsample': [0.8, 1.0],
     'reg_alpha': [0.01, 0.1, 0.2],
     'reg_lambda': [0.01, 0.1, 0.2]
}

# The training rows are in chronological order (sorted by Date and split
# without shuffling), so each validation fold should come after the rows it
# is trained on. Plain K-fold would also validate on data older than the
# training data, which does not reflect forecasting.
grid_search = GridSearchCV(
    estimator=xgb.XGBRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=TimeSeriesSplit(n_splits=3),
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Best parameters found by GridSearchCV
print(grid_search.best_params_)

# Use the best model
model_xgb = grid_search.best_estimator_

In [None]:
# make predictions
y_pred = model_xgb.predict(X_test_scaled)

In [None]:
y_pred_original = np.exp(y_pred) - 1
y_test_original = np.exp(y_test) - 1

In [None]:
results = pd.DataFrame({'Actual': y_test_original, 'Predicted': y_pred_original})
results.head(5)

In [None]:
# Performance on the test data, measured in original (back-transformed) units.
# Despite the `_train` suffix, every value is computed from the test arrays.
abs_pct_error = np.abs((y_test_original - y_pred_original) / y_test_original) * 100
mae_train = mean_absolute_error(y_test_original, y_pred_original)
mse_train = mean_squared_error(y_test_original, y_pred_original)
r2_train = r2_score(y_test_original, y_pred_original)
mape = np.mean(abs_pct_error)

# One row labelled with the model name, one column per metric
performance_xgb = pd.DataFrame(
    [[mae_train, mse_train, mape, r2_train]],
    index=['XG Boost'],
    columns=pd.Index(['MAE', 'MSE', 'MAPE', 'R²'], name='Model'),
)

# Display the DataFrame
performance_xgb

In [None]:
# import matplotlib.pyplot as plt

# # Plot feature importance
# xgb.plot_importance(model_xgb)
# plt.show()

In [None]:
# 6. Actual vs predicted production on the test rows
fig, ax = plt.subplots(figsize=(10, 6))
test_index = y_test_original.index
ax.plot(test_index, y_test_original, label='Actual', color='blue')
ax.plot(test_index, y_pred_original, label='Predicted', color='red', linestyle='--')
ax.set_xlabel('Index')
ax.set_ylabel('Production')
ax.legend()
ax.set_title('Actual vs Predicted Production')
plt.show()

<h4>Random Forest Regression</h4>

In [None]:
# model_rf = RandomForestRegressor(n_estimators=1000, max_features=4, random_state=1)
# model_rf.fit(X_train_scaled, y_train)

In [None]:
param_grid = {
     'n_estimators': [100, 200, 500],
     'max_depth': [3, 4, 5, 6],
     'min_samples_split': [2, 5, 10],
     'min_samples_leaf': [1, 2, 4],
     # None already means "use all features"; 1.0 selects the same setting,
     # so listing both only repeated the same configuration.
     'max_features': ['sqrt', 'log2', None, 0.5],
     'bootstrap': [True, False]
}

# Perform Grid Search with RandomForestRegressor.
# - random_state is fixed so that re-running the notebook selects the same
#   model and reports the same scores.
# - The training rows are in chronological order (sorted by Date and split
#   without shuffling), so TimeSeriesSplit is used: every validation fold
#   comes after the rows it is trained on.
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=TimeSeriesSplit(n_splits=3),
    n_jobs=-1,
)
grid_search_rf.fit(X_train_scaled, y_train)

# Best parameters found by GridSearchCV
print(grid_search_rf.best_params_)

# Use the best model
model_rf = grid_search_rf.best_estimator_

In [None]:
# make predictions
y_pred = model_rf.predict(X_test_scaled)

In [None]:
y_pred_original = np.exp(y_pred) - 1
y_test_original = np.exp(y_test) - 1

In [None]:
results = pd.DataFrame({'Actual': y_test_original, 'Predicted': y_pred_original})
results.head(5)

In [None]:
# Performance on the test data, measured in original (back-transformed) units.
# Despite the `_train` suffix, every value is computed from the test arrays.
abs_pct_error = np.abs((y_test_original - y_pred_original) / y_test_original) * 100
mae_train = mean_absolute_error(y_test_original, y_pred_original)
mse_train = mean_squared_error(y_test_original, y_pred_original)
r2_train = r2_score(y_test_original, y_pred_original)
mape = np.mean(abs_pct_error)

# One row labelled with the model name, one column per metric
performance_rf = pd.DataFrame(
    [[mae_train, mse_train, mape, r2_train]],
    index=['Random Forest'],
    columns=pd.Index(['MAE', 'MSE', 'MAPE', 'R²'], name='Model'),
)

# Display the DataFrame
performance_rf

In [None]:
# 6. Actual vs predicted production on the test rows
fig, ax = plt.subplots(figsize=(10, 6))
test_index = y_test_original.index
ax.plot(test_index, y_test_original, label='Actual', color='blue')
ax.plot(test_index, y_pred_original, label='Predicted', color='red', linestyle='--')
ax.set_xlabel('Index')
ax.set_ylabel('Production')
ax.legend()
ax.set_title('Actual vs Predicted Production')
plt.show()

<h4>Linear Regression</h4>

In [None]:
# from sklearn.linear_model import Ridge

# ridge = Ridge (alpha=1, solver='saga', tol=0.001)
# ridge.fit(X_train_scaled, y_train)

# Plain linear regression with scikit-learn's default settings
# (the Ridge variant above is kept commented out).
linear_model = LinearRegression()

# Fit the model on the training data
# (scaled features; the target y_train is the log-transformed production)
linear_model.fit(X_train_scaled, y_train)

In [None]:
# make predictions
y_pred = linear_model.predict(X_test_scaled)
# y_pred = ridge.predict(X_test_scaled)

In [None]:
y_pred_original = np.exp(y_pred) - 1
y_test_original = np.exp(y_test) - 1

In [None]:
results = pd.DataFrame({'Actual': y_test_original, 'Predicted': y_pred_original})
results.head(5)

In [None]:
# Performance on the test data, measured in original (back-transformed) units.
# Despite the `_train` suffix, every value is computed from the test arrays.
abs_pct_error = np.abs((y_test_original - y_pred_original) / y_test_original) * 100
mae_train = mean_absolute_error(y_test_original, y_pred_original)
mse_train = mean_squared_error(y_test_original, y_pred_original)
r2_train = r2_score(y_test_original, y_pred_original)
mape = np.mean(abs_pct_error)

# One row labelled with the model name, one column per metric
performance_lr = pd.DataFrame(
    [[mae_train, mse_train, mape, r2_train]],
    index=['Linear Regression'],
    columns=pd.Index(['MAE', 'MSE', 'MAPE', 'R²'], name='Model'),
)

# Display the DataFrame
performance_lr

In [None]:
# 6. Actual vs predicted production on the test rows
fig, ax = plt.subplots(figsize=(10, 6))
test_index = y_test_original.index
ax.plot(test_index, y_test_original, label='Actual', color='blue')
ax.plot(test_index, y_pred_original, label='Predicted', color='red', linestyle='--')
ax.set_xlabel('Index')
ax.set_ylabel('Production')
ax.legend()
ax.set_title('Actual vs Predicted Production')
plt.show()

<h4>K-Nearest Neighbor</h4>

In [None]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# The training rows are in chronological order (sorted by Date and split
# without shuffling), so TimeSeriesSplit is used: every validation fold comes
# after the rows it is trained on, as in actual forecasting.
grid_search_knn = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=TimeSeriesSplit(n_splits=3),
    n_jobs=-1,
)
grid_search_knn.fit(X_train_scaled, y_train)

# Best parameters found by GridSearchCV
print("Best Parameters:", grid_search_knn.best_params_)

# Use the best model
model_knn = grid_search_knn.best_estimator_

In [None]:
# model_knn = KNeighborsRegressor(n_neighbors=20)
# Manually configured KNN (7 neighbours), separate from the grid search above.
# NOTE(review): this model is fit on the unscaled X_train, whereas the grid
# search was fit on X_train_scaled -- confirm which one is intended.
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(X_train, y_train)

In [None]:
# make predictions with the tuned model selected by the grid search.
# KNN is distance based, so it has to see test features on the same scale as
# the features it was trained on: model_knn was fit on X_train_scaled, hence
# X_test_scaled here (predicting from the raw X_test would compare
# differently scaled distances).
predictions = model_knn.predict(X_test_scaled)
y_pred = predictions

In [None]:
y_pred_original = np.exp(y_pred) - 1
y_test_original = np.exp(y_test) - 1

In [None]:
results = pd.DataFrame({'Actual': y_test_original, 'Predicted': y_pred_original})
results.head(5)

In [None]:
# Performance on the test data, measured in original (back-transformed) units.
# Despite the `_train` suffix, every value is computed from the test arrays.
abs_pct_error = np.abs((y_test_original - y_pred_original) / y_test_original) * 100
mae_train = mean_absolute_error(y_test_original, y_pred_original)
mse_train = mean_squared_error(y_test_original, y_pred_original)
r2_train = r2_score(y_test_original, y_pred_original)
mape = np.mean(abs_pct_error)

# One row labelled with the model name, one column per metric
performance_knn = pd.DataFrame(
    [[mae_train, mse_train, mape, r2_train]],
    index=['K-Nearest Neighbor'],
    columns=pd.Index(['MAE', 'MSE', 'MAPE', 'R²'], name='Model'),
)

# Display the DataFrame
performance_knn

In [None]:
# 6. Actual vs predicted production on the test rows
fig, ax = plt.subplots(figsize=(10, 6))
test_index = y_test_original.index
ax.plot(test_index, y_test_original, label='Actual', color='blue')
ax.plot(test_index, y_pred_original, label='Predicted', color='red', linestyle='--')
ax.set_xlabel('Index')
ax.set_ylabel('Production')
ax.legend()
ax.set_title('Actual vs Predicted Production')
plt.show()

<h4>Support Vector Regression</h4>

In [None]:
# SVR is sensitive to feature scale, but this section works with the raw
# X_train / X_test. Putting the scaler and the SVR into one pipeline scales
# the features inside the model itself, so the later
# `model_svr.predict(X_test)` call stays valid, and the scaler is re-fit on
# the training part of every cross-validation fold.
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR()),
])

# Hyperparameters of the 'svr' pipeline step
param_grid = {
     'svr__C': [1, 10, 50, 100],
     'svr__epsilon': [0.01, 0.1, 0.2, 0.5],
     'svr__kernel': ['rbf', 'linear', 'poly']
}

# Create a GridSearchCV object to search for the best hyperparameters.
# The training rows are in chronological order (sorted by Date and split
# without shuffling), so TimeSeriesSplit is used: every validation fold comes
# after the rows it is trained on.
grid_search = GridSearchCV(
    estimator=svr_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=TimeSeriesSplit(n_splits=3),
    n_jobs=-1,
)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Best parameter combination found by GridSearchCV
print("Best parameters:", grid_search.best_params_)

# Use the best model found by GridSearchCV (scaler + SVR pipeline)
model_svr = grid_search.best_estimator_

In [None]:
# model_svr = SVR(kernel='rbf', C=100, epsilon=0.01)

# model_svr.fit(X_train, y_train)

In [None]:
y_pred = model_svr.predict(X_test)

In [None]:
y_pred_original = np.exp(y_pred) - 1
y_test_original = np.exp(y_test) - 1

In [None]:
results = pd.DataFrame({'Actual': y_test_original, 'Predicted': y_pred_original})
results.head(5)

In [None]:
# Performance on the test data, measured in original (back-transformed) units.
# Despite the `_train` suffix, every value is computed from the test arrays.
abs_pct_error = np.abs((y_test_original - y_pred_original) / y_test_original) * 100
mae_train = mean_absolute_error(y_test_original, y_pred_original)
mse_train = mean_squared_error(y_test_original, y_pred_original)
r2_train = r2_score(y_test_original, y_pred_original)
mape = np.mean(abs_pct_error)

# One row labelled with the model name, one column per metric
performance_svr = pd.DataFrame(
    [[mae_train, mse_train, mape, r2_train]],
    index=['Support Vector Regression'],
    columns=pd.Index(['MAE', 'MSE', 'MAPE', 'R²'], name='Model'),
)

# Display the DataFrame
performance_svr

In [None]:
# 6. Actual vs predicted production on the test rows
fig, ax = plt.subplots(figsize=(10, 6))
test_index = y_test_original.index
ax.plot(test_index, y_test_original, label='Actual', color='blue')
ax.plot(test_index, y_pred_original, label='Predicted', color='red', linestyle='--')
ax.set_xlabel('Index')
ax.set_ylabel('Production')
ax.legend()
ax.set_title('Actual vs Predicted Production')
plt.show()

<h4>Artificial Neural Network</h4>

In [None]:
# from tensorflow.keras.models import Sequential
# # Build the Artificial Neural Network (ANN) model
# model_ann = Sequential()

In [None]:
# # Input layer (first hidden layer) with 64 neurons and ReLU activation
# model_ann.add(Dense(units=64, input_dim=X_train.shape[1], activation='relu'))

# # Add a second hidden layer with 32 neurons and ReLU activation
# model_ann.add(Dense(units=32, activation='relu'))

# # Output layer with 1 neuron (for regression task)
# model_ann.add(Dense(units=1))

# # Compile the model with Mean Squared Error loss function and Adam optimizer
# model_ann.compile(optimizer=Adam(), loss='mean_squared_error')

# # Fit the model on the training data (training the neural network)
# history = model_ann.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test), verbose=1)

In [None]:
# y_pred = model_ann.predict(X_test)

In [None]:
# y_pred_original = np.exp(y_pred) - 1
# y_test_original = np.exp(y_test) - 1

In [None]:
# results = pd.DataFrame({'Actual': y_test_original, 'Predicted': y_pred_original})
# results.head(5)

In [None]:
# # Calculate performance metrics on the test data
# mae_train = mean_absolute_error(y_test_original, y_pred_original)
# mse_train = mean_squared_error(y_test_original, y_pred_original)
# r2_train = r2_score(y_test_original, y_pred_original)
# mape = np.mean(np.abs((y_test_original - y_pred_original) / y_test_original) * 100)

# # Create a DataFrame to store the results
# performance_ann = pd.DataFrame({
#     'Model': ['MAE', 'MSE', 'MAPE', 'R²'],
#     'Linear Regression': [mae_train, mse_train, mape, r2_train]
# })

# # Transpose to make the performance horizontal
# performance_ann = performance_ann.set_index('Model').T

# # Display the DataFrame
# performance_ann

In [None]:
# # 6. Plot actual vs predicted values
# plt.figure(figsize=(10, 6))
# plt.plot(y_test_original.index, y_test_original, label='Actual', color='blue')
# plt.plot(y_test_original.index, y_pred_original, label='Predicted', color='red', linestyle='--')
# plt.xlabel('Index')
# plt.ylabel('Production')
# plt.legend()
# plt.title('Actual vs Predicted Production')
# plt.show()

<h4>Comparison of Model Performance</h4>

In [None]:
merged_performance = pd.concat([performance_xgb, performance_rf, performance_lr, performance_svr, performance_knn], axis=0)

In [None]:
merged_performance