In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the livestock emissions table from a CSV file in the working directory.
data = pd.read_csv('faostat_livestock_emissions.csv')

# Print the first few rows as a quick sanity check of the columns and values.
preview = data.head()
print(preview)

# Define the response variable (Y) and the explanatory variables (X).
# NOTE: the column names below are placeholders; replace them with the actual
# column names from your dataset.
response_col = 'Methane_Emissions'
predictor_cols = ['Feed_Type', 'Environmental_Conditions', 'Milk_Production',
                  'Taste', 'Other_Factor1', 'Other_Factor2']

# Drop incomplete rows for the response and predictors together so that X and Y
# stay index-aligned. OLS is fitted below with its default missing-data setting,
# which performs no NaN checking of its own.
model_data = data[[response_col] + predictor_cols].dropna()
Y = model_data[response_col]

# One-hot encode any non-numeric predictors; numeric columns pass through
# unchanged. NOTE(review): 'Feed_Type' and 'Environmental_Conditions' look
# categorical -- confirm against the real data.
#   * drop_first=True removes one level per factor so the dummies are not
#     perfectly collinear with the intercept added below.
#   * dtype=float keeps the design matrix purely numeric (boolean dummies mixed
#     with float columns would otherwise convert to an object array).
X = pd.get_dummies(model_data[predictor_cols], drop_first=True, dtype=float)

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Fit the multiple linear regression by ordinary least squares on the
# training portion only.
ols_spec = sm.OLS(Y_train, X_train)
model = ols_spec.fit()

# Report the fitted model's summary table.
print(model.summary())

# Predict the response for the held-out rows.
Y_pred = model.predict(X_test)

# Score the predictions against the held-out targets and report both metrics.
mse, r2 = mean_squared_error(Y_test, Y_pred), r2_score(Y_test, Y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Visualize the results

# Actual vs predicted methane emissions. The dashed line is the y = x reference
# drawn across the observed range of the test targets.
fig_fit, ax_fit = plt.subplots(figsize=(10, 6))
y_lo, y_hi = Y_test.min(), Y_test.max()
ax_fit.scatter(Y_test, Y_pred, color='blue', edgecolor='k')
ax_fit.plot([y_lo, y_hi], [y_lo, y_hi], 'k--', lw=3)
ax_fit.set_xlabel('Actual Methane Emissions')
ax_fit.set_ylabel('Predicted Methane Emissions')
ax_fit.set_title('Actual vs Predicted Methane Emissions')
plt.show()

# Distribution of the test-set residuals (actual minus predicted), drawn as a
# histogram with a kernel density estimate overlaid.
residuals = Y_test - Y_pred
fig_res, ax_res = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, color='purple', ax=ax_res)
ax_res.set_xlabel('Residuals')
ax_res.set_ylabel('Frequency')
ax_res.set_title('Residuals Distribution')
plt.show()


ModuleNotFoundError: No module named 'statsmodels'