In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [9]:
data = pd.read_csv('ml_data_variable.csv')

# extract the independent variables (X) and dependent variable (y)
X = data[['COAL, Thousand Short Tons', 'NATURALGAS, Billion Cubic Feet', 'ELECTRICITY, Million Kilowatthours',
         'PETRO_INDUSTRIAL, Thousand Barrels per Day', 'PETRO_RESIDENTIAL_COMMERCIAL, Thousand Barrels per Day',
         'PETRO_TRANSPORTATION_ELECTRICPOWER, Thousand Barrels per Day']]
y = data['CO2, Million Metric Tons']

# add a constant term to the independent variables
X = sm.add_constant(X)

# fit the linear regression model
model = sm.OLS(y, X).fit()

# print the model summary
print(model.summary())


                               OLS Regression Results                               
Dep. Variable:     CO2, Million Metric Tons   R-squared:                       0.967
Model:                                  OLS   Adj. R-squared:                  0.967
Method:                       Least Squares   F-statistic:                     2917.
Date:                      Fri, 21 Apr 2023   Prob (F-statistic):               0.00
Time:                              01:28:02   Log-Likelihood:                -2166.0
No. Observations:                       600   AIC:                             4346.
Df Residuals:                           593   BIC:                             4377.
Df Model:                                 6                                         
Covariance Type:                  nonrobust                                         
                                                                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------

In [26]:

from sklearn.utils import shuffle

# Shuffle data
X, y = shuffle(X, y, random_state = 42)
# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

# Check the shape of the training and testing sets
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)


# Create an instance of the LinearRegression class
reg = LinearRegression()

# Fit the linear regression model to the training data
reg.fit(X_train, y_train)

# Use the predict method to make predictions on the testing data
y_pred = reg.predict(X_test)

print('=============================================')

from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the performance of the model using various metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Coefficients:', reg.coef_)
print('Intercept:', reg.intercept_)
print('Mean squared error:', mse)
print('R-squared:', r2)


X_train shape: (420, 7)
y_train shape: (420,)
X_test shape: (180, 7)
y_test shape: (180,)
Coefficients: [0.00000000e+00 2.46812250e-03 5.40107084e-02 2.22313653e-05
 1.75909775e-02 2.57854447e-02 1.71392480e-02]
Intercept: 45.736617737721076
Mean squared error: 75.95305816150588
R-squared: 0.9690328945320782
