# Simple linear regression using  stats model without transformation

In [1]:


import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

data = pd.read_csv('delivery_time.csv')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Sorting Time'], data['Delivery Time'], test_size=0.2, random_state=42)

# Fit a linear regression model to the training data
X_train = sm.add_constant(X_train)  # add a constant term to the model if not added it the constant coefficient is empty in the summary
model = sm.OLS(y_train, X_train)
results = model.fit()

# Print the model summary
print(results.summary())

# Make predictions on the testing set
X_test = sm.add_constant(X_test)
y_pred = results.predict(X_test)

# Evaluate the performance of the model on the testing set
print('R-squared:', r2_score(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
X_train


                            OLS Regression Results                            
Dep. Variable:          Delivery Time   R-squared:                       0.760
Model:                            OLS   Adj. R-squared:                  0.742
Method:                 Least Squares   F-statistic:                     44.22
Date:                Sat, 15 Apr 2023   Prob (F-statistic):           1.10e-05
Time:                        14:20:35   Log-Likelihood:                -38.507
No. Observations:                  16   AIC:                             81.01
Df Residuals:                      14   BIC:                             82.56
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            4.6823      1.945      2.408   



Unnamed: 0,const,Sorting Time
5,1.0,6
11,1.0,4
3,1.0,9
18,1.0,2
16,1.0,6
13,1.0,3
2,1.0,6
9,1.0,9
20,1.0,5
4,1.0,10


# Model with Apply log transformation to both variables

In [13]:
# Apply log transformation to both variables
data['Delivery Time'] = np.log(data['Delivery Time'])
data['Sorting Time'] = np.log(data['Sorting Time'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Sorting Time'], data['Delivery Time'], test_size=0.2, random_state=42)

# Fit a linear regression model to the training data
X_train = sm.add_constant(X_train)  # add a constant term to the model
model = sm.OLS(y_train, X_train)
results = model.fit()

# Print the model summary
print(results.summary())

# Make predictions on the testing set
X_test = sm.add_constant(X_test)
y_pred = results.predict(X_test)

# Apply the inverse log transformation to the predicted values
y_pred = np.exp(y_pred)

# Evaluate the performance of the model on the testing set
print('R-squared:', r2_score(np.exp(y_test), y_pred))
print('RMSE:', np.sqrt(mean_squared_error(np.exp(y_test), y_pred)))


                            OLS Regression Results                            
Dep. Variable:          Delivery Time   R-squared:                       0.839
Model:                            OLS   Adj. R-squared:                  0.828
Method:                 Least Squares   F-statistic:                     73.14
Date:                Wed, 12 Apr 2023   Prob (F-statistic):           6.24e-07
Time:                        14:32:07   Log-Likelihood:                 24.062
No. Observations:                  16   AIC:                            -44.12
Df Residuals:                      14   BIC:                            -42.58
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -0.1813      0.025     -7.131   



# Apply log transformation to the response variable

In [16]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error



# Transform the response variable
data['Delivery Time'] = np.log(data['Delivery Time'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Sorting Time'], data['Delivery Time'], test_size=0.2, random_state=42)

# Fit a linear regression model to the training data
X_train = sm.add_constant(X_train)  # add a constant term to the model
model = sm.OLS(y_train, X_train)
results = model.fit()

# Print the model summary
print(results.summary())

# Make predictions on the testing set
X_test = sm.add_constant(X_test)
y_pred = results.predict(X_test)

# Transform the predicted values back to the original scale
y_pred = np.exp(y_pred)

# Evaluate the performance of the model on the testing set
print('R-squared:', r2_score(np.exp(y_test), y_pred))
print('RMSE:', np.sqrt(mean_squared_error(np.exp(y_test), y_pred)))


                            OLS Regression Results                            
Dep. Variable:          Delivery Time   R-squared:                       0.783
Model:                            OLS   Adj. R-squared:                  0.768
Method:                 Least Squares   F-statistic:                     50.57
Date:                Wed, 12 Apr 2023   Prob (F-statistic):           5.24e-06
Time:                        14:43:58   Log-Likelihood:                 22.343
No. Observations:                  16   AIC:                            -40.69
Df Residuals:                      14   BIC:                            -39.14
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.7201      0.043     16.606   



# Apply log transformation to the independent variable

In [18]:


# Apply log transformation to the independent variable
data['Sorting Time'] = np.log(data['Sorting Time'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Sorting Time'], data['Delivery Time'], test_size=0.2, random_state=42)

# Fit a linear regression model to the training data
X_train = sm.add_constant(X_train)  # add a constant term to the model
model = sm.OLS(y_train, X_train)
results = model.fit()

# Print the model summary
print(results.summary())

# Make predictions on the testing set
X_test = sm.add_constant(X_test)
y_pred = results.predict(X_test)

# Evaluate the performance of the model on the testing set
print('R-squared:', r2_score(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))


                            OLS Regression Results                            
Dep. Variable:          Delivery Time   R-squared:                       0.823
Model:                            OLS   Adj. R-squared:                  0.811
Method:                 Least Squares   F-statistic:                     65.16
Date:                Wed, 12 Apr 2023   Prob (F-statistic):           1.23e-06
Time:                        15:00:55   Log-Likelihood:                 23.973
No. Observations:                  16   AIC:                            -43.95
Df Residuals:                      14   BIC:                            -42.40
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.5807      0.055     10.611   



# Apply square transformation to the response variable

In [17]:
# Apply square transformation to the response variable

y_train_square = np.square(y_train)
model_square = sm.OLS(y_train_square, X_train)
results_square = model_square.fit()
print(results_square.summary())
y_test_square = np.square(y_test)
y_pred_square = results_square.predict(X_test)
print('R-squared (square):', r2_score(y_test_square, y_pred_square))
print('RMSE (square):', np.sqrt(mean_squared_error(y_test_square, y_pred_square)))




                            OLS Regression Results                            
Dep. Variable:          Delivery Time   R-squared:                       0.787
Model:                            OLS   Adj. R-squared:                  0.772
Method:                 Least Squares   F-statistic:                     51.79
Date:                Wed, 12 Apr 2023   Prob (F-statistic):           4.59e-06
Time:                        14:45:26   Log-Likelihood:                 11.790
No. Observations:                  16   AIC:                            -19.58
Df Residuals:                      14   BIC:                            -18.03
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.4691      0.084      5.594   



# Apply square root transformation to the response variable

In [None]:
# Apply square root transformation to the response variable
y_train_sqrt = np.sqrt(y_train)
model_sqrt = sm.OLS(y_train_sqrt, X_train)
results_sqrt = model_sqrt.fit()
y_test_sqrt = np.sqrt(y_test)
y_pred_sqrt = results_sqrt.predict(X_test)
print('R-squared (sqrt):', r2_score(y_test_sqrt, y_pred_sqrt))
print('RMSE (sqrt):', np.sqrt(mean_squared_error(y_test_sqrt, y_pred_sqrt)))

reference for negative r2 why is it negative and how to interpret
https://help.desmos.com/hc/en-us/articles/202529139-Why-am-I-seeing-a-negative-R-2-value-#:~:text=In%20practice%2C%20R2%20will,the%20mean%20of%20the%20data.