# Dealing with a log-transformed target

Reference: https://library.virginia.edu/data/articles/interpreting-log-transformations-in-a-linear-model

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge

test_array = np.array([1, 2, 3, 4, 5])

# Seed NumPy's global RNG so every run reproduces the same draws.
np.random.seed(42)

# Size of the synthetic data set.
num_samples = 10000

# Three uniform(0, 1) features, drawn in the order x1, x2, x3, followed by
# Gaussian noise with mean 0 and standard deviation 0.1.
x1, x2, x3 = (np.random.rand(num_samples) for _ in range(3))
noise = np.random.normal(0, 0.1, num_samples)

# The target is an exact linear combination of the features plus the noise:
# target = 1 * x1 + 2 * x2 + 3 * x3 + noise
target = 1 * x1 + 2 * x2 + 3 * x3 + noise

# Gather everything (including the noise term itself) into one DataFrame.
data = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, noise=noise, target=target))

# Show the first few rows.
print(data.head())

         x1        x2        x3     noise    target
0  0.374540  0.373641  0.729998 -0.063069  3.248747
1  0.950714  0.332912  0.184512  0.214031  2.384105
2  0.731994  0.176154  0.346640  0.166568  2.290789
3  0.598658  0.607267  0.663281  0.073094  3.876127
4  0.156019  0.476624  0.482089 -0.034308  2.521227


In [2]:
# Ordinary least squares on the untransformed target. The noise column is
# passed in as a feature alongside x1, x2 and x3.
X = data.loc[:, ['x1', 'x2', 'x3', 'noise']]
y = data.loc[:, 'target']

# fit() returns the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(X, y)

# One coefficient per feature, in the column order of X.
model.coef_

array([1., 2., 3., 1.])

In [3]:
# Regress the natural log of the target on x1, x2, x3 and the noise column.
X = data.loc[:, ['x1', 'x2', 'x3', 'noise']]
y = np.log(data.loc[:, 'target'])

# Ridge regression with a small penalty (alpha = 0.02), a tolerance of 1e-6
# and no intercept term; fit() returns the fitted estimator.
model = Ridge(alpha=0.02, fit_intercept=False, tol=1e-6).fit(X, y)

# One coefficient per feature, on the log scale.
model.coef_

array([0.31355088, 0.6989442 , 1.05462518, 0.39284309])

In [4]:
# Exponentiate the coefficients of `model` (fitted on log(target) in the
# previous cell) to view them on the original, multiplicative scale.
np.exp(model.coef_)

array([1.36827508, 2.0116277 , 2.87089889, 1.48118595])

In [5]:
# Rebuild the fitted values by hand from the coefficients of `model`:
#   * log_pred   - running sum of feature * coefficient (log scale)
#   * prediction - running product of exp(feature * coefficient)
# log_target holds the log of the actual target for side-by-side comparison.
cols = ['x1', 'x2', 'x3', 'noise']
coefs = model.coef_

# Work on a copy of `data`, seeded with the neutral elements of the product (1)
# and the sum (0).
test_df = data.assign(
    prediction=1,
    log_pred=0,
    log_target=np.log(data['target']),
)

for col, coef in zip(cols, coefs):
    contribution = test_df[col] * coef
    test_df['log_pred'] = contribution + test_df['log_pred']
    test_df['prediction'] = np.exp(contribution) * test_df['prediction']

test_df.head()
    

Unnamed: 0,x1,x2,x3,noise,target,prediction,log_pred,log_target
0,0.37454,0.373641,0.729998,-0.063069,3.248747,3.076184,1.12369,1.17827
1,0.950714,0.332912,0.184512,0.214031,2.384105,2.246685,0.809456,0.868824
2,0.731994,0.176154,0.34664,0.166568,2.290789,2.189447,0.783649,0.828896
3,0.598658,0.607267,0.663281,0.073094,3.876127,3.820503,1.340382,1.354837
4,0.156019,0.476624,0.482089,-0.034308,2.521227,2.403677,0.877,0.924746
