# OLS Linear Regression

In [2]:
# Basic libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statistics as stats
import time

# Machine learning libraries (scikit-learn, SciPy)
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

# Load the training data and drop its 'Unnamed: 12' column
# (described in the original notebook as an NA column).
train_df = pd.read_csv("data/train.csv").drop(columns=['Unnamed: 12'])

# Load the test data and rename 'TA1' to 'TA1.x'
# (presumably so it matches the training column name -- confirm against train.csv).
test_df = pd.read_csv("data/test.csv").rename(columns={'TA1': 'TA1.x'})

# Submission template
sample_sub = pd.read_csv("data/sample_submission.csv")

In [3]:
# Response is 'DIC'; every other column of train_df is used as a predictor
y = train_df['DIC']
X = train_df.drop(columns=['DIC'])

# Hold out 30% of the rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=808,
)

# Learn the scaling parameters on the training split only, then apply them
# to both splits so no validation information leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X.columns)

In [8]:
# Fit ordinary least squares on the scaled training split
ols = LinearRegression().fit(X_train_scaled, y_train)

# Predict the held-out validation split and report its RMSE
y_pred = ols.predict(X_val_scaled)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Final RMSE: {rmse:.4f}")

Final RMSE: 6.7242


In [16]:
# Refit OLS on ALL labelled rows before predicting the test set.
# Note: this refits the same `ols` object, replacing the model fitted on the
# scaled training split. The features are left unscaled here; with an
# intercept, OLS predictions should not depend on feature standardization.
# NOTE(review): every non-'DIC' column is used as a predictor -- if `id` is
# among them, confirm that is intended.
X_total = train_df.drop(columns='DIC')
y_total = train_df['DIC']

ols.fit(X_total, y_total)

# Select exactly the training feature columns, in training order. This keeps
# the cell re-runnable even if extra columns (e.g. a 'DIC' prediction column)
# have been added to test_df, and guards against a different column order.
y_pred_total = ols.predict(test_df[X_total.columns])


In [18]:
# Build the submission frame (id + predicted DIC) without mutating test_df:
# adding a 'DIC' column to test_df in place would change the input of the
# prediction cell and break it on re-run.
submission = test_df[['id']].assign(DIC=y_pred_total)
submission.head()


Unnamed: 0,id,DIC
0,1455,2172.877273
1,1456,2194.918214
2,1457,2326.047119
3,1458,1992.288162
4,1459,2147.398529


In [19]:
# Write the predictions to disk without the DataFrame index column
submission.to_csv(path_or_buf='submission.csv', index=False)

### Trying ridge and lasso regression

In [12]:
# Candidate regularization strengths: 100 values log-spaced from 1e-4 to 1e4
alphas = np.logspace(-4, 4, 100)

# Select alpha by 10-fold cross-validation on the scaled training split
ridge_cv = RidgeCV(alphas=alphas, cv=10)
ridge_cv.fit(X_train_scaled, y_train)

# Report the selected alpha
print(f"Best alpha: {ridge_cv.alpha_}")

# Evaluate the model refit with the selected alpha on the validation split
y_pred_cv = ridge_cv.predict(X_val_scaled)
rmse_ridge = np.sqrt(mean_squared_error(y_val, y_pred_cv))
print(f"RMSE Ridge with best alpha: {rmse_ridge:.4f}")

Best alpha: 0.9111627561154896
RMSE Ridge with best alpha: 6.7328


In [14]:
# Candidate regularization strengths for the lasso (three values)
alphas = [0.1, 1.0, 10.0]

# Fit LassoCV: 10-fold cross-validation over the 3 alphas on the scaled
# training split
lasso_cv = LassoCV(alphas=alphas, cv=10).fit(X_train_scaled, y_train)

# Report the selected alpha
print(f"Best alpha: {lasso_cv.alpha_}")

# Evaluate the selected model on the validation split.
# Lasso-specific names are used so the ridge results (`y_pred_cv`,
# `rmse_ridge`) are not overwritten, and the printed label names the
# model that was actually evaluated.
y_pred_lasso = lasso_cv.predict(X_val_scaled)
rmse_lasso = np.sqrt(mean_squared_error(y_val, y_pred_lasso))
print(f"RMSE Lasso with best alpha: {rmse_lasso:.4f}")

Best alpha: 1.0
RMSE Ridge with best alpha: 7.1573
