## Baseline Multiple Linear Regression

## 1. Package Installation

In [1]:
import numpy as np
import pandas as pd
import tqdm
import matplotlib.pyplot as plt
import xgboost as xgb

import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, RepeatedKFold
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel
from Preprocessing import preprocessor

import warnings

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and pandas
# SettingWithCopy warnings raised by later cells — consider narrowing it.
warnings.filterwarnings("ignore")
# Request 'retina' figure output from the inline matplotlib backend.
%config InlineBackend.figure_format='retina'

## 2. Read In Data

In [2]:
# Load the cleaned wine dataset and drop its first CSV column
# (presumably a saved row index — TODO confirm against the file).
df_wine = pd.read_csv('data/df_wine_clean.csv').iloc[:, 1:]
# Preview the first rows as the cell output.
df_wine.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,title,variety,...,130,131,132,133,134,135,136,137,138,139
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,39.5,Sicily & Sardinia,Etna,,Nicosia 2013 Vulkà Bianco (Etna),White Blend,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,...,0.0,0.0,0.0,0.081144,0.0,0.0,0.0,0.0,0.0,0.0
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,...,0.0,0.0,0.0,0.349908,0.092807,0.0,0.0,0.0,0.0,0.0
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,...,0.0,0.0,0.0,0.0,0.0,0.048212,0.0,0.0,0.0,0.0


In [None]:
# Show (n_rows, n_columns) of the loaded frame as the cell output.
df_wine.shape

In [None]:
# Split the wines into California vs. everything else, keeping only the two
# columns needed to compare rating distributions.
rating_cols = ['province', 'normalized rating']
is_california = df_wine['province'] == 'California'
is_other = df_wine['province'] != 'California'

# A single .loc selection avoids chained indexing (df[mask][cols]).
df_california = df_wine.loc[is_california, rating_cols]
# Every row whose province is not 'California' is relabelled 'Others'.
# .assign returns a new frame, so nothing is written into a slice of df_wine
# and no SettingWithCopyWarning is triggered.
df_others = df_wine.loc[is_other, rating_cols].assign(province='Others')

In [None]:
import seaborn as sns

# Compare the distribution of normalized ratings for California wines
# against all other provinces (grouped under the label 'Others').
df_province = pd.concat([df_california, df_others], axis=0)

fig, ax = plt.subplots()
# Draw onto the Axes created above instead of relying on pyplot's implicit
# "current axes" state.
sns.violinplot(x='province', y='normalized rating', data=df_province, ax=ax)
ax.set_title('Normalized rating: California vs. other provinces')
# plt.show() renders through the active backend; Figure.show() only applies
# to GUI backends and is not meaningful with the inline notebook backend.
plt.show()

## 3. Apply MLR

In [3]:
# Baseline design: predict the normalized rating from vintage year and price.
X = df_wine.loc[:, ['year', 'price']]
y = df_wine.loc[:, 'normalized rating']

In [None]:
%%time
mlr = LinearRegression()
cv = ShuffleSplit(n_splits=5, test_size=0.2)
scores = cross_val_score(mlr, preprocessor(X), y, cv=cv, scoring='neg_root_mean_squared_error')
print(f'The model has a mean RMSE: {-np.mean(scores)}, with standard deviation: {np.std(scores)}')

In [4]:
# Fit the same two-feature model with statsmodels OLS to get coefficient-level
# inference (standard errors, t-statistics, confidence intervals).
# The intercept column is added to a separate frame so that X stays the plain
# feature matrix: overwriting X here would silently feed the extra 'const'
# column to every later cell that reuses X.
X_const = sm.add_constant(X)
md1 = sm.OLS(endog=y, exog=X_const).fit()
md1.summary()

0,1,2,3
Dep. Variable:,normalized rating,R-squared:,0.179
Model:,OLS,Adj. R-squared:,0.179
Method:,Least Squares,F-statistic:,13090.0
Date:,"Sat, 17 Dec 2022",Prob (F-statistic):,0.0
Time:,18:51:05,Log-Likelihood:,-197990.0
No. Observations:,119928,AIC:,396000.0
Df Residuals:,119925,BIC:,396000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-66.9383,2.018,-33.172,0.000,-70.893,-62.983
year,0.0354,0.001,35.300,0.000,0.033,0.037
price,0.0144,8.98e-05,160.650,0.000,0.014,0.015

0,1,2,3
Omnibus:,53124.537,Durbin-Watson:,0.374
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4476134.595
Skew:,-1.237,Prob(JB):,0.0
Kurtosis:,32.827,Cond. No.,1110000.0


## 4. Baseline XGBoost

In [None]:
%%time
model_xgb = xgb.XGBRegressor()
cv = ShuffleSplit(n_splits=5, test_size=0.2)
scores = cross_val_score(model_xgb, preprocessor(X), y, cv=cv, scoring='neg_root_mean_squared_error')
print(f'The model has a mean RMSE: {-np.mean(scores)}, with standard deviation: {np.std(scores)}')

In [None]:
# Switch to the model-ready dataset. This rebinds df_wine, X and y, so the
# cells below no longer see the two-feature baseline data.
# The first CSV column is dropped (presumably a saved row index — TODO confirm).
df_wine = pd.read_csv('data/df_wine_ready.csv').iloc[:, 1:]
# After that drop, column 0 is used as the target and every remaining
# column as a feature.
y = df_wine.iloc[:, 0]
X = df_wine.iloc[:, 1:]

In [None]:
%%time
model_xgb = xgb.XGBRegressor(objective ='reg:squarederror', tree_method = 'gpu_hist')
cv = ShuffleSplit(n_splits=5, test_size=0.2)
scores = cross_val_score(model_xgb, X, y, cv=cv, scoring='neg_root_mean_squared_error')
print(f'The model has a mean RMSE: {-np.mean(scores)}, with standard deviation: {np.std(scores)}')

In [None]:
# Preview the first rows of the feature matrix rather than displaying the whole frame.
X.head()