## Diamonds

Tasks:

1. Fit a linear regression (price as the outcome) including all columns.
2. Compute the $R^2$ on the test data and compare it to the training $R^2$.
3. Which features seem important?
4. What about x, y, z? Does it make sense to include them in a linear fashion?


In [1]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# -------------------------------------------------------
# 1. Load diamonds data
# -------------------------------------------------------
# Fetch the ggplot2 "diamonds" table through statsmodels' Rdatasets helper.
df = sm.datasets.get_rdataset("diamonds", "ggplot2").data

# Outcome:
y = df["price"]

# Predictors: every remaining column
X = df.drop(columns=["price"])

# Identify categorical vs numeric columns.
# Split on "is it numeric?" instead of on exact dtype names: depending on the
# pandas version and how the frame was built, text columns can be `object`,
# `string`/`str` or `category`, and numerics need not be exactly 64-bit.
# A column that lands in neither list is silently dropped by the
# ColumnTransformer below (its default is remainder="drop").
numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

print("Categorical:", categorical_cols)
print("Numeric:", numeric_cols)

# -------------------------------------------------------
# 2. Preprocessor: OneHotEncoder for categoricals
# -------------------------------------------------------
# Dummy-code the categorical columns (drop="first" removes one level per
# variable, which then acts as the reference category); numeric columns are
# forwarded unchanged.
one_hot = OneHotEncoder(sparse_output=False, drop="first")
column_steps = [
    ("cat", one_hot, categorical_cols),
    ("num", "passthrough", numeric_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# -------------------------------------------------------
# 3. Full pipeline
# -------------------------------------------------------
# Preprocessing followed by an ordinary least-squares fit, as one estimator.
pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", LinearRegression()),
])

# -------------------------------------------------------
# 4. Train–test split and fit
# -------------------------------------------------------
# Hold out 20 % of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

pipe.fit(X_train, y_train)

# For a regression pipeline, .score() reports R².
train_r2 = pipe.score(X_train, y_train)
test_r2 = pipe.score(X_test, y_test)
print("Train R²:", train_r2)
print("Test R²:", test_r2)

# -------------------------------------------------------
# 5. Get coefficient names
# -------------------------------------------------------
# 1. Ask the fitted ColumnTransformer for its output column names. They come
#    back in the same order as the columns fed to the model, so the labels
#    cannot drift out of sync with the coefficients if the transformers are
#    reordered. Names look like "cat__cut_Good" / "num__carat"; strip the
#    "<transformer>__" prefix to keep the plain feature labels.
transformed_names = pipe.named_steps["preprocess"].get_feature_names_out()
feature_names = [name.split("__", 1)[-1] for name in transformed_names]

# 2. Get coefficients from model
coefs = pipe.named_steps["model"].coef_

# One row per model input: label and fitted slope.
coef_df = pd.DataFrame({"feature": feature_names, "coefficient": coefs})

print(coef_df)


Categorical: ['cut', 'color', 'clarity']
Numeric: ['carat', 'depth', 'table', 'x', 'y', 'z']
Train R²: 0.919994314909212
Test R²: 0.9189331350419379
          feature   coefficient
0        cut_Good    591.797169
1       cut_Ideal    858.815946
2     cut_Premium    781.928178
3   cut_Very Good    749.952180
4         color_E   -218.198603
5         color_F   -279.716403
6         color_G   -495.581527
7         color_H   -999.086408
8         color_I  -1479.584470
9         color_J  -2372.019835
10     clarity_IF   5365.944596
11    clarity_SI1   3675.414552
12    clarity_SI2   2701.439970
13    clarity_VS1   4579.905541
14    clarity_VS2   4263.615635
15   clarity_VVS1   5015.292916
16   clarity_VVS2   4958.211449
17          carat  11280.784327
18          depth    -65.091015
19          table    -26.600021
20              x  -1008.041596
21              y     -3.528450
22              z    -36.463370


In [2]:
# Raw coefficient vector of the fitted linear model (same values as coef_df above).
pipe["model"].coef_



array([ 5.91797169e+02,  8.58815946e+02,  7.81928178e+02,  7.49952180e+02,
       -2.18198603e+02, -2.79716403e+02, -4.95581527e+02, -9.99086408e+02,
       -1.47958447e+03, -2.37201983e+03,  5.36594460e+03,  3.67541455e+03,
        2.70143997e+03,  4.57990554e+03,  4.26361564e+03,  5.01529292e+03,
        4.95821145e+03,  1.12807843e+04, -6.50910151e+01, -2.66000210e+01,
       -1.00804160e+03, -3.52845036e+00, -3.64633703e+01])

In [3]:
# Column labels of the diamonds frame (predictors plus the outcome `price`).
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [4]:

import statsmodels.formula.api as smf

# Same predictors as the sklearn pipeline, fitted with statsmodels on the full
# data frame (no train/test split) to obtain standard errors and p-values.
# Each predictor is listed exactly once; `depth` previously appeared twice in
# the formula, which did not change the fit but was misleading to read.
model = smf.ols(
    "price ~ carat + depth + cut + color + clarity + table + x + y + z",
    data=df,
).fit()

# Tip: model.summary().tables[1] shows only the coefficient table.
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.920
Method:                 Least Squares   F-statistic:                 2.688e+04
Date:                Wed, 19 Nov 2025   Prob (F-statistic):               0.00
Time:                        15:35:06   Log-Likelihood:            -4.5573e+05
No. Observations:               53940   AIC:                         9.115e+05
Df Residuals:                   53916   BIC:                         9.117e+05
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept         2184.4774    408.197  

In [5]:
# Regress carat on the three dimension columns to see how much of carat
# x, y and z already account for.
carat_formula = "carat ~  x + y+ z"
model = smf.ols(carat_formula, data=df).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  carat   R-squared:                       0.952
Model:                            OLS   Adj. R-squared:                  0.952
Method:                 Least Squares   F-statistic:                 3.536e+05
Date:                Wed, 19 Nov 2025   Prob (F-statistic):               0.00
Time:                        15:35:06   Log-Likelihood:                 45412.
No. Observations:               53940   AIC:                        -9.082e+04
Df Residuals:                   53936   BIC:                        -9.078e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.5668      0.002   -669.401      0.0

## Interactions

Interaction effects are also known, in a business context, as **synergies**.

In [6]:
import statsmodels.formula.api as smf

# Baseline without an interaction: carat and depth enter additively.
additive_spec = smf.ols(formula="price ~ carat + depth", data=df)
model = additive_spec.fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.851
Model:                            OLS   Adj. R-squared:                  0.851
Method:                 Least Squares   F-statistic:                 1.536e+05
Date:                Wed, 19 Nov 2025   Prob (F-statistic):               0.00
Time:                        15:35:06   Log-Likelihood:            -4.7249e+05
No. Observations:               53940   AIC:                         9.450e+05
Df Residuals:                   53937   BIC:                         9.450e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   4045.3332    286.205     14.134      0.0

In [12]:
# `carat * depth` is formula shorthand for both main effects plus their
# interaction (carat + depth + carat:depth).
model = smf.ols(data=df, formula="price ~ carat * depth").fit()

interaction_summary = model.summary()
print(interaction_summary)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.852
Model:                            OLS   Adj. R-squared:                  0.852
Method:                 Least Squares   F-statistic:                 1.036e+05
Date:                Wed, 19 Nov 2025   Prob (F-statistic):               0.00
Time:                        15:37:46   Log-Likelihood:            -4.7223e+05
No. Observations:               53940   AIC:                         9.445e+05
Df Residuals:                   53936   BIC:                         9.445e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept   -7823.7383    592.049    -13.215      

In [8]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures

# Load data and dummy-encode it in one step.
# NOTE: this rebinds `df`, so the cells below see the output of pd.get_dummies
# rather than the raw diamonds table.

df = pd.get_dummies(
    sm.datasets.get_rdataset("diamonds", "ggplot2").data
)

The same interaction model, fitted with sklearn:

In [None]:
# Column labels of `df`, which the previous cell rebound to the pd.get_dummies output.
df.columns

In [10]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# `df` is reused from the cells above, so the diamonds data is not reloaded here.

# Same two predictors as in the statsmodels formula
X = df[["carat", "depth"]]
y = df["price"]

# Pipeline: interaction terms only.
# degree=2 with interaction_only=True yields carat, depth and carat*depth
# (no squared terms); include_bias=False leaves the intercept to LinearRegression.
interaction_step = PolynomialFeatures(
    degree=2, interaction_only=True, include_bias=False
)
pipe = Pipeline(steps=[
    ("interaction", interaction_step),
    ("model", LinearRegression()),
])

pipe.fit(X, y)

fitted_model = pipe.named_steps["model"]
print("Coefficients:", fitted_model.coef_)
print("Intercept:", fitted_model.intercept_)


Coefficients: [20742.59987133    90.04321842  -210.07533218]
Intercept: -7823.738251003058


Show the expanded feature matrix (the two original columns plus their product):

In [11]:
import numpy as np

# The "interaction" step was already fitted by pipe.fit(X, y), so only apply it;
# fit_transform would re-fit a component of the fitted pipeline just to look at it.
X_trans = pipe.named_steps["interaction"].transform(X)

# First five rows; columns are carat, depth and carat * depth.
print(X_trans[:5])

[[ 0.23  61.5   14.145]
 [ 0.21  59.8   12.558]
 [ 0.23  56.9   13.087]
 [ 0.29  62.4   18.096]
 [ 0.31  63.3   19.623]]
