In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import statsmodels.formula.api as smf

#### Example dataframe

In [2]:
# Fixed seed so the example data (and every coefficient fitted from it)
# is reproducible on Restart Kernel -> Run All.
SEED = 0
rng = np.random.default_rng(SEED)

# Build the frame in a single constructor call: one categorical column
# with three levels and two independent standard-normal columns.
# "x" is drawn before "y" (dict values are evaluated in order).
df = pd.DataFrame(
    {
        "cat": ["a", "b", "a", "c", "b", "c", "a", "c", "b", "b"],
        "x": rng.normal(size=10),
        "y": rng.normal(size=10),
    }
)
df

Unnamed: 0,cat,x,y
0,a,1.434484,-0.370325
1,b,1.079859,0.311202
2,a,0.592733,0.041804
3,c,-1.518686,-0.402556
4,b,1.533031,-0.070859
5,c,0.292722,0.797709
6,a,-0.071529,0.42751
7,c,2.244942,0.677399
8,b,0.338788,-0.22894
9,b,-0.395337,-0.319741


#### Method 1: Pandas get dummies

In [3]:
# Expand `cat` into one indicator column per level and append those
# columns to the original frame.
cat_dummies = pd.get_dummies(df["cat"])
d2 = df.join(cat_dummies)

# All three dummy columns are kept, so they already sum to one in every
# row; the regression is fitted without an intercept.
feature_cols = ["a", "b", "c", "x"]
model = LinearRegression(fit_intercept=False)
model.fit(X=d2[feature_cols], y=d2["y"])

# Coefficients are reported in the order of feature_cols: a, b, c, x.
model.coef_

array([-0.08465177, -0.19242083,  0.29621893,  0.18047108])

#### Method 2: One hot encoder

In [4]:
# Encode `cat` inside the pipeline instead of in pandas: the encoder is
# applied to "cat" only, and any other column handed to fit ("x" here)
# is passed through untouched.
transform = make_column_transformer(
    (OneHotEncoder(), ["cat"]),
    remainder="passthrough",
)

# Same intercept-free linear model as before, now chained after the
# column transformer.
model = LinearRegression(fit_intercept=False)
pipe = make_pipeline(transform, model)

features = df[["cat", "x"]]
target = df["y"]
pipe.fit(X=features, y=target)

# The pipeline fits the `model` object it was given, so the coefficients
# can be read from it directly.
model.coef_

array([-0.08465177, -0.19242083,  0.29621893,  0.18047108])

#### Get dummies with polynomial features

In [5]:
# Degree-2 expansion of the pre-built dummy columns plus "x": the linear
# terms, their squares and all pairwise products. No bias column is
# generated and the regression has no intercept.
poly = PolynomialFeatures(degree=2, include_bias=False)
model = LinearRegression(fit_intercept=False)
pipe = make_pipeline(poly, model)

design_cols = ["a", "b", "c", "x"]
pipe.fit(X=d2[design_cols], y=d2["y"])

# 4 linear terms + 10 second-order terms = 14 coefficients.
model.coef_

array([ 1.79011684e-01, -8.73797699e-02,  3.38351895e-01,  1.45990580e-01,
        1.79011684e-01,  2.77555756e-17,  5.55111512e-17, -4.14023093e-01,
       -8.73797699e-02,  0.00000000e+00,  2.83550756e-01,  3.38351895e-01,
        2.76462918e-01, -1.86767494e-01])

#### One hot encoder with polynomial features

In [6]:
# Stage 1: one-hot encode "cat"; the remaining input column ("x") is
# passed through unchanged.
transform = make_column_transformer(
    (OneHotEncoder(), ["cat"]),
    remainder="passthrough",
)

# Stage 2: degree-2 polynomial expansion without a bias column.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Stage 3: intercept-free linear regression on the expanded features.
model = LinearRegression(fit_intercept=False)

pipe = make_pipeline(transform, poly, model)

features = df[["cat", "x"]]
target = df["y"]
pipe.fit(X=features, y=target)

# The pipeline fits the `model` object it was given, so the coefficients
# can be read from it directly.
model.coef_

array([ 1.79011684e-01, -8.73797699e-02,  3.38351895e-01,  1.45990580e-01,
        1.79011684e-01,  2.77555756e-17,  5.55111512e-17, -4.14023093e-01,
       -8.73797699e-02,  0.00000000e+00,  2.83550756e-01,  3.38351895e-01,
        2.76462918e-01, -1.86767494e-01])