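Basic patsy usage: `data_transformed = dmatrix(formula, data)` builds a design matrix from a formula string and a data source such as a DataFrame.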

In [3]:
from patsy import dmatrix
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import statsmodels.api as sm

In [4]:
# Build a small, reproducible toy data set: two regressors and a noisy linear response.
np.random.seed(0)  # fix the global NumPy RNG so the matrices printed below are repeatable
x1 = 10 + np.random.rand(5)  # uniform draws shifted into [10, 11)
x2 = 10 * np.random.rand(5)  # uniform draws scaled into [0, 10)
y = x1 + 2 * x2 + np.random.randn(5)  # linear signal plus standard-normal noise
df = pd.DataFrame({"x1": x1, "x2": x2, "y": y})

In [5]:
# A bare term: patsy adds an 'Intercept' column of ones next to x1 by default.
dmatrix('x1', data=df)

DesignMatrix with shape (5, 2)
  Intercept        x1
          1  10.54881
          1  10.71519
          1  10.60276
          1  10.54488
          1  10.42365
  Terms:
    'Intercept' (column 0)
    'x1' (column 1)

In [6]:
# Appending '+0' to the formula suppresses the automatic intercept column.
dmatrix('x1+0', data=df)

DesignMatrix with shape (5, 1)
        x1
  10.54881
  10.71519
  10.60276
  10.54488
  10.42365
  Terms:
    'x1' (column 0)

In [9]:
# '-1' removes the intercept term, leaving only the x1 column.
dmatrix('x1-1', data=df)

DesignMatrix with shape (5, 1)
        x1
  10.54881
  10.71519
  10.60276
  10.54488
  10.42365
  Terms:
    'x1' (column 0)

In [10]:
# '+' joins terms: intercept, x1 and x2 each become one column.
dmatrix('x1+x2', data=df)

DesignMatrix with shape (5, 3)
  Intercept        x1       x2
          1  10.54881  6.45894
          1  10.71519  4.37587
          1  10.60276  8.91773
          1  10.54488  9.63663
          1  10.42365  3.83442
  Terms:
    'Intercept' (column 0)
    'x1' (column 1)
    'x2' (column 2)

In [11]:
# Two regressors with the intercept removed via '-1'.
dmatrix('x1+x2-1', data=df)

DesignMatrix with shape (5, 2)
        x1       x2
  10.54881  6.45894
  10.71519  4.37587
  10.60276  8.91773
  10.54488  9.63663
  10.42365  3.83442
  Terms:
    'x1' (column 0)
    'x2' (column 1)

In [13]:
# ':' builds an interaction term; for two numeric columns this is their elementwise product.
dmatrix('x1:x2-1', data=df)

DesignMatrix with shape (5, 1)
      x1:x2
   68.13417
   46.88830
   94.55258
  101.61711
   39.96862
  Terms:
    'x1:x2' (column 0)

In [14]:
# Single string-valued column, used below to show how patsy encodes categorical data.
df2 = pd.DataFrame({"x3": ["A", "B", "C", "D"]})
df2

Unnamed: 0,x3
0,A
1,B
2,C
3,D


In [15]:
# A string column is treated as categorical; without an intercept each level gets its own indicator column.
dmatrix('x3-1', data=df2)

DesignMatrix with shape (4, 4)
  x3[A]  x3[B]  x3[C]  x3[D]
      1      0      0      0
      0      1      0      0
      0      0      1      0
      0      0      0      1
  Terms:
    'x3' (columns 0:4)

In [16]:
# Single integer-valued column, used below to show explicit categorical encoding with C().
df3 = pd.DataFrame({"x4": [1, 2, 3, 4]})
df3

Unnamed: 0,x4
0,1
1,2
2,3
3,4


In [18]:
# C(...) forces a numeric column to be handled as categorical, so each distinct value becomes an indicator column.
dmatrix('C(x4)-1', data=df3)

DesignMatrix with shape (4, 4)
  C(x4)[1]  C(x4)[2]  C(x4)[3]  C(x4)[4]
         1         0         0         0
         0         1         0         0
         0         0         1         0
         0         0         0         1
  Terms:
    'C(x4)' (columns 0:4)

In [22]:
# Larger reproducible sample (20 rows) of the same linear model, used for the OLS fit.
np.random.seed(0)  # reseed so this cell gives the same data regardless of earlier draws
x1 = 10 + np.random.rand(20)  # uniform draws shifted into [10, 11)
x2 = 10 * np.random.rand(20)  # uniform draws scaled into [0, 10)
y = x1 + 2 * x2 + np.random.randn(20)  # linear signal plus standard-normal noise
df4 = pd.DataFrame(np.column_stack((x1, x2, y)), columns=["x1", "x2", "y"])

In [25]:
# Regress y on x1 and x2 (intercept added by the formula) with ordinary least squares,
# then print the standard statsmodels regression report.
model = sm.OLS.from_formula(formula='y~x1+x2', data=df4)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.967
Model:                            OLS   Adj. R-squared:                  0.963
Method:                 Least Squares   F-statistic:                     246.8
Date:                Tue, 03 Jul 2018   Prob (F-statistic):           2.75e-13
Time:                        03:18:31   Log-Likelihood:                -29.000
No. Observations:                  20   AIC:                             64.00
Df Residuals:                      17   BIC:                             66.99
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.4226     10.140      0.140      0.8

In [27]:
from sklearn.datasets import load_boston

In [32]:
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs against scikit-learn < 1.2.
boston = load_boston()
# Features and target as separate frames (used as-is by the array-based OLS further down).
dfX = pd.DataFrame(boston.data, columns=boston.feature_names)
dfy = pd.DataFrame(boston.target, columns=['MEDV'])
# One combined frame so formula-based fitting can refer to features and target by column name.
# This rebinds `df`, replacing the toy frame defined near the top of the notebook.
df = pd.concat([dfX, dfy], axis=1)

In [41]:
# Regress MEDV on all Boston features. C(CHAS) marks CHAS as categorical and the
# trailing '+ 0' drops the explicit intercept term. The two adjacent literals are
# concatenated into a single formula string.
model = sm.OLS.from_formula(
    'MEDV~CRIM + ZN + INDUS + C(CHAS) + '
    'NOX + RM + AGE + DIS + RAD + TAX + PTRATIO + B + LSTAT + 0',
    data=df,
)

In [42]:
# Estimate the coefficients of `model` and print the full regression report.
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Tue, 03 Jul 2018   Prob (F-statistic):          6.95e-135
Time:                        03:29:08   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
C(CHAS)[0.0]    36.4911      5.104      7.149   

In [49]:
# Same regression through the array interface instead of a formula:
# add_constant supplies the 'const' column that plays the role of the intercept.
model2 = sm.OLS(endog=dfy, exog=sm.add_constant(dfX))
result2 = model2.fit()

In [50]:
# `result2` was already fitted in the preceding cell, so only report it here
# instead of re-running the identical fit.
print(result2.summary())

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Tue, 03 Jul 2018   Prob (F-statistic):          6.95e-135
Time:                        03:30:31   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.4911      5.104      7.149      0.0