# Creating Model Descriptions with Patsy

- module **Patsy**
- string-based **formula syntax** (borrowed from the R and S languages)

    y ~ x0 + x1

### Design matrix

$$\mathbf{Y} = \mathbf{X}\boldsymbol{\beta} + \boldsymbol{\varepsilon}$$

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/31804dd9fded0fcaa26152bd9bdb73f8eade9cf8"
style="display:block; margin:0 auto">

### Design matrix: a worked example

In [3]:
import numpy as np
import pandas as pd

# Toy regression data set: two predictors (x0, x1) and one response (y).
data = pd.DataFrame(
    dict(
        x0=[1, 2, 3, 4, 5],
        x1=[0.01, -0.01, 0.25, -4.1, 0.0],
        y=[-1.5, 0.0, 3.6, 1.3, -2.0],
    )
)
data  # last expression of the cell, so the frame is rendered as output

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [7]:
import patsy
# Build the response matrix (y) and the design matrix (X) from the formula.
# The formula is evaluated once; the returned pair is kept so that its type
# can be shown without parsing and building the matrices a second time.
matrices = patsy.dmatrices('y ~ x0 + x1', data)
y, X = matrices
type(matrices)

# y: response column; X: Intercept, x0 and x1 columns.
y;X

tuple

DesignMatrix with shape (5, 1)
     y
  -1.5
   0.0
   3.6
   1.3
  -2.0
  Terms:
    'y' (column 0)

DesignMatrix with shape (5, 3)
  Intercept  x0     x1
          1   1   0.01
          1   2  -0.01
          1   3   0.25
          1   4  -4.10
          1   5   0.00
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'x1' (column 2)

In [73]:
# Convert the Patsy DesignMatrix objects to plain NumPy arrays.
np.asarray(y)
np.asarray(X)

array([[-1.5],
       [ 0. ],
       [ 3.6],
       [ 1.3],
       [-2. ]])

### Regression without intercept

In [10]:
# '+ 0' in the formula removes the intercept column from the design matrix.
# The formula is evaluated once and unpacked, instead of calling dmatrices
# twice only to index element [0] and then element [1] of the same result.
# Separate names are used so the intercept-including y and X built earlier
# stay available for the least-squares fit that follows.
y_noint, X_noint = patsy.dmatrices('y ~ x0 + x1 + 0', data)
y_noint
X_noint

DesignMatrix with shape (5, 1)
     y
  -1.5
   0.0
   3.6
   1.3
  -2.0
  Terms:
    'y' (column 0)

DesignMatrix with shape (5, 2)
  x0     x1
   1   0.01
   2  -0.01
   3   0.25
   4  -4.10
   5   0.00
  Terms:
    'x0' (column 0)
    'x1' (column 1)

### calculation: `numpy.linalg.lstsq`

In [11]:
# Ordinary least-squares fit of y on the design matrix X.
# rcond=None asks NumPy for its machine-precision-based cutoff on small
# singular values; omitting rcond raises a FutureWarning on NumPy 1.14-1.x
# and makes the cutoff depend on the installed NumPy version.
# lstsq returns (solution, residuals, rank, singular values); only the first
# two are kept here.
coef, resid, _, _ = np.linalg.lstsq(X, y, rcond=None)

coef

array([[ 0.31290976],
       [-0.07910564],
       [-0.26546384]])

In [12]:
# Flatten the (3, 1) coefficient array to 1-D and label each value with the
# matching design-matrix column name taken from X's design_info metadata.
coef = pd.Series(coef.squeeze(), index=X.design_info.column_names)
coef

Intercept    0.312910
x0          -0.079106
x1          -0.265464
dtype: float64

# Data Transformations in Patsy Formulas

In [18]:
# Show the input frame for reference.
data
# Python/NumPy expressions can be used directly inside a formula term:
# here x1 enters the design matrix as log(|x1| + 1).
y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)
y;X

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


DesignMatrix with shape (5, 1)
     y
  -1.5
   0.0
   3.6
   1.3
  -2.0
  Terms:
    'y' (column 0)

DesignMatrix with shape (5, 3)
  Intercept  x0  np.log(np.abs(x1) + 1)
          1   1                 0.00995
          1   2                 0.00995
          1   3                 0.22314
          1   4                 1.62924
          1   5                 0.00000
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'np.log(np.abs(x1) + 1)' (column 2)

In [19]:
# Show the input frame for reference.
data
# Patsy's built-in transforms: standardize(x0) rescales x0 to zero mean and
# unit standard deviation, center(x1) subtracts the mean of x1.
y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)
y;X

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


DesignMatrix with shape (5, 1)
     y
  -1.5
   0.0
   3.6
   1.3
  -2.0
  Terms:
    'y' (column 0)

DesignMatrix with shape (5, 3)
  Intercept  standardize(x0)  center(x1)
          1         -1.41421        0.78
          1         -0.70711        0.76
          1          0.00000        1.02
          1          0.70711       -3.33
          1          1.41421        0.77
  Terms:
    'Intercept' (column 0)
    'standardize(x0)' (column 1)
    'center(x1)' (column 2)

#### Summation inside a formula: wrap it in `I(...)`

In [20]:
# Inside a formula '+' separates terms; wrapping the expression in I(...)
# makes Patsy evaluate x0 + x1 as ordinary element-wise addition, so the
# design matrix gets a single 'I(x0 + x1)' column.
y, X = patsy.dmatrices('y ~ I(x0 + x1)', data)
y;X

DesignMatrix with shape (5, 1)
     y
  -1.5
   0.0
   3.6
   1.3
  -2.0
  Terms:
    'y' (column 0)

DesignMatrix with shape (5, 2)
  Intercept  I(x0 + x1)
          1        1.01
          1        1.99
          1        3.25
          1       -0.10
          1        5.00
  Terms:
    'Intercept' (column 0)
    'I(x0 + x1)' (column 1)


# Categorical Data and Patsy

In [22]:
# Example frame for the categorical section: two grouping keys
# (key1 holds strings, key2 holds integer codes) and two numeric columns.
data = pd.DataFrame(
    dict(
        key1=['a', 'a', 'b', 'b', 'a', 'b', 'c', 'c'],
        key2=[0, 1, 0, 1, 0, 1, 2, 2],
        v1=[1, 2, 3, 4, 5, 6, 7, 8],
        v2=[-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7],
    )
)
data.head()  # preview of the first five rows

Unnamed: 0,key1,key2,v1,v2
0,a,0,1,-1.0
1,a,1,2,0.0
2,b,0,3,2.5
3,b,1,4,-0.5
4,a,0,5,4.0


In [23]:
# A string-valued column is treated as categorical. With an intercept present,
# key1 is dummy-coded against the level 'a', giving key1[T.b] and key1[T.c].
y, X = patsy.dmatrices('v2 ~ key1', data)
X

DesignMatrix with shape (8, 3)
  Intercept  key1[T.b]  key1[T.c]
          1          0          0
          1          0          0
          1          1          0
          1          1          0
          1          0          0
          1          1          0
          1          0          1
          1          0          1
  Terms:
    'Intercept' (column 0)
    'key1' (columns 1:3)

In [24]:
# Without an intercept ('+ 0'), every level of key1 gets its own indicator column.
y, X = patsy.dmatrices('v2 ~ key1 + 0', data)
X

DesignMatrix with shape (8, 3)
  key1[a]  key1[b]  key1[c]
        1        0        0
        1        0        0
        0        1        0
        0        1        0
        1        0        0
        0        1        0
        0        0        1
        0        0        1
  Terms:
    'key1' (columns 0:3)

In [27]:
# Preview the frame; key2 still holds integer codes at this point.
data.head()
# C(...) tells Patsy to treat the integer column key2 as categorical
# rather than as a numeric predictor.
y, X = patsy.dmatrices('v2 ~ C(key2)', data)
X

Unnamed: 0,key1,key2,v1,v2
0,a,0,1,-1.0
1,a,1,2,0.0
2,b,0,3,2.5
3,b,1,4,-0.5
4,a,0,5,4.0


DesignMatrix with shape (8, 3)
  Intercept  C(key2)[T.1]  C(key2)[T.2]
          1             0             0
          1             1             0
          1             0             0
          1             1             0
          1             0             0
          1             1             0
          1             0             1
          1             0             1
  Terms:
    'Intercept' (column 0)
    'C(key2)' (columns 1:3)

### interactions

In [28]:
# Replace the integer key2 codes with string labels.
# Values that are not one of the known codes (for example the labels written
# by an earlier run of this cell) are passed through unchanged, so re-running
# the cell no longer turns the whole column into NaN.
key2_labels = {0: 'zero', 1: 'one', 2: 'two'}
data['key2'] = data['key2'].map(lambda code: key2_labels.get(code, code))
data

Unnamed: 0,key1,key2,v1,v2
0,a,zero,1,-1.0
1,a,one,2,0.0
2,b,zero,3,2.5
3,b,one,4,-0.5
4,a,zero,5,4.0
5,b,one,6,-1.2
6,c,two,7,0.2
7,c,two,8,-1.7


In [29]:
# Two categorical main effects. Each factor is dummy-coded against a reference
# level that gets no column of its own (here 'a' for key1 and 'one' for key2).
y, X = patsy.dmatrices('v2 ~ key1 + key2', data)
X

DesignMatrix with shape (8, 5)
  Intercept  key1[T.b]  key1[T.c]  key2[T.two]  key2[T.zero]
          1          0          0            0             1
          1          0          0            0             0
          1          1          0            0             1
          1          1          0            0             0
          1          0          0            0             1
          1          1          0            0             0
          1          0          1            1             0
          1          0          1            1             0
  Terms:
    'Intercept' (column 0)
    'key1' (columns 1:3)
    'key2' (columns 3:5)

In [30]:
# 'key1:key2' adds the interaction term between the two factors: one extra
# column per pair of key1 and key2 dummy columns (columns 5:9 of the result).
y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data)
X

DesignMatrix with shape (8, 9)
  Columns:
    ['Intercept',
     'key1[T.b]',
     'key1[T.c]',
     'key2[T.two]',
     'key2[T.zero]',
     'key1[T.b]:key2[T.two]',
     'key1[T.c]:key2[T.two]',
     'key1[T.b]:key2[T.zero]',
     'key1[T.c]:key2[T.zero]']
  Terms:
    'Intercept' (column 0)
    'key1' (columns 1:3)
    'key2' (columns 3:5)
    'key1:key2' (columns 5:9)
  (to view full data, use np.asarray(this_obj))