In [48]:
from patsy import dmatrix, demo_data, ContrastMatrix, Poly

# Build patsy's demo data: a single categorical variable "a" with three levels.
data = demo_data("a", nlevels=3)
# Display the raw dict so the levels and their order are visible.
data

{'a': ['a1', 'a2', 'a3', 'a1', 'a2', 'a3']}

In [49]:
# Default coding of a categorical term: the matrix displayed for this cell has
# an Intercept column plus indicator columns a[T.a2] and a[T.a3].
dmatrix("a", data)

DesignMatrix with shape (6, 3)
  Intercept  a[T.a2]  a[T.a3]
          1        0        0
          1        1        0
          1        0        1
          1        0        0
          1        1        0
          1        0        1
  Terms:
    'Intercept' (column 0)
    'a' (columns 1:3)

In [50]:
# Pass an explicit level order to C(). In the displayed matrix the first listed
# level ("a3") gets no column of its own, i.e. it acts as the reference level.
# NOTE: the variable name `l` appears verbatim in the output column labels.
l = ["a3", "a2", "a1"]
dmatrix("C(a, levels=l)", data)

DesignMatrix with shape (6, 3)
  Intercept  C(a, levels=l)[T.a2]  C(a, levels=l)[T.a1]
          1                     0                     1
          1                     1                     0
          1                     0                     0
          1                     0                     1
          1                     1                     0
          1                     0                     0
  Terms:
    'Intercept' (column 0)
    'C(a, levels=l)' (columns 1:3)

In [52]:
# Deliberate failure: "a1" occurs in the data but is not among the declared
# levels, so patsy raises the PatsyError shown as this cell's output.
l = ["a3", "a2", "blue"]
dmatrix("C(a, levels=l)", data)

PatsyError: Error converting data to categorical: observation with value 'a1' does not match any of the expected levels (expected: ['a3', 'a2', 'blue'])
    C(a, levels=l)
    ^^^^^^^^^^^^^^

In [4]:
 # Polynomial contrast coding: produces the .Linear and .Quadratic columns
 # shown in this cell's output.
 dmatrix("C(a, Poly)", data)

DesignMatrix with shape (6, 3)
  Intercept  C(a, Poly).Linear  C(a, Poly).Quadratic
          1           -0.70711               0.40825
          1           -0.00000              -0.81650
          1            0.70711               0.40825
          1           -0.70711               0.40825
          1           -0.00000              -0.81650
          1            0.70711               0.40825
  Terms:
    'Intercept' (column 0)
    'C(a, Poly)' (columns 1:3)

In [5]:
# A contrast can be given as a plain nested list with one row per level of `a`.
contrast = [[1, 2], [3, 4], [5, 6]]
# NOTE: this result is discarded -- only the cell's last expression is displayed.
dmatrix("C(a, contrast)", data)
# Inline single-column contrast; the output labels its column "[custom0]".
dmatrix("C(a, [[1], [2], [-4]])", data)

DesignMatrix with shape (6, 2)
  Intercept  C(a, [[1], [2], [-4]])[custom0]
          1                                1
          1                                2
          1                               -4
          1                                1
          1                                2
          1                               -4
  Terms:
    'Intercept' (column 0)
    'C(a, [[1], [2], [-4]])' (column 1)

In [6]:
# Wrap the nested-list `contrast` in a ContrastMatrix so that the two columns
# carry explicit suffixes ("[pretty0]", "[pretty1]") instead of default names.
contrast_mat = ContrastMatrix(contrast, ["[pretty0]", "[pretty1]"])
dmatrix("C(a, contrast_mat)", data)

DesignMatrix with shape (6, 3)
  Intercept  C(a, contrast_mat)[pretty0]  C(a, contrast_mat)[pretty1]
          1                            1                            2
          1                            3                            4
          1                            5                            6
          1                            1                            2
          1                            3                            4
          1                            5                            6
  Terms:
    'Intercept' (column 0)
    'C(a, contrast_mat)' (columns 1:3)

In [7]:
import pandas
# Load the hsb2 dataset straight from its URL as comma-separated text.
url = 'http://www.ats.ucla.edu/stat/data/hsb2.csv'
hsb2 = pandas.read_csv(url, sep=",")

In [8]:
from patsy.contrasts import Treatment

# Levels of the categorical variable whose coding is being illustrated.
levels = [1,2,3,4]
# Treatment (dummy) coding; with reference=0 the first level is the reference,
# which is why its row in the printed matrix is all zeros.
contrast = Treatment(reference=0).code_without_intercept(levels)

# Parenthesised print is valid both as a Python 2 statement and a Python 3 call.
print(contrast.matrix)

[[ 0.  0.  0.]
 [ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]]


In [9]:
# Look up each observation's contrast row by using `race - 1` as a row index
# (presumably race is coded 1..4 -- confirm against the data), then show the
# first 20 rows.
contrast.matrix[hsb2.race-1, :][:20]

array([[ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  1.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.]])

In [10]:
from statsmodels.formula.api import ols
# Regress `write` on `race` using treatment coding and print the fit summary.
mod = ols("write ~ C(race, Treatment)", data=hsb2)
res = mod.fit()
# Parenthesised print is valid both as a Python 2 statement and a Python 3 call.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Sat, 04 Feb 2017   Prob (F-statistic):           5.78e-05
Time:                        23:06:19   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------
Intercept                 

In [43]:
from statsmodels.formula.api import ols
# Same model with an explicit reference passed to Treatment; the overall fit
# statistics in the output are unchanged, only the coefficients' meaning moves.
# NOTE(review): confirm whether patsy reads `2` as a level value or an index.
mod = ols("write ~ C(race, Treatment(2))", data=hsb2)
res = mod.fit()
# Parenthesised print is valid both as a Python 2 statement and a Python 3 call.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Sat, 04 Feb 2017   Prob (F-statistic):           5.78e-05
Time:                        23:32:53   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
----------------------------------------------------------------------------------------------
Intercept           

In [46]:
from patsy.contrasts import Sum

# Sum (deviation) coding.
# NOTE: the matrix printed here comes from the default Sum(), whereas the model
# below is fitted with Sum(0), so the printed coding is not the fitted one.
contrast = Sum().code_without_intercept(levels)
# Parenthesised print is valid both as a Python 2 statement and a Python 3 call.
print(contrast.matrix)
mod = ols("write ~ C(race, Sum(0))", data=hsb2)
res = mod.fit()
print(res.summary())

[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]
 [-1. -1. -1.]]
                            OLS Regression Results                            
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Sat, 04 Feb 2017   Prob (F-statistic):           5.78e-05
Time:                        23:34:32   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------

In [15]:
# Unweighted mean of the per-race means of `write` (a mean of group means, not
# the overall sample mean).
hsb2.groupby('race')['write'].mean().mean()

51.67837643678162

In [16]:
from patsy.contrasts import Diff
contrast = Diff().code_without_intercept(levels)
print contrast.matrix
mod = ols("write ~ C(race, Diff)", data=hsb2)
res = mod.fit()
print res.summary()

[[-0.75 -0.5  -0.25]
 [ 0.25 -0.5  -0.25]
 [ 0.25  0.5  -0.25]
 [ 0.25  0.5   0.75]]
                            OLS Regression Results                            
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Sat, 04 Feb 2017   Prob (F-statistic):           5.78e-05
Time:                        23:22:27   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------

In [17]:
# Difference in mean `write` between race group 2 and race group 1.
hsb2.groupby('race').mean()["write"][2] - \
    hsb2.groupby('race').mean()["write"][1]

11.541666666666664

In [18]:
from patsy.contrasts import Helmert
contrast = Helmert().code_without_intercept(levels)
print contrast.matrix
mod = ols("write ~ C(race, Helmert)", data=hsb2)
res = mod.fit()
print res.summary()

[[-1. -1. -1.]
 [ 1. -1. -1.]
 [ 0.  2. -1.]
 [ 0.  0.  3.]]
                            OLS Regression Results                            
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Sat, 04 Feb 2017   Prob (F-statistic):           5.78e-05
Time:                        23:23:48   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------

In [19]:
# Mean `write` of race group 4 minus the average of the first three group means.
grouped = hsb2.groupby('race')
grouped.mean()["write"][4] - grouped.mean()["write"][:3].mean()

3.1690613026819818

In [21]:
# Group k's mean `write` minus the average of the first k-1 group means,
# scaled by 1/k (here k = 4).
k = 4
1./k * (grouped.mean()["write"][k] - grouped.mean()["write"][:k-1].mean())


0.79226532567049546

In [25]:
import numpy as np

# Split `read` into three histogram bins and label every observation by bin.
_, bins = np.histogram(hsb2.read, 3)
try:
    # `right=True` makes the bin intervals closed on the right; older numpy
    # releases do not accept this argument.
    readcat = np.digitize(hsb2.read, bins, right=True)
except TypeError:
    # Fall back only when the argument is unsupported; any other error now
    # propagates instead of being silently swallowed by a bare `except`.
    readcat = np.digitize(hsb2.read, bins)

# Store the bin label as a new column and show mean `write` per bin.
hsb2['readcat'] = readcat
hsb2.groupby('readcat').mean()['write']

readcat
0    46.000000
1    44.980392
2    53.356436
3    60.127660
Name: write, dtype: float64

In [26]:
from patsy.contrasts import Poly
# Orthogonal polynomial coding over the distinct `readcat` values.
levels = hsb2.readcat.unique().tolist()
contrast = Poly().code_without_intercept(levels)
# Parenthesised print is valid both as a Python 2 statement and a Python 3 call.
print(contrast.matrix)
mod = ols("write ~ C(readcat, Poly)", data=hsb2)
res = mod.fit()
print(res.summary())

[[-0.67082039  0.5        -0.2236068 ]
 [-0.2236068  -0.5         0.67082039]
 [ 0.2236068  -0.5        -0.67082039]
 [ 0.67082039  0.5         0.2236068 ]]
                            OLS Regression Results                            
Dep. Variable:                  write   R-squared:                       0.320
Model:                            OLS   Adj. R-squared:                  0.309
Method:                 Least Squares   F-statistic:                     30.73
Date:                Sat, 04 Feb 2017   Prob (F-statistic):           2.51e-16
Time:                        23:27:17   Log-Likelihood:                -694.54
No. Observations:                 200   AIC:                             1397.
Df Residuals:                     196   BIC:                             1410.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                 coef    std err     

In [45]:
from patsy.contrasts import Poly
# Polynomial coding again, this time passing [1,2,10,11] to Poly in the formula.
# NOTE: the matrix printed here comes from the default Poly(), so it is not the
# coding used by the model fitted below.
levels = hsb2.readcat.unique().tolist()
contrast = Poly().code_without_intercept(levels)
# Parenthesised print is valid both as a Python 2 statement and a Python 3 call.
print(contrast.matrix)
mod = ols("write ~ C(readcat, Poly([1,2,10,11]))", data=hsb2)
res = mod.fit()
print(res.summary())

[[-0.67082039  0.5        -0.2236068 ]
 [-0.2236068  -0.5         0.67082039]
 [ 0.2236068  -0.5        -0.67082039]
 [ 0.67082039  0.5         0.2236068 ]]
                            OLS Regression Results                            
Dep. Variable:                  write   R-squared:                       0.320
Model:                            OLS   Adj. R-squared:                  0.309
Method:                 Least Squares   F-statistic:                     30.73
Date:                Sat, 04 Feb 2017   Prob (F-statistic):           2.51e-16
Time:                        23:33:51   Log-Likelihood:                -694.54
No. Observations:                 200   AIC:                             1397.
Df Residuals:                     196   BIC:                             1410.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                                 coef

In [39]:
from patsy.contrasts import ContrastMatrix
def _name_levels(prefix, levels):
    return ["[%s%s]" % (prefix, level) for level in levels]

class Simple(object):
    """Simple contrast coding, usable in a patsy formula as ``C(x, Simple)``.

    patsy calls ``code_with_intercept`` when the term must span the intercept
    itself and ``code_without_intercept`` when an intercept is already present,
    so the two methods must return matrices of different widths.
    """

    def _simple_contrast(self, levels):
        """Return the (nlevels, nlevels - 1) array of simple-contrast columns.

        Every entry is -1/nlevels, except that column j holds
        (nlevels - 1)/nlevels in row j + 1.
        """
        nlevels = len(levels)
        contr = -1./nlevels * np.ones((nlevels, nlevels-1))
        # contr[1:] is a view, so this writes the diagonal of rows 1..n-1.
        contr[1:][np.diag_indices(nlevels-1)] = (nlevels-1.)/nlevels
        return contr

    def code_with_intercept(self, levels):
        """Full-rank coding: a column of ones followed by the contrast columns.

        Returns a ContrastMatrix with nlevels columns, one suffix per level.
        """
        contrast = np.column_stack((np.ones(len(levels)),
                                    self._simple_contrast(levels)))
        return ContrastMatrix(contrast, _name_levels("Simp.", levels))

    def code_without_intercept(self, levels):
        """Reduced-rank coding: only the nlevels - 1 contrast columns.

        The column of ones is deliberately omitted; including it would
        duplicate the model's own intercept and make the design rank-deficient.
        Returns a ContrastMatrix with nlevels - 1 columns, named after all
        levels but the last.
        """
        contrast = self._simple_contrast(levels)
        return ContrastMatrix(contrast, _name_levels("Simp.", levels[:-1]))
        


In [42]:

# Fit with the user-defined Simple coding; "- 1" removes the formula's own
# intercept term.
mod = ols("write ~ C(race, Simple) - 1", data=hsb2)
res = mod.fit()
# Parenthesised print is valid both as a Python 2 statement and a Python 3 call.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  write   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     7.833
Date:                Sat, 04 Feb 2017   Prob (F-statistic):           5.78e-05
Time:                        23:31:48   Log-Likelihood:                -721.77
No. Observations:                 200   AIC:                             1452.
Df Residuals:                     196   BIC:                             1465.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------
C(race, Simple)[Simp.1]   