<h3>Interfacing Between pandas and Model Code</h3>

In [2]:
import pandas as pd
import numpy as np

In [7]:
# Using NumPy arrays to interface pandas and modeling tools.
# Small example frame with two predictor columns (x0, x1) and one response column (y).
data = pd.DataFrame(
    data=dict(
        x0=[1, 2, 3, 4, 5],
        x1=[0.01, -0.01, 0.25, -4.1, 0.0],
        y=[-1.5, 0.0, 3.6, 1.3, -2.0],
    )
)
data

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [8]:
# Column labels of the frame, returned as a pandas Index
data.columns

Index(['x0', 'x1', 'y'], dtype='object')

In [6]:
# Convert the whole frame to a single 2-D NumPy array; the integer column x0
# is stored as floats alongside the float columns
data.to_numpy()

array([[ 1.  ,  0.01, -1.5 ],
       [ 2.  , -0.01,  0.  ],
       [ 3.  ,  0.25,  3.6 ],
       [ 4.  , -4.1 ,  1.3 ],
       [ 5.  ,  0.  , -2.  ]])

In [9]:
# Converting back to a DataFrame.
# The column labels are taken from `data` itself instead of being re-typed,
# so the number of labels always matches the width of the array (a hard-coded
# three-name list fails if this cell is re-run after `data` gains a column).
# The round trip goes through one NumPy array, so the integer column x0
# comes back as floats.
df = pd.DataFrame(
    data.to_numpy(),
    columns=data.columns,
)
df

Unnamed: 0,x0,x1,y
0,1.0,0.01,-1.5
1,2.0,-0.01,0.0
2,3.0,0.25,3.6
3,4.0,-4.1,1.3
4,5.0,0.0,-2.0


In [13]:
# Work on an independent (deep) copy so the column added next does not touch `data`
df3 = data.copy(deep=True)
df3

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [14]:
# Add a non-numeric column: one single-letter string per row
df3["strings"] = list("abcde")
df3

Unnamed: 0,x0,x1,y,strings
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,c
3,4,-4.1,1.3,d
4,5,0.0,-2.0,e


In [16]:
# Indexing a subset of columns: the labels of the predictor columns
# (the response column "y" is deliberately left out)
model_cols = [
    "x0",
    "x1",
]
model_cols

['x0', 'x1']

In [17]:
# All rows, restricted to the columns listed in model_cols
data.loc[:, model_cols]

Unnamed: 0,x0,x1
0,1,0.01
1,2,-0.01
2,3,0.25
3,4,-4.1
4,5,0.0


In [19]:
# Categoricals: attach a categorical column with the two declared levels 'a' and 'b'.
# Note that this adds the column to `data` in place.
data["category"] = pd.Categorical(
    values=list("abaab"),
    categories=list("ab"),
)
data

Unnamed: 0,x0,x1,y,category
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,a
3,4,-4.1,1.3,a
4,5,0.0,-2.0,b


In [20]:
# Using dummies: expand the categorical column into one indicator column per
# level, named category_<level>
dummies = pd.get_dummies(data["category"], prefix="category")
dummies

Unnamed: 0,category_a,category_b
0,True,False
1,False,True
2,True,False
3,True,False
4,False,True


In [21]:
# Swap the original categorical column for its indicator columns;
# join aligns the two frames on their shared row index
data_with_dummies = data.drop(columns="category").join(dummies)
data_with_dummies

Unnamed: 0,x0,x1,y,category_a,category_b
0,1,0.01,-1.5,True,False
1,2,-0.01,0.0,False,True
2,3,0.25,3.6,True,False
3,4,-4.1,1.3,True,False
4,5,0.0,-2.0,False,True


In [22]:
# Using the patsy library.
# Rebuild `data` with only the three numeric columns (x0, x1, y); this
# replaces the earlier frame that had a `category` column added to it.
data = pd.DataFrame(
    data={
        "x0": [1, 2, 3, 4, 5],
        "x1": [0.01, -0.01, 0.25, -4.1, 0.0],
        "y": [-1.5, 0.0, 3.6, 1.3, -2.0],
    }
)
data

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [25]:
import patsy

# Build the design matrices from a formula: the left-hand side (y) becomes the
# response matrix and the right-hand side (x0 + x1) the predictor matrix.
# dmatrices returns them in (response, predictors) order.
y, X = patsy.dmatrices(formula_like="y~x0+x1", data=data)

In [27]:
# Predictor design matrix; patsy has added an Intercept column of ones
X

DesignMatrix with shape (5, 3)
  Intercept  x0     x1
          1   1   0.01
          1   2  -0.01
          1   3   0.25
          1   4  -4.10
          1   5   0.00
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'x1' (column 2)

In [28]:
# Response design matrix: a single column holding y
y

DesignMatrix with shape (5, 1)
     y
  -1.5
   0.0
   3.6
   1.3
  -2.0
  Terms:
    'y' (column 0)

In [32]:
# Suppressing the intercept: the trailing "+0" term in the formula tells patsy
# not to add the Intercept column. This rebinds y and X from the previous fit.
y, X = patsy.dmatrices(formula_like="y~x0+x1+0", data=data)
X

DesignMatrix with shape (5, 2)
  x0     x1
   1   0.01
   2  -0.01
   3   0.25
   4  -4.10
   5   0.00
  Terms:
    'x0' (column 0)
    'x1' (column 1)

In [33]:
# Passing patsy objects into algorithms: the design matrices can be handed
# directly to NumPy's least-squares solver.
# lstsq returns (solution, residuals, rank, singular values); only the first
# two are kept. rcond=None selects the machine-precision-based default cutoff
# and avoids the FutureWarning raised when rcond is omitted.
# NOTE(review): `wef` holds the fitted coefficients - possibly a typo for `coef`.
(wef, resid, _, _) = np.linalg.lstsq(
    X,
    y,
    rcond=None,
)

  wef, resid, _, _ = np.linalg.lstsq(X, y)
