In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
# Synthetic, reproducible sample: 20 rentals, each with a city and a rent.
np.random.seed(1)
categorías = ['Guadalajara', 'Zapopan', 'Ciudad de México', 'Monterrey', 'Morelia', 'Mexicali']
ponderación = [1/4, 1/5, 1/4, 7/40, 1/20, 3/40]  # probability of drawing each city
limites = [(3500, 6000), (3000, 12000), (4000, 18000), (3000, 6000), (2000, 5000), (3500, 5000)]

# Pair every city with its (low, high) rent bounds so the lookup is by name.
limites_por_ciudad = dict(zip(categorías, limites))

# Draw the cities first, then one rent per drawn city, in the same order.
ciudad = np.random.choice(categorías, size=20, p=ponderación)
rentas = [np.random.randint(*limites_por_ciudad[c]) for c in ciudad]

df = pd.DataFrame({'Rentas': rentas, 'Ciudad': ciudad})
df.head()

Unnamed: 0,Rentas,Ciudad
0,9285,Zapopan
1,5025,Monterrey
2,4531,Guadalajara
3,7415,Zapopan
4,4837,Guadalajara


#### One-Hot

In [7]:
pd.get_dummies(df) # One-hot encodes every categorical column (here only 'Ciudad'); 'Rentas' is kept as is


Unnamed: 0,Rentas,Ciudad_Ciudad de México,Ciudad_Guadalajara,Ciudad_Monterrey,Ciudad_Morelia,Ciudad_Zapopan
0,9285,0,0,0,0,1
1,5025,0,0,1,0,0
2,4531,0,1,0,0,0
3,7415,0,0,0,0,1
4,4837,0,1,0,0,0
5,5420,0,1,0,0,0
6,3816,0,1,0,0,0
7,11920,0,0,0,0,1
8,9771,0,0,0,0,1
9,4431,1,0,0,0,0


In [6]:
# One-hot encode the 'Ciudad' column on its own: the result holds only the
# indicator columns, named after the city labels (no 'Rentas' column).
pd.get_dummies(df['Ciudad'])

Unnamed: 0,Ciudad de México,Guadalajara,Monterrey,Morelia,Zapopan
0,0,0,0,0,1
1,0,0,1,0,0
2,0,1,0,0,0
3,0,0,0,0,1
4,0,1,0,0,0
5,0,1,0,0,0
6,0,1,0,0,0
7,0,0,0,0,1
8,0,0,0,0,1
9,1,0,0,0,0


In [11]:
# The empty prefix and separator make the indicator columns carry the bare city names.
OH=pd.get_dummies(df,columns=['Ciudad'],prefix='',prefix_sep='') # Encodes 'Ciudad' and returns the rest of the DataFrame
OH.head()

Unnamed: 0,Rentas,Ciudad de México,Guadalajara,Monterrey,Morelia,Zapopan
0,9285,0,0,0,0,1
1,5025,0,0,1,0,0
2,4531,0,1,0,0,0
3,7415,0,0,0,0,1
4,4837,0,1,0,0,0


#### Codificación ficticia

In [12]:
# Dummy coding: drop_first=True drops the indicator of the first city level,
# which becomes the reference category (all remaining indicators are 0 for it).
F=pd.get_dummies(df,prefix='',prefix_sep='',drop_first=True)
F.head()

Unnamed: 0,Rentas,Guadalajara,Monterrey,Morelia,Zapopan
0,9285,0,0,0,1
1,5025,0,1,0,0
2,4531,1,0,0,0
3,7415,0,0,0,1
4,4837,1,0,0,0


In [24]:
# The reference category is the single column present in the one-hot frame
# but absent from the dummy-coded one; the 1-tuple unpacking fails loudly
# if the difference does not contain exactly one name.
(CR,) = set(OH.columns).difference(F.columns)
CR

'Ciudad de México'

#### Codificación de efectos

In [25]:
# Effects coding starts from the same frame as dummy coding (first level dropped);
# the rows of the reference category are set to -1 in a later cell.
E = pd.get_dummies(df,prefix='',prefix_sep='', drop_first=True)
E.head()

Unnamed: 0,Rentas,Guadalajara,Monterrey,Morelia,Zapopan
0,9285,0,0,0,1
1,5025,0,1,0,0
2,4531,1,0,0,0
3,7415,0,0,0,1
4,4837,1,0,0,0


In [26]:
# Positional indices of the rows whose city is the reference category CR.
# np.where returns a 1-tuple for a 1-D condition, hence the trailing-comma unpacking.
cr_idx,=np.where(df['Ciudad']==CR)
cr_idx

array([ 9, 11, 15, 17], dtype=int64)

In [27]:
# Inspect the column dtypes of the coded frame before modifying the indicator columns.
E.dtypes

Rentas         int64
Guadalajara    uint8
Monterrey      uint8
Morelia        uint8
Zapopan        uint8
dtype: object

In [28]:
# Cast every column to a signed integer dtype so the indicator columns can hold -1.
E=E.astype('int')
E.dtypes

Rentas         int32
Guadalajara    int32
Monterrey      int32
Morelia        int32
Zapopan        int32
dtype: object

In [30]:
# Effects coding: on the rows of the reference category, set every indicator
# column (all columns after 'Rentas', which is column 0) to -1.
E.iloc[cr_idx,1:]=-1
E

Unnamed: 0,Rentas,Guadalajara,Monterrey,Morelia,Zapopan
0,9285,0,0,0,1
1,5025,0,1,0,0
2,4531,1,0,0,0
3,7415,0,0,0,1
4,4837,1,0,0,0
5,5420,1,0,0,0
6,3816,1,0,0,0
7,11920,0,0,0,1
8,9771,0,0,0,1
9,4431,-1,-1,-1,-1


### Regresión lineal

#### One-Hot

In [37]:
# Fit rent on the one-hot indicators (all columns of OH except 'Rentas').
# NOTE(review): together with the intercept the indicator columns are collinear
# (each row has exactly one 1), so the fitted coefficients are one of several
# equivalent solutions -- confirm which one LinearRegression returns.
linOH = LinearRegression()
linOH.fit(OH.drop(columns='Rentas'),OH.Rentas)

LinearRegression()

In [38]:
# One fitted coefficient per one-hot city column, in the column order of OH.
linOH.coef_

array([ 4525.88333333, -1799.11666667, -1357.36666667, -3287.36666667,
        1917.96666667])

In [39]:
# Fitted intercept of the one-hot model.
linOH.intercept_

6382.366666666668

In [40]:
# Mean rent of each city that appears in the sample (one row per city).
p = df.groupby('Ciudad').mean()
p

Unnamed: 0_level_0,Rentas
Ciudad,Unnamed: 1_level_1
Ciudad de México,10908.25
Guadalajara,4583.25
Monterrey,5025.0
Morelia,3095.0
Zapopan,8300.333333


In [42]:
# Unweighted mean of the per-city mean rents: every city counts once,
# regardless of how many rows it has in df.
p.mean()

Rentas    6382.366667
dtype: float64

El promedio de `p` (es decir, el promedio de las rentas promedio por ciudad) corresponde a la intersección de la regresión lineal.

In [44]:
# Side by side: per-city mean rents centred on their own average, and the one-hot coefficients.
(p - p.mean(),linOH.coef_)

(                       Rentas
 Ciudad                       
 Ciudad de México  4525.883333
 Guadalajara      -1799.116667
 Monterrey        -1357.366667
 Morelia          -3287.366667
 Zapopan           1917.966667,
 array([ 4525.88333333, -1799.11666667, -1357.36666667, -3287.36666667,
         1917.96666667]))

Los coeficientes de la regresión lineal corresponden a la diferencia entre la renta promedio de cada ciudad y el promedio de las rentas promedio de todas las ciudades (la intersección).

#### Codificación ficticia

In [47]:
# Fit rent on the dummy-coded indicators (the reference category has no column).
LinF = LinearRegression()
LinF.fit(F.drop(columns='Rentas'),F.Rentas)

LinearRegression()

In [48]:
# One fitted coefficient per non-reference city, in the column order of F.
LinF.coef_

array([-6325.        , -5883.25      , -7813.25      , -2607.91666667])

In [49]:
# Fitted intercept of the dummy-coded model.
LinF.intercept_

10908.249999999998

In [51]:
p.loc[CR] # Mean rent of the reference category


Rentas    10908.25
Name: Ciudad de México, dtype: float64

En la codificación ficticia, al hacer una regresión lineal, la intersección representa el promedio de la categoría de referencia.

In [52]:
# Per-city mean rents measured relative to the mean rent of the reference category.
p - p.loc[CR]

Unnamed: 0_level_0,Rentas
Ciudad,Unnamed: 1_level_1
Ciudad de México,0.0
Guadalajara,-6325.0
Monterrey,-5883.25
Morelia,-7813.25
Zapopan,-2607.916667


In [53]:
# Dummy-coded coefficients, shown again to compare with the per-city differences to the reference mean.
LinF.coef_

array([-6325.        , -5883.25      , -7813.25      , -2607.91666667])

Los coeficientes corresponden a la diferencia entre la renta promedio de cada ciudad y la intersección (el promedio de la categoría de referencia).

#### Codificación de efectos

In [54]:
# Fit rent on the effects-coded indicators; fit() returns the estimator itself,
# so construction and fitting are chained into a single assignment.
linE = LinearRegression().fit(E.drop(columns='Rentas'), E['Rentas'])

In [57]:
# One fitted coefficient per non-reference city, in the column order of E.
linE.coef_

array([-1799.11666667, -1357.36666667, -3287.36666667,  1917.96666667])

In [58]:
# One-hot coefficients without the first entry (the 'Ciudad de México' column),
# for comparison with the effects-coded coefficients.
linOH.coef_[1:]

array([-1799.11666667, -1357.36666667, -3287.36666667,  1917.96666667])

In [59]:
# Fitted intercept of the effects-coded model.
linE.intercept_

6382.366666666668

In [60]:
# Intercept of the one-hot model, shown again to compare with the effects-coded intercept.
linOH.intercept_

6382.366666666668

In [62]:
# The reference-category rows are coded -1 on every indicator, so the model's
# prediction for them is the intercept minus the sum of all coefficients.
linE.intercept_-linE.coef_.sum()

10908.250000000002