In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the raw house-price table and keep an untouched deep copy (df2):
# later cells add dummy columns to df itself.
df = pd.read_csv("data/house_prices.csv")
df2 = df.copy(deep=True)
# Peek at the last five rows (tail() defaults to n=5).
df.tail()

Unnamed: 0,house_id,neighborhood,area,bedrooms,bathrooms,style,price
6023,4373,B,757,0,0,lodge,385420
6024,4422,C,3540,5,3,victorian,890627
6025,5894,B,1518,2,1,lodge,760829
6026,5591,C,2270,4,2,ranch,575515
6027,6211,C,3355,5,3,victorian,844747


In [2]:
# The function below creates 1, 0, -1 coded (effect-coded) dummy variables.


def dummy_cat(df, col):
    """
    Add 1, 0, -1 coded dummy columns for one categorical column.

    The distinct non-missing values of df[col] are taken in order of first
    appearance. The last of them is the reference level: it gets no column
    of its own and its rows are coded -1 in every added column. Each other
    level gets a column named after the level, holding 1 on that level's
    rows, -1 on the reference level's rows and 0 elsewhere.

    INPUT:
    df - the dataframe where col is stored (modified in place)
    col - the categorical column you want to dummy (as a string)
    OUTPUT:
    df - the same dataframe object with the added columns
         for dummy variables using 1, 0, -1 coding

    Rows where df[col] is missing get 0 in every added column. If df[col]
    has fewer than two distinct non-missing values, no columns are added.
    """
    # Drop missing values before enumerating levels: unique() reports NaN but
    # nunique() does not, and that mismatch used to end in a KeyError.
    levels = list(df[col].dropna().unique())
    if not levels:
        return df

    reference = levels[-1]
    # 1 on reference-level rows, 0 elsewhere; subtracted from each indicator
    # below so those rows end up as -1.
    is_reference = (df[col] == reference).astype("int64")

    for level in levels[:-1]:
        df[level] = (df[col] == level).astype("int64") - is_reference
    return df

In [3]:
# dummy_cat adds its columns to the frame it is given and returns that same
# object, so pass a copy: df stays as loaded and new_df is an independent
# frame that later cells can extend without touching df.
new_df = dummy_cat(df.copy(), "style")  # Use on style
new_df.tail(5)

Unnamed: 0,house_id,neighborhood,area,bedrooms,bathrooms,style,price,ranch,victorian
6023,4373,B,757,0,0,lodge,385420,-1,-1
6024,4422,C,3540,5,3,victorian,890627,0,1
6025,5894,B,1518,2,1,lodge,760829,-1,-1
6026,5591,C,2270,4,2,ranch,575515,1,0
6027,6211,C,3355,5,3,victorian,844747,0,1


In [4]:
# Constant column so the regression estimates an intercept.
new_df["intercept"] = 1

# Regress price on the 1, 0, -1 coded style dummies.
lm = sm.OLS(
    endog=new_df["price"],
    exog=new_df[["intercept", "ranch", "victorian"]],
)
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.339
Model:,OLS,Adj. R-squared:,0.339
Method:,Least Squares,F-statistic:,1548.0
Date:,"Wed, 30 Nov 2022",Prob (F-statistic):,0.0
Time:,13:42:48,Log-Likelihood:,-86683.0
No. Observations:,6028,AIC:,173400.0
Df Residuals:,6025,BIC:,173400.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,6.421e+05,5854.251,109.677,0.000,6.31e+05,6.54e+05
ranch,-6.695e+04,8233.489,-8.131,0.000,-8.31e+04,-5.08e+04
victorian,4.04e+05,7377.372,54.763,0.000,3.9e+05,4.18e+05

0,1,2,3
Omnibus:,1340.12,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3232.81
Skew:,1.23,Prob(JB):,0.0
Kurtosis:,5.611,Cond. No.,1.84


In [5]:
# Predicted price for a "lodge" row, which is coded -1 on both dummy columns:
# intercept - ranch coefficient - victorian coefficient, using the rounded
# coefficients copied from the summary table above.
6.421e05 - (-6.695e04) - 4.04e05

305050.0

In [6]:
# Standard 0/1 dummy coding, for comparison with the 1, 0, -1 coding above.
# dtype=int keeps the indicator columns integer-typed: pandas >= 2.0 returns
# bool columns by default, and mixing those with the integer intercept column
# can leave the regression design matrix object-typed, which statsmodels rejects.
# The style column is read from df2, the untouched copy of the loaded data.
style_dummies = pd.get_dummies(df2["style"], dtype=int)
new_df2 = df2.join(style_dummies)
new_df2.tail(5)

Unnamed: 0,house_id,neighborhood,area,bedrooms,bathrooms,style,price,lodge,ranch,victorian
6023,4373,B,757,0,0,lodge,385420,1,0,0
6024,4422,C,3540,5,3,victorian,890627,0,0,1
6025,5894,B,1518,2,1,lodge,760829,1,0,0
6026,5591,C,2270,4,2,ranch,575515,0,1,0
6027,6211,C,3355,5,3,victorian,844747,0,0,1


In [7]:
# Constant column so the regression estimates an intercept.
new_df2["intercept"] = 1

# Regress price on the 0/1 style dummies; the "lodge" column is left out of
# the regressors.
lm2 = sm.OLS(
    endog=new_df2["price"],
    exog=new_df2[["intercept", "ranch", "victorian"]],
)
results2 = lm2.fit()
results2.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.339
Model:,OLS,Adj. R-squared:,0.339
Method:,Least Squares,F-statistic:,1548.0
Date:,"Wed, 30 Nov 2022",Prob (F-statistic):,0.0
Time:,13:43:02,Log-Likelihood:,-86683.0
No. Observations:,6028,AIC:,173400.0
Df Residuals:,6025,BIC:,173400.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,3.05e+05,1.21e+04,25.120,0.000,2.81e+05,3.29e+05
ranch,2.701e+05,1.57e+04,17.153,0.000,2.39e+05,3.01e+05
victorian,7.411e+05,1.44e+04,51.396,0.000,7.13e+05,7.69e+05

0,1,2,3
Omnibus:,1340.12,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3232.81
Skew:,1.23,Prob(JB):,0.0
Kurtosis:,5.611,Cond. No.,4.77
