In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.datasets
import statsmodels.api as sm
from sklearn.datasets import fetch_openml

import re

from sklearn.linear_model import LinearRegression

# Load in Data

In [2]:
# Download the "house_prices" dataset from OpenML; as_frame=True asks for pandas objects
ames_housing = fetch_openml(
    name="house_prices",
    as_frame=True,
)

In [3]:
# Combine the feature columns and the sale-price target into one DataFrame.
# Column names are kept exactly as delivered by the dataset at this point.
ames_housing_df = pd.DataFrame(
    ames_housing.data, columns=ames_housing.feature_names
).assign(ave_sale_price=ames_housing.target)

ames_housing_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,ave_sale_price
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0
3,4.0,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0


In [4]:
# Convert the CamelCase column names to snake_case in a single rename pass.
# Order matters: '1st'/'2nd' must be replaced before the bare digits '1'/'2'.
_SNAKE_CASE_SUBSTITUTIONS = [
    ('MS', 'ms'),
    ('SF', '_sf'),
    ('QC', '_qc'),
    ('1st', '_first'),
    ('2nd', '_second'),
    ('1', '_one'),
    ('2', '_two'),
    ('3', 'three'),
]


def _to_snake_case(label):
    """Return `label` rewritten as snake_case using the substitutions above."""
    for old, new in _SNAKE_CASE_SUBSTITUTIONS:
        label = label.replace(old, new)
    # Put '_' in front of every capital that is not the first character, then lower-case
    label = re.sub(r'(?<!^)(?=[A-Z])', '_', label).lower()
    # Substitutions such as '1st' -> '_first' can leave a leading underscore
    return label.strip('_')


ames_housing_df.rename(columns=_to_snake_case, inplace=True)
ames_housing_df.head()

Unnamed: 0,id,ms_sub_class,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,sale_condition,ave_sale_price
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0
3,4.0,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0


# Slope and Intercept Calculations for Single Linear Regression
This example fits the regression with scikit-learn's `LinearRegression`. Credit to Statology for the code below: https://www.statology.org/sklearn-regression-coefficients/

In [5]:
# Least-squares estimator for the one-predictor model (fitted on lot_frontage in a later cell)
model_uni = LinearRegression()

In [6]:
# Keep only the predictor and the target, discarding rows with a missing value in either
uni = ames_housing_df[['lot_frontage', 'ave_sale_price']].dropna()
uni.head()

Unnamed: 0,lot_frontage,ave_sale_price
0,65.0,208500.0
1,80.0,181500.0
2,68.0,223500.0
3,60.0,140000.0
4,84.0,250000.0


In [7]:
# Double brackets keep both X and y as DataFrames rather than Series
X_uni = uni[['lot_frontage']]
y_uni = uni[['ave_sale_price']]
model_uni.fit(X_uni, y_uni)

LinearRegression()

In [8]:
# One row per predictor; coef_[0] is the coefficient row for the single target column
pd.DataFrame({'Column': list(X_uni.columns), 'Slope': model_uni.coef_[0]})

Unnamed: 0,Column,Slope
0,lot_frontage,1208.015549


In [9]:
# The intercept belongs to the model as a whole, not to any predictor, so label it
# 'const' (the name statsmodels gives the same term) instead of pairing it with a
# feature column name.
pd.DataFrame({'Column': ['const'], 'Intercept': model_uni.intercept_})

Unnamed: 0,Column,Intercept
0,lot_frontage,96149.041503


In [10]:
# Refit the one-predictor model with statsmodels to get the full regression summary.
# `sm` comes from the notebook's import cell, so it is not re-imported here.
# add_constant supplies the intercept column, shown as 'const' in the summary.
uni_X = sm.add_constant(uni.lot_frontage)
model = sm.OLS(uni.ave_sale_price, uni_X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:         ave_sale_price   R-squared:                       0.124
Model:                            OLS   Adj. R-squared:                  0.123
Method:                 Least Squares   F-statistic:                     169.4
Date:                Tue, 24 Oct 2023   Prob (F-statistic):           2.60e-36
Time:                        23:52:44   Log-Likelihood:                -15233.
No. Observations:                1201   AIC:                         3.047e+04
Df Residuals:                    1199   BIC:                         3.048e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         9.615e+04   6881.966     13.971   

# Slope and Intercept Calculations for Multiple Linear Regression

In [11]:
# Least-squares estimator for the two-predictor model (fitted on lot_frontage and lot_area in a later cell)
model_multi = LinearRegression()

In [12]:
# Keep the two predictors and the target, discarding rows with a missing value in any of them
multi = ames_housing_df[['lot_frontage', 'lot_area', 'ave_sale_price']].dropna()
multi.head()

Unnamed: 0,lot_frontage,lot_area,ave_sale_price
0,65.0,8450.0,208500.0
1,80.0,9600.0,181500.0
2,68.0,11250.0,223500.0
3,60.0,9550.0,140000.0
4,84.0,14260.0,250000.0


In [13]:
# Double brackets keep both X and y as DataFrames rather than Series
X_multi = multi[['lot_frontage', 'lot_area']]
y_multi = multi[['ave_sale_price']]
model_multi.fit(X_multi, y_multi)

LinearRegression()

In [14]:
# One row per predictor; coef_[0] is the coefficient row for the single target column
pd.DataFrame({'Column': list(X_multi.columns), 'Slope': model_multi.coef_[0]})

Unnamed: 0,Column,Slope
0,lot_frontage,919.273012
1,lot_area,2.076702


In [15]:
# The model has a single intercept regardless of how many predictors it has.
# Zipping it against the feature names silently paired it with 'lot_frontage',
# so label it 'const' (the name statsmodels gives the same term) instead.
pd.DataFrame({'Column': ['const'], 'Intercept': model_multi.intercept_})

Unnamed: 0,Column,Intercept
0,lot_frontage,95708.735102


In [16]:
# Refit the two-predictor model with statsmodels to get the full regression summary.
# The response is taken from `multi`, the same frame X_multi was sliced from, so
# X and y always cover the same rows (previously it was read from `uni`, which
# only lined up because both frames happened to keep identical rows after dropna).
multi_X = sm.add_constant(X_multi)
model = sm.OLS(multi.ave_sale_price, multi_X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:         ave_sale_price   R-squared:                       0.156
Model:                            OLS   Adj. R-squared:                  0.154
Method:                 Least Squares   F-statistic:                     110.4
Date:                Tue, 24 Oct 2023   Prob (F-statistic):           9.79e-45
Time:                        23:52:44   Log-Likelihood:                -15211.
No. Observations:                1201   AIC:                         3.043e+04
Df Residuals:                    1198   BIC:                         3.044e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         9.571e+04   6758.772     14.161   

# Excel Outputs

In [17]:
# Optional: write the frame to Excel so the regression results can be cross-checked
# against Excel's regression tool. Left disabled; uncomment the next line to export.
# ames_housing_df.to_excel('ames_housing.xlsx', index = False)