In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [12]:
# Load the Ames housing data, then keep only rows whose SaleCondition is
# 'Normal' and whose GrLivArea does not exceed 4000.
ames = pd.read_csv('ames.csv')
is_normal_sale = ames['SaleCondition'] == 'Normal'
within_area_limit = ames['GrLivArea'] <= 4000
ames_normal = ames[is_normal_sale & within_area_limit]

In [39]:
# Proof of concept: regress 'SalePrice' on 'GrLivArea' alone and evaluate the
# fit with a hand-written 5-fold cross-validation loop.

X = ames_normal[['GrLivArea']].values
y = ames_normal['SalePrice'].values
model = LinearRegression()
kf = KFold(n_splits=5)

# Per-fold results: test-set score, fitted slope and fitted intercept.
scores, coefs, intercept = [], [], []

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # Rows used for fitting vs. rows held out for scoring in this fold.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    model.fit(X_train, y_train)

    scores.append(model.score(X_test, y_test))
    coefs.append(model.coef_)
    intercept.append(model.intercept_)

# Average every collected quantity over the five folds.
mean_score = np.mean(scores)
mean_coefs = np.mean(coefs)
mean_intercept = np.mean(intercept)

print(f"Mean CV R² = {mean_score:.4f}")
print(f"Mean Coefficient = {mean_coefs:.4f}")
print(f"Mean Y-intercept = {mean_intercept:.0f}")

Mean CV R² = 0.5306
Mean Coefficient = 106.7591
Mean Y-intercept = 17730


In [40]:
# Per-neighborhood sale count and mean sale price, ordered from the cheapest
# neighborhood to the most expensive one.
neighbor_stats = (
    ames_normal
    .groupby('Neighborhood')['SalePrice']
    .agg(['count', 'mean'])
    .sort_values(by='mean')
)
# Whole-number formatting is applied only to the printed copy.
print(neighbor_stats.round(0).astype(int))

              count    mean
Neighborhood               
MeadowV          34   96836
BrDale           26  107360
IDOTRR           68  111615
BrkSide          98  126674
OldTown         204  127898
Edwards         155  131223
SWISU            40  132519
Landmrk           1  137000
Sawyer          135  137190
NPkVill          22  140743
Blueste          10  143590
NAmes           395  145778
Mitchel         101  163818
SawyerW         106  187753
Gilbert         130  188606
NWAmes          117  192515
Greens            8  193531
Blmngtn          19  195853
CollgCr         226  196502
Crawfor          89  199951
ClearCr          38  216559
Somerst         114  223900
Timber           52  241773
Veenker          22  255866
StoneBr          35  276269
GrnHill           2  280000
NridgHt         100  302479
NoRidge          66  319616


In [41]:
# One-hot encode 'Neighborhood'. 'MeadowV' is dropped so that it acts as the
# baseline level: every coefficient is then measured relative to it.
#
# The encoder is left at its default (sparse) output and densified with
# .toarray(). The former `sparse=False` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, where passing it raises a TypeError;
# this form runs on both older and current releases.
encoder = OneHotEncoder(drop=['MeadowV'])
X = encoder.fit_transform(ames_normal[['Neighborhood']]).toarray()
y = ames_normal['SalePrice'].values

# Set up KFold and the per-fold result storage
kf = KFold(n_splits=5)
scores = []
coefficients = []
intercept = []

# Perform the KFold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # A fresh estimator is fitted for every fold
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Record the held-out score and the fitted parameters of this fold
    scores.append(model.score(X_test, y_test))
    coefficients.append(model.coef_)
    intercept.append(model.intercept_)

mean_score = np.mean(scores)
print(f"Mean CV R² = {mean_score:.4f}")
# axis=0 averages across folds and keeps one value per encoded neighborhood
mean_coefficients = np.mean(coefficients, axis=0)
mean_intercept = np.mean(intercept)
print(f"Mean Y-intercept = {mean_intercept:.0f}")

Mean CV R² = 0.5437
Mean Y-intercept = 96842




In [42]:
# Tabulate the fold-averaged coefficient of each encoded neighborhood.

# encoder.categories_ still lists the dropped level, so 'MeadowV' is filtered
# out here to make the names line up with the coefficient vector.
neighborhoods = [name for name in encoder.categories_[0] if name != 'MeadowV']

# Whole-number coefficients are easier to scan in the printed table.
rounded_coefficients = mean_coefficients.round(0).astype(int)

coefficients_df = pd.DataFrame({
    'Neighborhood': neighborhoods,
    'Average Coefficient': rounded_coefficients
})

# Print the table ordered from the smallest to the largest coefficient.
ranked_coefficients = coefficients_df.sort_values(by='Average Coefficient')
print(ranked_coefficients.reset_index(drop=True))

   Neighborhood  Average Coefficient
0        BrDale                10479
1        IDOTRR                14783
2       BrkSide                29868
3       OldTown                31037
4       Landmrk                32133
5       Edwards                34359
6         SWISU                35629
7        Sawyer                40323
8       NPkVill                43976
9       Blueste                46623
10        NAmes                48955
11      Mitchel                66982
12      SawyerW                90900
13      Gilbert                91783
14       NWAmes                95708
15       Greens                96625
16      Blmngtn                99083
17      CollgCr                99640
18      Crawfor               103129
19      ClearCr               119882
20      Somerst               127042
21       Timber               145038
22      Veenker               159325
23      StoneBr               179165
24      GrnHill               183158
25      NridgHt               205603
2

In [43]:
# Fuller model: one categorical predictor ('Neighborhood') plus several
# numeric predictors, with 'SalePrice' as the response.

feature_columns = ['Neighborhood', 'HQSF', 'TotalBath', 'BedroomAbvGr', 'FireplaceYN', 'GarageCars']
features = ames_normal[feature_columns]
target = ames_normal['SalePrice']

In [44]:
# Preprocess features using ColumnTransformer: numeric columns pass through
# unchanged, 'Neighborhood' is one-hot encoded with 'MeadowV' dropped.
#
# The encoder no longer receives `sparse=False`: that keyword was deprecated
# in scikit-learn 1.2 and removed in 1.4, where it raises a TypeError.
# Instead, `sparse_threshold=0` makes the ColumnTransformer always return a
# dense array, which works on both older and current releases.

# Single definition of the numeric columns, shared by the transformer and by
# feature_names so the two cannot drift apart.
numeric_features = ['HQSF', 'TotalBath', 'BedroomAbvGr', 'FireplaceYN', 'GarageCars']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(drop=['MeadowV'], handle_unknown='ignore'), ['Neighborhood'])
    ],
    sparse_threshold=0)
# Fit and transform the features
X_transformed = preprocessor.fit_transform(features)
# Output columns follow the transformer order: numeric first, then the
# one-hot neighborhood columns.
feature_names = numeric_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out())



In [45]:
# 5-fold cross-validation of the combined (numeric + neighborhood) model.
kf = KFold(n_splits=5)

# Per-fold results
scores = []
coefficients_list = []
intercepts_list = []

for train_index, test_index in kf.split(X_transformed):
    # X_transformed is indexed positionally; target is a Series, hence .iloc
    X_train, y_train = X_transformed[train_index], target.iloc[train_index]
    X_test, y_test = X_transformed[test_index], target.iloc[test_index]

    # Fit a fresh linear regression on this fold's training rows
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Keep the fitted parameters and the held-out score
    coefficients_list.append(model.coef_)
    intercepts_list.append(model.intercept_)
    scores.append(model.score(X_test, y_test))

# Fold averages; axis=0 keeps one averaged coefficient per feature
average_score = np.mean(scores)
average_coefficients = np.mean(coefficients_list, axis=0)
mean_intercept = np.mean(intercepts_list)

print(f"Mean CV R² Score of Combined Model: {average_score:.4f}")
print(f"Mean Y-intercept = {mean_intercept:.0f}")

Mean CV R² Score of Combined Model: 0.8296
Mean Y-intercept = -2857


In [46]:
# Pair every feature name with its fold-averaged coefficient ...
coefficient_table = pd.DataFrame({
    'Feature': feature_names,
    'Average Coefficient': average_coefficients
    })
# ... and order the rows from the most negative to the most positive.
df_coefficients = coefficient_table.sort_values(by='Average Coefficient')
df_coefficients = df_coefficients.reset_index(drop=True)

print("Coefficients for Combined Model:")
print(df_coefficients)

Coefficients for Combined Model:
                 Feature  Average Coefficient
0           BedroomAbvGr         -7668.779704
1   Neighborhood_NPkVill         -3166.915860
2                   HQSF            51.407016
3     Neighborhood_SWISU           303.965880
4   Neighborhood_OldTown          1226.506602
5    Neighborhood_IDOTRR          5906.802106
6   Neighborhood_Edwards          8514.217954
7   Neighborhood_Blmngtn          8944.353340
8    Neighborhood_BrDale          9344.843035
9   Neighborhood_Landmrk          9572.198158
10    Neighborhood_NAmes          9799.939598
11             TotalBath         10984.979831
12   Neighborhood_NWAmes         11187.928602
13            GarageCars         11480.599834
14           FireplaceYN         11576.038719
15  Neighborhood_Blueste         11672.562553
16   Neighborhood_Sawyer         12020.493101
17  Neighborhood_BrkSide         12090.502102
18  Neighborhood_Mitchel         15535.627792
19  Neighborhood_SawyerW         20930.303202
2

In [49]:
# Raise the displayed-column limit to 500, then preview five randomly
# drawn rows of the filtered data.
pd.options.display.max_columns = 500
ames_normal.sample(n=5)

Unnamed: 0.1,Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,TotalBath,TotalSF,HQSF,yrsbtwn,BedxBath,RoomsxBath,FireplaceYN,TotalPorchSF
595,596,527356020,2784,250000,60,RL,80.0,16692,Pave,No Alley,IR1,Lvl,AllPub,Inside,Gtl,NWAmes,RRAn,Norm,1Fam,2Story,7,5,1978,1978,Gable,CompShg,Plywood,Plywood,BrkFace,184.0,TA,TA,CBlock,Gd,TA,No,BLQ,790.0,LwQ,469.0,133.0,1392.0,GasA,TA,Y,SBrkr,1392,1392,0,1.0,0.0,3,1,5,1,Gd,12,Typ,2,TA,Attchd,1978.0,RFn,2.0,564.0,TA,TA,Y,0,112,0,0,440,519,Fa,MnPrv,TenC,2000,7,2006,WD,Normal,4.5,4176.0,4176.0,0,22.5,54.0,1,552
1848,32,531385060,1677,190500,60,RL,65.0,8450,Pave,No Alley,Reg,Lvl,AllPub,Inside,Gtl,SawyerW,Norm,Norm,1Fam,2Story,6,5,2001,2001,Gable,CompShg,VinylSd,VinylSd,No MasVnr,0.0,Gd,TA,PConc,Gd,TA,No,GLQ,472.0,Unf,0.0,355.0,827.0,GasA,Ex,Y,SBrkr,827,850,0,1.0,0.0,2,1,3,1,Gd,7,Typ,0,No Fireplace,Attchd,2001.0,RFn,2.0,627.0,TA,TA,Y,0,68,0,0,0,0,No Pool,No Fence,No Misc,0,9,2008,WD,Normal,3.5,2504.0,2504.0,0,10.5,24.5,0,68
2284,468,528429050,1574,232000,20,RL,75.0,11957,Pave,No Alley,IR1,Lvl,AllPub,Inside,Gtl,Somerst,RRAn,Norm,1Fam,1Story,8,5,2006,2006,Gable,CompShg,VinylSd,VinylSd,BrkFace,53.0,Gd,TA,PConc,Gd,TA,No,GLQ,24.0,Unf,0.0,1550.0,1574.0,GasA,Ex,Y,SBrkr,1574,0,0,0.0,0.0,2,0,3,1,Gd,7,Typ,1,Gd,Attchd,2006.0,RFn,3.0,824.0,TA,TA,Y,144,104,0,0,0,0,No Pool,No Fence,No Misc,0,7,2008,WD,Normal,2.0,3148.0,3148.0,0,6.0,14.0,1,248
2335,519,527451640,987,118000,160,RM,21.0,1680,Pave,No Alley,Reg,Lvl,AllPub,Inside,Gtl,BrDale,Norm,Norm,Twnhs,2Story,6,8,1972,2007,Gable,CompShg,HdBoard,HdBoard,BrkFace,510.0,TA,TA,CBlock,TA,TA,No,ALQ,162.0,Unf,0.0,321.0,483.0,GasA,Gd,Y,SBrkr,483,504,0,0.0,0.0,1,1,2,1,Gd,5,Typ,0,No Fireplace,Detchd,1972.0,Unf,1.0,264.0,TA,TA,Y,250,0,0,0,0,0,No Pool,No Fence,No Misc,0,5,2009,WD,Normal,1.5,1470.0,1470.0,35,3.0,7.5,0,250
938,939,903452025,768,93850,30,RM,61.114286,6291,Grvl,No Alley,IR1,Lvl,AllPub,Inside,Gtl,IDOTRR,RRNe,Norm,1Fam,1Story,6,6,1930,1950,Gable,CompShg,Stucco,Wd Shng,No MasVnr,0.0,Gd,Gd,BrkTil,TA,TA,No,Unf,0.0,Unf,0.0,768.0,768.0,GasA,TA,Y,SBrkr,768,0,0,0.0,0.0,1,0,1,1,TA,4,Typ,0,No Fireplace,Detchd,1930.0,Unf,2.0,440.0,TA,TA,N,0,0,84,0,0,0,No Pool,No Fence,No Misc,0,7,2008,WD,Normal,1.0,1536.0,1536.0,20,1.0,4.0,0,84
