Regression Model

In [107]:
import pandas as pd

In [108]:
# Load the training data from its location on the local machine.
train_csv_path = "/Users/karissamohr/Documents/GSB-S544/gsb-544-fall-2025-regression/train_new.csv"
train = pd.read_csv(train_csv_path)

In [109]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,SalePrice,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,...,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,159000,531363010,80.0,9605,Pave,SawyerW,1Fam,1Story,7,6,...,1,1,3,6,1218,Typ,0,0,2009,WD
1,271900,906203120,90.0,14684,Pave,SawyerW,1Fam,1Story,7,7,...,2,0,3,7,2196,Typ,0,0,2009,WD
2,137500,916176030,,14375,Pave,Timber,1Fam,SLvl,6,6,...,1,0,3,7,1344,Typ,233,0,2009,COD
3,248500,528180130,48.0,6472,Pave,NridgHt,TwnhsE,1Story,9,5,...,2,0,2,6,1456,Typ,0,0,2009,WD
4,167000,528290030,61.0,9734,Pave,Gilbert,1Fam,SLvl,7,5,...,2,1,3,7,1374,Typ,0,0,2009,WD


In [110]:
import numpy as np

In [111]:
# Predictors: every column except the target and the parcel identifier.
target_col = "SalePrice"
X = train.drop(columns=[target_col, "PID"])

# Target: the model is trained on the natural log of the sale price.
y = np.log(train[target_col])

In [112]:
# Inspect the dtype of each training column.
train.dtypes

SalePrice          int64
PID                int64
Lot Frontage     float64
Lot Area           int64
Street            object
Neighborhood      object
Bldg Type         object
House Style       object
Overall Qual       int64
Overall Cond       int64
Year Built         int64
Roof Style        object
Heating           object
Central Air       object
Electrical        object
Full Bath          int64
Half Bath          int64
Bedroom AbvGr      int64
TotRms AbvGrd      int64
Gr Liv Area        int64
Functional        object
Screen Porch       int64
Pool Area          int64
Yr Sold            int64
Sale Type         object
dtype: object

In [113]:
# Split the predictor columns into two lists by dtype.
# "number" matches every numeric dtype (not only int64/float64), and the
# categorical list is its exact complement, so the two lists together cover
# every column of X. That matters because the ColumnTransformer below is built
# with remainder="drop": a column in neither list would be silently discarded.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

In [114]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill missing values with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [115]:
from sklearn.preprocessing import OneHotEncoder

# Categorical preprocessing: fill missing values with the most frequent level,
# then one-hot encode. Levels not seen during fit are ignored at predict time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("dummify", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [116]:
from sklearn.compose import ColumnTransformer

# Route each group of columns through its own preprocessing pipeline.
# Columns listed in neither group are dropped.
column_routes = [
    ("numeric", numeric_transformer, numeric_features),
    ("categorical", categorical_transformer, categorical_features),
]
ct = ColumnTransformer(transformers=column_routes, remainder="drop")

In [117]:
from sklearn.linear_model import Ridge

# Full model: column preprocessing followed by ridge regression (alpha=10).
ridge_model = Ridge(alpha=10)
ridge_pipeline = Pipeline(steps=[("preprocessing", ct), ("ridge", ridge_model)])

In [118]:
# Fit the preprocessing + ridge pipeline on all training rows, using the log-price target.
ridge_pipeline.fit(X, y)

In [119]:
# Load the test data from its location on the local machine.
test_csv_path = "/Users/karissamohr/Documents/GSB-S544/gsb-544-fall-2025-regression/test_new.csv"
test = pd.read_csv(test_csv_path)

In [120]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,...,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,907135180,60,8070,Pave,CollgCr,1Fam,1Story,4,5,1994,...,1,0,3,5,990,Typ,0,0,2007,WD
1,528181040,40,6792,Pave,NridgHt,TwnhsE,1Story,7,5,2005,...,2,0,2,6,1368,Typ,0,0,2006,New
2,528175010,44,6371,Pave,NridgHt,TwnhsE,1Story,7,5,2009,...,2,0,2,6,1358,Typ,0,0,2010,New
3,531379030,70,8304,Pave,SawyerW,1Fam,2Story,6,5,1997,...,2,1,3,7,1837,Typ,0,0,2006,WD
4,923275090,37,6951,Pave,Mitchel,1Fam,1Story,5,5,1984,...,1,0,3,5,923,Typ,0,0,2008,WD


In [121]:
# Test predictors: drop the parcel identifier, as was done for the training predictors.
X_test = test.drop(columns="PID")

In [122]:
# The pipeline was fit on log(SalePrice), so its predictions are on the log scale.
log_preds = ridge_pipeline.predict(X=X_test)

# Invert the log transform to get predictions on the original price scale.
final_preds = np.exp(log_preds)

In [124]:
# Look at the first ten predictions, now on the original price scale.
final_preds[:10]

array([126528.92771698, 218017.02023468, 217406.07928437, 185658.3976752 ,
       128836.66926592, 226045.36659687, 147501.12711326, 126351.95252384,
       139175.30387398, 177219.02377854])

In [125]:
from sklearn.model_selection import cross_val_score, KFold

# 5-fold shuffled cross-validation of the full pipeline on the log-price target.
cv = KFold(n_splits=5, shuffle=True, random_state=544)

# The scorer reports negated RMSE, one value per fold.
scores = cross_val_score(
    estimator=ridge_pipeline, X=X, y=y, scoring="neg_root_mean_squared_error", cv=cv
)

# Flip the sign to get a positive RMSE (in log-price units) and its spread across folds.
rmse = -np.mean(scores)
rmse_std = np.std(scores)

(rmse, rmse_std)


(0.1494907172812784, 0.017786686607468894)

Submission CSV

In [126]:
# Recompute the test predictions for the submission file. This repeats the
# earlier prediction cell: same pipeline, same X_test.
log_preds = ridge_pipeline.predict(X_test)
# Convert the log-scale predictions back to the original price scale.
final_preds = np.exp(log_preds)

In [127]:
# Submission table: one row per test record, with its PID and predicted sale price.
submission_columns = {"PID": test["PID"], "SalePrice": final_preds}
submission = pd.DataFrame(data=submission_columns)

In [128]:
# Write the submission to the current working directory, without the index column.
submission.to_csv(path_or_buf="submission_ridge.csv", index=False)

In [129]:
# Preview the first rows of the submission table.
submission.head()

Unnamed: 0,PID,SalePrice
0,907135180,126528.927717
1,528181040,218017.020235
2,528175010,217406.079284
3,531379030,185658.397675
4,923275090,128836.669266


In [130]:
# Write a second copy of the submission to an absolute path on the local machine.
submission_out_path = "/Users/karissamohr/Documents/GSB-S544/submission_ridge.csv"
submission.to_csv(path_or_buf=submission_out_path, index=False)

In [131]:
# Check the submission's dimensions (rows, columns).
submission.shape

(605, 2)

In [132]:
# Confirm the submission contains only the PID and SalePrice columns.
submission.columns

Index(['PID', 'SalePrice'], dtype='object')