In [1]:
from google.colab import files

uploaded = files.upload()


Saving data_description.txt to data_description.txt
Saving sample_submission.csv to sample_submission.csv
Saving test.csv to test.csv
Saving train.csv to train.csv


In [2]:
import os

os.listdir("/content")


['.config',
 'data_description.txt',
 'sample_submission.csv',
 'train.csv',
 'test.csv',
 'sample_data']

In [3]:
import pandas as pd
import numpy as np

train = pd.read_csv("/content/train.csv")
test  = pd.read_csv("/content/test.csv")

y = train["SalePrice"]
train.drop("SalePrice", axis=1, inplace=True)

full = pd.concat([train, test], axis=0).reset_index(drop=True)

full.shape


(2919, 80)

In [None]:
import pandas as pd

train = pd.read_csv("train.csv")
train.head(10)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [4]:
drop_cols = [
    "Alley", "Street", "Utilities", "LandContour",
    "Condition2", "RoofMatl", "Heating",
    "PoolQC", "Fence", "MiscFeature",
    "LowQualFinSF"
]

full.drop(columns=drop_cols, inplace=True)


In [5]:
full["LotFrontage"] = full.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)


In [6]:
full["TotalLandArea"] = full["LotArea"] + full["WoodDeckSF"] + full["OpenPorchSF"] + \
                        full["EnclosedPorch"] + full["3SsnPorch"] + full["ScreenPorch"]

full["TotalPorchSF"] = full["OpenPorchSF"] + full["EnclosedPorch"] + \
                       full["3SsnPorch"] + full["ScreenPorch"]

full["TotalSF"] = full["GrLivArea"] + full["TotalBsmtSF"]

full.drop(columns=[
    "WoodDeckSF","OpenPorchSF","EnclosedPorch",
    "3SsnPorch","ScreenPorch"
], inplace=True)


In [7]:
full["TotalBath"] = (
    full["FullBath"] +
    0.5 * full["HalfBath"] +
    full["BsmtFullBath"] +
    0.5 * full["BsmtHalfBath"]
)

full.drop(columns=["FullBath","HalfBath","BsmtFullBath","BsmtHalfBath"], inplace=True)


In [8]:
full["HouseAge"] = full["YrSold"] - full["YearBuilt"]
full["RemodelAge"] = full["YrSold"] - full["YearRemodAdd"]

full.drop(columns=["YearBuilt","YearRemodAdd","YrSold"], inplace=True)


In [9]:
qual_map = {"Po":1,"Fa":2,"TA":3,"Gd":4,"Ex":5}

for col in ["ExterQual","ExterCond","HeatingQC","KitchenQual","FireplaceQu","GarageQual","GarageCond"]:
    full[col] = full[col].map(qual_map)

full["QualityScore"] = (
    full["OverallQual"]*2 +
    full["OverallCond"] +
    full["ExterQual"] +
    full["KitchenQual"] +
    full["GarageQual"]
)


In [10]:
for col in full.select_dtypes(include="object"):
    freq = full[col].value_counts(normalize=True)
    rare = freq[freq < 0.01].index
    full[col] = full[col].replace(rare, "Rare")


In [11]:
full = pd.get_dummies(full, drop_first=True)


In [12]:
X_train = full.iloc[:len(y), :]
X_test  = full.iloc[len(y):, :]

X_train.shape, X_test.shape


((1460, 155), (1459, 155))

In [13]:
y_log = np.log1p(y)


In [14]:
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_log, test_size=0.2, random_state=42
)


In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_tr = scaler.fit_transform(X_tr)
X_val = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


In [16]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
import numpy as np

models = {
    "Ridge": Ridge(alpha=20),
    "Lasso": Lasso(alpha=0.0005),
    "ElasticNet": ElasticNet(alpha=0.0005, l1_ratio=0.9)
}

for name, model in models.items():
    model.fit(X_tr, y_tr)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    print(name, "RMSE:", rmse)


ValueError: Input X contains NaN.
Ridge does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [17]:
full.isnull().sum().sort_values(ascending=False).head(20)


Unnamed: 0,0
FireplaceQu,1420
QualityScore,160
GarageYrBlt,159
GarageCond,159
GarageQual,159
MasVnrArea,23
TotalBath,2
BsmtUnfSF,1
GarageCars,1
TotalBsmtSF,1


In [18]:
num_cols = full.select_dtypes(include=["int64","float64"]).columns
full[num_cols] = full[num_cols].fillna(0)


In [19]:
cat_cols = full.select_dtypes(include="object").columns
full[cat_cols] = full[cat_cols].fillna("None")


In [20]:
full = pd.get_dummies(full, drop_first=True)


In [21]:

X_train = full.iloc[:len(y), :]
X_test  = full.iloc[len(y):, :]


In [22]:
y_log = np.log1p(y)


After the file is uploaded, you can access it in the Colab environment. For example, if you uploaded a CSV file named `my_data.csv`, you could then load it into a pandas DataFrame like this:

In [25]:
np.isnan(X_tr).sum()


np.int64(809)

In [26]:
nan_cols = X_train.columns[X_train.isnull().any()]
nan_cols


Index([], dtype='object')

In [27]:
y_log = np.log1p(y)

from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_log, test_size=0.2, random_state=42
)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_tr = scaler.fit_transform(X_tr)
X_val = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

np.isnan(X_tr).sum(), np.isnan(X_val).sum()


(np.int64(0), np.int64(0))

In [28]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
import numpy as np

models = {
    "Ridge": Ridge(alpha=20),
    "Lasso": Lasso(alpha=0.0005),
    "ElasticNet": ElasticNet(alpha=0.0005, l1_ratio=0.9)
}

for name, model in models.items():
    model.fit(X_tr, y_tr)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    print(name, "RMSE:", rmse)


Ridge RMSE: 0.13949659748380897
Lasso RMSE: 0.13919672918547427
ElasticNet RMSE: 0.13929761179054445


In [29]:
from sklearn.model_selection import GridSearchCV

enet = ElasticNet(max_iter=5000)

param_grid = {
    "alpha": [0.0003, 0.0005, 0.001, 0.003],
    "l1_ratio": [0.7, 0.8, 0.9, 0.95]
}

grid = GridSearchCV(
    enet,
    param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error"
)

grid.fit(X_tr, y_tr)

grid.best_params_, -grid.best_score_


({'alpha': 0.003, 'l1_ratio': 0.95}, np.float64(0.1452457721079593))

In [30]:
best_enet = grid.best_estimator_
best_enet.fit(X_tr, y_tr)

val_preds = best_enet.predict(X_val)
np.sqrt(mean_squared_error(y_val, val_preds))


np.float64(0.13827612901477768)

In [31]:
best_enet = grid.best_estimator_
best_enet.fit(X_tr, y_tr)


In [32]:
best_enet = ElasticNet(alpha=0.0005, l1_ratio=0.9)
best_enet.fit(X_tr, y_tr)


In [33]:
val_preds = best_enet.predict(X_val)


In [34]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

rmse = np.sqrt(mean_squared_error(y_val, val_preds))
r2 = r2_score(y_val, val_preds)

rmse, r2


(np.float64(0.13929761179054445), 0.8960199246718434)

In [36]:
train_preds = best_enet.predict(X_tr)

train_rmse = np.sqrt(mean_squared_error(y_tr, train_preds))
val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))

train_rmse, val_rmse


(np.float64(0.11650083059978947), np.float64(0.13929761179054445))

In [38]:
top_feats = ["TotalSF", "QualityScore", "TotalLandArea", "HouseAge"]

for f in top_feats:
    X_train[f+"_sq"] = X_train[f]**2
    X_test[f+"_sq"] = X_test[f]**2

    X_train[f+"_log"] = np.log1p(X_train[f])
    X_test[f+"_log"] = np.log1p(X_test[f])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[f+"_sq"] = X_train[f]**2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[f+"_sq"] = X_test[f]**2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[f+"_log"] = np.log1p(X_train[f])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

In [39]:
neigh_price = pd.DataFrame({
    "Neighborhood": train["Neighborhood"],
    "SalePrice": y
}).groupby("Neighborhood")["SalePrice"].mean()

X_train["NeighPrice"] = train["Neighborhood"].map(neigh_price)
X_test["NeighPrice"] = test["Neighborhood"].map(neigh_price)

X_train["NeighPrice"] = X_train["NeighPrice"].fillna(neigh_price.mean())
X_test["NeighPrice"] = X_test["NeighPrice"].fillna(neigh_price.mean())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["NeighPrice"] = train["Neighborhood"].map(neigh_price)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["NeighPrice"] = test["Neighborhood"].map(neigh_price)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["NeighPrice"] = X_train["NeighPrice"].fillna(neigh_price.mean())
A value 

In [40]:
from sklearn.linear_model import Ridge, ElasticNet

ridge = Ridge(alpha=15)
enet  = ElasticNet(alpha=0.0005, l1_ratio=0.9)

ridge.fit(X_tr, y_tr)
enet.fit(X_tr, y_tr)

ridge_preds = ridge.predict(X_val)
enet_preds  = enet.predict(X_val)

stack_preds = 0.6*enet_preds + 0.4*ridge_preds

np.sqrt(mean_squared_error(y_val, stack_preds))


np.float64(0.13941334942660236)

In [41]:
top_feats = ["TotalSF", "QualityScore", "TotalLandArea", "HouseAge"]

for f in top_feats:
    X_train[f+"_sq"] = X_train[f]**2
    X_test[f+"_sq"] = X_test[f]**2

    X_train[f+"_log"] = np.log1p(X_train[f])
    X_test[f+"_log"] = np.log1p(X_test[f])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[f+"_sq"] = X_train[f]**2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[f+"_sq"] = X_test[f]**2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[f+"_log"] = np.log1p(X_train[f])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

In [46]:
X_train = X_train.copy()
X_test  = X_test.copy()

X_train.loc[:, "NeighPrice"] = train["Neighborhood"].map(neigh_price)
X_test.loc[:, "NeighPrice"] = test["Neighborhood"].map(neigh_price)

mean_price = neigh_price.mean()
X_train["NeighPrice"] = X_train["NeighPrice"].fillna(mean_price)
X_test["NeighPrice"] = X_test["NeighPrice"].fillna(mean_price)


In [47]:
y_log = np.log1p(y)

from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_log, test_size=0.2, random_state=42
)


In [49]:
np.log1p(X_test[f])


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,HouseAge
1460,3.912023
1461,3.970292
1462,2.639057
1463,2.564949
1464,2.944439
...,...
2914,3.610918
2915,3.610918
2916,3.850148
2917,2.708050


In [50]:
np.isinf(X_test).sum()


Unnamed: 0,0
Id,0
MSSubClass,0
LotFrontage,0
LotArea,0
OverallQual,0
...,...
TotalLandArea_sq,0
TotalLandArea_log,0
HouseAge_sq,0
HouseAge_log,1


In [51]:
inf_cols = X_test.columns[np.isinf(X_test).any()]
inf_cols


Index(['HouseAge_log'], dtype='object')

In [52]:
X_train["HouseAge"] = X_train["HouseAge"].clip(lower=0)
X_test["HouseAge"] = X_test["HouseAge"].clip(lower=0)


In [53]:
X_train["HouseAge_log"] = np.log1p(X_train["HouseAge"])
X_test["HouseAge_log"] = np.log1p(X_test["HouseAge"])


In [54]:
X_train = X_train.drop(columns=["HouseAge_log"], errors="ignore")
X_test  = X_test.drop(columns=["HouseAge_log"], errors="ignore")


In [55]:
X_train = X_train.replace([np.inf, -np.inf], 0).fillna(0)
X_test  = X_test.replace([np.inf, -np.inf], 0).fillna(0)


In [56]:
np.isinf(X_train).sum(), np.isnan(X_train).sum()


(Id                   0
 MSSubClass           0
 LotFrontage          0
 LotArea              0
 OverallQual          0
                     ..
 QualityScore_log     0
 TotalLandArea_sq     0
 TotalLandArea_log    0
 HouseAge_sq          0
 NeighPrice           0
 Length: 163, dtype: int64,
 Id                   0
 MSSubClass           0
 LotFrontage          0
 LotArea              0
 OverallQual          0
                     ..
 QualityScore_log     0
 TotalLandArea_sq     0
 TotalLandArea_log    0
 HouseAge_sq          0
 NeighPrice           0
 Length: 163, dtype: int64)

In [57]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_log, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr)
X_val = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


In [58]:
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
import numpy as np

ridge = Ridge(alpha=15)
enet  = ElasticNet(alpha=0.0005, l1_ratio=0.9)

ridge.fit(X_tr, y_tr)
enet.fit(X_tr, y_tr)


  model = cd_fast.enet_coordinate_descent(


In [59]:
ridge_preds = ridge.predict(X_val)
enet_preds  = enet.predict(X_val)

stack_preds = 0.6 * enet_preds + 0.4 * ridge_preds


In [60]:
rmse = np.sqrt(mean_squared_error(y_val, stack_preds))
rmse


np.float64(0.13171796879424477)

In [61]:
!ls


data_description.txt  sample_data  sample_submission.csv  test.csv  train.csv


In [64]:
# Predict on test set using the trained Ridge and ElasticNet
ridge_test = ridge.predict(X_test_scaled)
enet_test  = enet.predict(X_test_scaled)

test_preds_log = 0.6 * enet_test + 0.4 * ridge_test
test_preds = np.expm1(test_preds_log)


In [65]:
submission = pd.read_csv("/content/sample_submission.csv")
submission["SalePrice"] = test_preds
submission.to_csv("submission.csv", index=False)


In [66]:
!ls


data_description.txt  sample_submission.csv  test.csv
sample_data	      submission.csv	     train.csv


In [67]:
from google.colab import files
files.download("submission.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [68]:
import joblib

joblib.dump(ridge, "ridge_model.pkl")
joblib.dump(enet, "elasticnet_model.pkl")
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [69]:
!ls


data_description.txt  sample_data	     submission.csv
elasticnet_model.pkl  sample_submission.csv  test.csv
ridge_model.pkl       scaler.pkl	     train.csv


In [71]:
from google.colab import files

files.download("ridge_model.pkl")
files.download("elasticnet_model.pkl")
files.download("scaler.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [44]:
X_train["NeighPrice"] = train["Neighborhood"].map(neigh_price)
X_test["NeighPrice"] = test["Neighborhood"].map(neigh_price)

mean_price = neigh_price.mean()
X_train["NeighPrice"] = X_train["NeighPrice"].fillna(mean_price)
X_test["NeighPrice"] = X_test["NeighPrice"].fillna(mean_price)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["NeighPrice"] = train["Neighborhood"].map(neigh_price)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["NeighPrice"] = test["Neighborhood"].map(neigh_price)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["NeighPrice"] = X_train["NeighPrice"].fillna(mean_price)
A value is tryin

In [35]:
real_preds = np.expm1(val_preds)
real_y = np.expm1(y_val)

abs_error = np.mean(np.abs(real_preds - real_y))
abs_error


np.float64(16607.623640430917)