In [1]:
# this needs to be run for each new runtime
# because colab has scikit-learn 1.0.2 pre-installed 
# and we need newer version (1.2.0 and higher)
# to use .set_output() method
!pip install scikit-learn --upgrade

# if you plan on running the whole notebook again during the same runtime
# you can comment the line above

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
Successfully installed scikit-learn-1.2.1


In [2]:
from sklearn import set_config

# Global scikit-learn option: ask transformers to output pandas objects
# (this is the feature that requires scikit-learn >= 1.2, see the install cell).
set_config(transform_output="pandas")

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

In [4]:
# Google Drive share link of the labelled housing CSV.
url = "https://drive.google.com/file/d/1vOPw3gdLX-sRlWOfIS0ZJ__fXuLJrt5Z/view?usp=share_link"

# The file id is the second-to-last path segment of the share link
# (".../d/<file id>/view?..."); the "uc?export=download" endpoint serves the
# raw file so pandas can read it directly.
file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + file_id

data = pd.read_csv(path)

In [7]:
# Target: the sale price. Features: everything except the target and the
# row identifier "Id". `data` itself is left unmodified.
y = data["SalePrice"]
X = data.drop(columns=["Id", "SalePrice"])

In [8]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Numeric branch: replace missing values with the column mean.
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time, and the encoder emits a dense array (sparse_output=False).
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
)

# Route columns to a branch by dtype.
numeric_columns = make_column_selector(dtype_include='number')
categoric_columns = make_column_selector(dtype_include='object')
preprocessor = make_column_transformer(
    (numeric_pipe, numeric_columns),
    (categoric_pipe, categoric_columns),
)

# Full model: preprocessing followed by a seeded random forest.
# make_pipeline names the last step "randomforestregressor"; the
# hyper-parameter grid refers to it by that name.
forest = RandomForestRegressor(random_state=123)
pipe = make_pipeline(preprocessor, forest)
pipe

In [13]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the forest step of `pipe`, addressed with the
# "<step name>__<parameter>" convention.
# 10 depths x 3 leaf sizes x 10 forest sizes = 300 combinations.
param_grid = dict(
    randomforestregressor__max_depth=range(15, 25),
    randomforestregressor__min_samples_leaf=[2, 4, 6],
    randomforestregressor__n_estimators=range(5, 15),
)

# Exhaustive 5-fold cross-validated search, scored by negated mean squared
# log error (scikit-learn maximises scores, hence the "neg_" variant).
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_log_error',
    cv=5,
    verbose=1,
)

In [14]:
# Run the cross-validated search on the training split only; X_test / y_test
# are not used here and stay available for the hold-out evaluation below.
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


In [15]:
# Predict sale prices for the held-out rows with the fitted search object.
test_predictions = grid_search.predict(X_test)

In [21]:
	
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

# Hold-out metrics, printed one per line in this order:
#   1. mean absolute error
#   2. mean squared error
#   3. root mean squared log error (RMSLE)
#   4. mean absolute percentage error
print(mean_absolute_error(y_test,test_predictions))
print(mean_squared_error(y_test,test_predictions))
# Take the square root explicitly instead of passing squared=False: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, and this
# notebook does not cap the installed scikit-learn version, so the old call
# raises TypeError on a fresh runtime. np.sqrt works on every version >= 1.2.
print(np.sqrt(mean_squared_log_error(y_test,test_predictions)))
print(mean_absolute_percentage_error(y_test,test_predictions))

17330.706922025845
764647499.4619577
0.13273098669218006
0.09842618848768661


In [22]:
# Refit the winning pipeline on all labelled rows (train + hold-out) before
# predicting the unlabelled data. `final_model` is the very object stored in
# grid_search.best_estimator_, so the refit happens in place and later calls
# through grid_search use the refitted model.
final_model = grid_search.best_estimator_
final_model.fit(X, y)

In [23]:
# Google Drive share link of the unlabelled CSV to predict on.
url = "https://drive.google.com/file/d/1Z4EAnUyTS3rLKq9ZW7OTCOlPh3fZQ5Mq/view?usp=share_link"

# Same link-to-download conversion as for the training data: the file id is
# the second-to-last path segment of the share link.
file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + file_id

data_new = pd.read_csv(path)

In [26]:
# Split the identifier column off the new data: it is needed for the results
# table but was not among the training features (X was built without "Id").
# NOTE: pop() removes the column from data_new in place, so this cell is not
# idempotent - running it a second time without reloading data_new raises
# KeyError because "Id" is already gone.
id_column = data_new.pop('Id')

In [27]:
# Predict sale prices for the new rows with the best pipeline found by the
# search (refit on all labelled data in the cell that calls .fit(X, y)).
predictions = grid_search.best_estimator_.predict(data_new)

In [28]:
# Two-column results table: the identifiers that were split off earlier next
# to the predicted sale price for each row.
results = id_column.to_frame(name='Id').assign(SalePrice=predictions)

In [29]:
# Display the results table (pandas truncates long frames in the rendered output).
results

Unnamed: 0,Id,SalePrice
0,1461,129248.461538
1,1462,154541.025641
2,1463,180289.923077
3,1464,179426.923077
4,1465,189330.879853
...,...,...
1454,2915,79791.065934
1455,2916,83566.402930
1456,2917,164592.617216
1457,2918,119552.655678


In [30]:
from google.colab import files

# Write the results table without the pandas index, then hand the file to the
# browser. files.download is Colab-specific and is only available there.
output_name = 'filename.csv'
results.to_csv(output_name, index=False)
files.download(output_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>