<a href="https://colab.research.google.com/github/hst45/DSW-2023Spring-Housing-Price-Prediction/blob/main/dsw_final_proj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Set-up

In [None]:
# Install torcheval into the runtime.
# NOTE(review): the version is not pinned, so a fresh runtime may resolve a different release.
!pip install torcheval
# Fetch the RAPIDS Colab helper repository and run its pip-based installer script.
# NOTE(review): `git clone` fails if the directory already exists, so this cell is not re-runnable as-is.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torcheval
  Downloading torcheval-0.0.6-py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.4/158.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchtnt>=0.0.5 (from torcheval)
  Downloading torchtnt-0.1.0-py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.9/87.9 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyre-extensions (from torchtnt>=0.0.5->torcheval)
  Downloading pyre_extensions-0.0.30-py3-none-any.whl (12 kB)
Collecting typing-inspect (from pyre-extensions->torchtnt>=0.0.5->torcheval)
  Downloading typing_inspect-0.8.0-py3-none-any.whl (8.7 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect->pyre-extensions->torchtnt>=0.0.5->torcheval)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensions, typing

# Download Data from Kaggle

In [None]:
from google.colab import files

# Bind the return value instead of leaving the call as the cell's last
# expression: files.upload() returns a dict of {filename: file bytes}, and
# echoing it writes the contents of kaggle.json (the API key) into the saved
# notebook output.
uploaded_files = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [None]:
# !rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the Redfin housing-market archive with the kaggle CLI, then extract it quietly (-q).
# NOTE(review): the unzip path is absolute (/content), which matches the download location on Colab only.
!kaggle datasets download -d andradaolteanu/redfin-housing-market-prices
!unzip -q /content/redfin-housing-market-prices.zip

Downloading redfin-housing-market-prices.zip to /content
 98% 835M/856M [00:06<00:00, 155MB/s]
100% 856M/856M [00:06<00:00, 134MB/s]


# Import Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,  r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
import torch
from torch import nn
from torch.nn import Linear, ReLU, Dropout, MSELoss
import torch.nn.functional as F
from torch.optim import SGD
from torcheval.metrics.functional import r2_score
import cudf
import cuml
import cupy as cp
from cuml.metrics.regression import mean_squared_error, r2_score
from cuml.linear_model import LinearRegression as cuLR
from cuml.linear_model import Ridge as cuRG
from cuml.ensemble import RandomForestRegressor as cuRF
from cuml.decomposition import PCA as cuPCA
from cuml.cluster import DBSCAN as cuDBSCAN

In [None]:
# Load the weekly housing-market table; the file is tab-separated despite the .tsv000 suffix.
housing_data = pd.read_csv("./weekly_housing_market_data_most_recent.tsv000", sep='\t')
housing_data.head()

Unnamed: 0,region_id,region_type_id,region_name,region_type,period_begin,period_end,duration,total_homes_sold,total_homes_sold_yoy,average_homes_sold,...,average_adjustment_average_homes_sold,adjusted_average_homes_sold,average_adjustment_average_new_listings,adjusted_average_new_listings,average_adjustment_pending_sales,adjusted_pending_sales,adjusted_average_homes_delisted_yoy,adjusted_average_homes_sold_yoy,adjusted_average_new_listings_yoy,adjusted_pending_sales_yoy
0,1229,5,"Woodford County, KY",county,2018-05-28,2018-06-24,4 weeks,34.0,0.0,8.0,...,,8.0,,9.0,,6.0,-0.5,0.0,-0.25,-0.25
1,18500,-2,"Corning, NY metro area",metro,2020-07-27,2020-08-23,4 weeks,35.0,-0.078947,8.0,...,,8.0,,11.0,,4.0,0.0,-0.111111,-0.266667,0.0
2,2652,5,"Bee County, TX",county,2019-09-09,2019-10-06,4 weeks,,,,...,,,,,,,,,,
3,24420,-2,"Grants Pass, OR metro area",metro,2017-04-17,2017-05-14,4 weeks,106.0,-0.027523,26.0,...,,26.0,,31.0,,12.0,-0.5,-0.037037,0.0,0.090909
4,24420,-2,"Grants Pass, OR metro area",metro,2021-03-22,2021-04-18,4 weeks,91.0,0.123457,22.0,...,,22.0,,26.0,,22.0,0.0,0.1,0.368421,0.833333


In [None]:
housing_data.shape

(2520360, 102)

# Data Preprocessing

In [None]:
# Expand each period boundary into numeric year / month / day columns.
# Only the derived parts are stored, so the parsed datetime columns never
# need to be dropped from `date_temp` afterwards.
date_temp = pd.DataFrame()
for boundary in ('period_begin', 'period_end'):
    parsed_dates = pd.to_datetime(housing_data[boundary])
    for part in ('year', 'month', 'day'):
        date_temp['{}_{}'.format(boundary, part)] = getattr(parsed_dates.dt, part)

In [None]:
# Append the numeric date parts to the raw table and drop the original date
# string columns. pd.concat already returns a new frame, so no prior copy of
# `housing_data` is needed (the old copy was overwritten immediately).
processed_data = (
    pd.concat([housing_data, date_temp], axis=1)
    .drop(columns=['period_begin', 'period_end'])
)

In [None]:
# Drop every column in which more than one third of the rows are missing.
# The null counts are computed once and all offending columns are removed in
# a single drop, rather than rebuilding the frame once per dropped column.
missing_per_column = processed_data.isnull().sum()
sparse_columns = missing_per_column[missing_per_column > len(processed_data) / 3].index
processed_data = processed_data.drop(columns=sparse_columns)

In [None]:
# Keep only rows that have at least 37 non-null values (`thresh` counts non-NA cells per row).
# NOTE(review): 37 is a hard-coded cutoff; confirm it is still appropriate for the number of columns remaining.
processed_data = processed_data.dropna(thresh = 37)

In [None]:
# Earlier approach (kept for reference): keep numeric columns only.
# isNumeric_lst = dropped_data.select_dtypes(include=np.number).columns.tolist()
# dropped_data = dropped_data[isNumeric_lst]
# Current approach: drop the four named descriptive columns explicitly.
processed_data = processed_data.drop(labels=["region_name","region_type", "duration", "last_updated"],axis=1)
processed_data.head()

Unnamed: 0,region_id,region_type_id,total_homes_sold,total_homes_sold_yoy,average_homes_sold,average_homes_sold_yoy,total_homes_sold_with_price_drops,average_homes_sold_with_price_drops,percent_homes_sold_with_price_drops,percent_homes_sold_with_price_drops_yoy,...,adjusted_pending_sales,adjusted_average_homes_sold_yoy,adjusted_average_new_listings_yoy,adjusted_pending_sales_yoy,period_begin_year,period_begin_month,period_begin_day,period_end_year,period_end_month,period_end_day
0,1229,5,34.0,0.0,8.0,0.0,8.0,2.0,0.1625,0.037392,...,6.0,0.0,-0.25,-0.25,2018,5,28,2018,6,24
1,18500,-2,35.0,-0.078947,8.0,-0.111111,0.0,0.0,0.0,0.0,...,4.0,-0.111111,-0.266667,0.0,2020,7,27,2020,8,23
3,24420,-2,106.0,-0.027523,26.0,-0.037037,0.0,0.0,0.0,0.0,...,12.0,-0.037037,0.0,0.090909,2017,4,17,2017,5,14
4,24420,-2,91.0,0.123457,22.0,0.1,2.0,0.0,0.02607,0.02607,...,22.0,0.1,0.368421,0.833333,2021,3,22,2021,4,18
5,3008,5,29.0,0.380952,7.0,0.4,13.0,3.0,0.454167,-0.031944,...,7.0,0.4,-0.25,0.75,2020,1,20,2020,2,16


In [None]:
correlation = processed_data.corr()
correlation['median_sale_price'].sort_values(ascending=False)

median_sale_price                                          1.000000
average_sale_to_list_ratio                                 0.269089
average_percent_off_market_in_two_weeks_listing_updates    0.247314
percent_homes_sold_above_list                              0.228546
average_percent_off_market_in_one_week_listing_updates     0.210736
                                                             ...   
active_listings_yoy                                       -0.024460
inventory_yoy                                             -0.025213
months_of_supply                                          -0.060952
median_days_on_market                                     -0.084217
age_of_inventory                                          -0.188311
Name: median_sale_price, Length: 74, dtype: float64

In [None]:
processed_data.dropna(subset=['median_sale_price'],inplace = True)

In [None]:
# Rank every column by the absolute value of its covariance with the target, largest first.
covariates = processed_data.cov()
covariates_ranking=covariates['median_sale_price'].abs().sort_values(ascending=False)
# NOTE(review): [1:5] selects positions 1-4, i.e. four entries, although the name says "top5".
# Position 0 is skipped, presumably on the assumption that it is the target itself; covariance is
# scale-dependent, so that is not guaranteed. Confirm the intended slice.
cr_top5=covariates_ranking[1:5]

In [None]:
cr_least10 = covariates_ranking[-10:]
covariates_ranking[-20:]

median_new_listing_ppsf_yoy                                    4773.277294
pending_sales_yoy                                              4509.592724
adjusted_pending_sales_yoy                                     4505.076248
pending_sales_to_sales_ratio                                   4168.814242
period_begin_month                                             3268.121906
period_end_day                                                 2251.208496
average_sale_to_list_ratio                                     2067.998414
median_new_listing_price_yoy                                   1860.848750
percent_homes_sold_with_price_drops                            1704.543607
period_end_month                                               1667.813270
percent_homes_sold_above_list_yoy                              1625.575943
median_sale_price_yoy                                          1563.643760
median_sale_ppsf_yoy                                           1269.899365
average_percent_off_marke

In [None]:
# Remove the highest-covariance columns selected in `cr_top5`, never the target itself.
# A single non-inplace drop replaces the per-column in-place loop, which rebuilt
# the frame on every iteration.
top_covariate_columns = [name for name in cr_top5.keys() if name != 'median_sale_price']
processed_data = processed_data.drop(columns=top_covariate_columns)

In [None]:
# Remove the ten lowest-ranked columns selected in `cr_least10` with one
# non-inplace drop instead of one in-place drop per column.
processed_data = processed_data.drop(columns=list(cr_least10.keys()))

In [None]:
# Two-stage mean imputation.
#
# Stage 1: fill each missing value with the mean of the same column within the
# same region_id. `transform('mean')` returns a frame aligned to the original
# index, which avoids the slow per-group `apply` and its `group_keys` warning.
region_means = processed_data.groupby('region_id').transform('mean')
processed_data = processed_data.fillna(region_means)
del region_means  # release the full-size helper frame

# Stage 2: whatever is still missing (a region with no observed value for a
# column) falls back to that column's global mean. Passing the Series of
# column means fills each column with its OWN mean. The previous loop called
# `processed_data.fillna(value=<scalar>)`, which filled every remaining NaN in
# all columns with the mean of the first column on its first iteration.
processed_data = processed_data.fillna(processed_data.mean())

processed_data.head()

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  processed_data = processed_data.groupby('region_id').apply(lambda x:x.fillna(x.mean()))


Unnamed: 0,region_id,region_type_id,total_homes_sold,total_homes_sold_yoy,average_homes_sold,average_homes_sold_yoy,total_homes_sold_with_price_drops,average_homes_sold_with_price_drops,percent_homes_sold_with_price_drops,median_sale_price,...,adjusted_pending_sales,adjusted_average_homes_sold_yoy,adjusted_average_new_listings_yoy,adjusted_pending_sales_yoy,period_begin_year,period_begin_month,period_begin_day,period_end_year,period_end_month,period_end_day
0,1229,5,34.0,0.0,8.0,0.0,8.0,2.0,0.1625,223225.0,...,6.0,0.0,-0.25,-0.25,2018,5,28,2018,6,24
1,18500,-2,35.0,-0.078947,8.0,-0.111111,0.0,0.0,0.0,100468.1,...,4.0,-0.111111,-0.266667,0.0,2020,7,27,2020,8,23
3,24420,-2,106.0,-0.027523,26.0,-0.037037,0.0,0.0,0.0,260625.0,...,12.0,-0.037037,0.0,0.090909,2017,4,17,2017,5,14
4,24420,-2,91.0,0.123457,22.0,0.1,2.0,0.0,0.02607,371437.5,...,22.0,0.1,0.368421,0.833333,2021,3,22,2021,4,18
5,3008,5,29.0,0.380952,7.0,0.4,13.0,3.0,0.454167,234975.0,...,7.0,0.4,-0.25,0.75,2020,1,20,2020,2,16


In [None]:
# Shift every column so that its minimum becomes 1, then take the natural log.
# The shift keeps the logarithm defined for columns containing zero or negative values.
column_minima = processed_data.min()
processed_data = np.log(processed_data.sub(column_minima, axis='columns').add(1))

In [None]:
# kurtosis_values = processed_data.kurtosis().sort_values(ascending=False)
# high_kurtosis_values_index = kurtosis_values[kurtosis_values > 3].index
# processed_cudf = cudf.from_pandas(processed_data[high_kurtosis_values_index])

In [None]:
# processed_cudf.iloc[:, 0]

In [None]:
# dbscan = cuDBSCAN(eps=0.5, min_samples=5)
# labels = dbscan.fit_predict(processed_cudf.iloc[:, 0])

## Prepare Dataset

In [None]:
# Earlier variant (kept for reference):
# features = processed_data.copy().drop(['median_sale_price'], axis=1)
# targets = processed_data['median_sale_price']
# Separate the prediction target (median_sale_price) from the remaining feature columns.
features = processed_data.drop(labels='median_sale_price',axis=1)
targets = processed_data['median_sale_price']

In [None]:
# Standardise features and target to zero mean / unit variance; both become NumPy arrays.
# NOTE(review): both scalers are fitted on all rows here, before any train/validation split is made
# later in the notebook, so validation statistics leak into the scaling. Consider fitting on the
# training split only.
feature_scaler = StandardScaler()
features = feature_scaler.fit_transform(features)
target_scaler = StandardScaler()
# StandardScaler expects 2-D input, hence the reshape of the target to a single column.
targets = target_scaler.fit_transform(targets.to_numpy().reshape(-1, 1))
features.shape, targets.shape

((2156405, 59), (2156405, 1))

# Data Preprocessing (Simplified)

In [None]:
# Simplified alternative: keep only the rows of the raw table that have no missing value at all.
# dropna() already returns a new frame, so no explicit copy of `housing_data` is needed.
# NOTE: this rebinds `processed_data`, replacing any frame built by an earlier preprocessing cell.
processed_data = housing_data.dropna()
processed_data.head()

Unnamed: 0,region_id,region_type_id,region_name,region_type,period_begin,period_end,duration,total_homes_sold,total_homes_sold_yoy,average_homes_sold,...,average_adjustment_average_homes_sold,adjusted_average_homes_sold,average_adjustment_average_new_listings,adjusted_average_new_listings,average_adjustment_pending_sales,adjusted_pending_sales,adjusted_average_homes_delisted_yoy,adjusted_average_homes_sold_yoy,adjusted_average_new_listings_yoy,adjusted_pending_sales_yoy
2211,42340,-2,"Savannah, GA metro area",metro,2022-06-13,2022-07-10,4 weeks,835.0,-0.091404,208.0,...,1.011774,210.448942,1.018813,213.950762,1.222194,197.995416,-0.428916,-0.081009,-0.00023,0.031226
6277,31140,-2,"Louisville, KY metro area",metro,2022-05-30,2022-06-26,4 weeks,1611.0,-0.098489,402.0,...,1.001802,402.724409,1.002245,521.167386,1.001614,380.613154,0.921744,-0.09703,-0.027673,-0.200393
13906,38060,-2,"Phoenix, AZ metro area",metro,2022-06-06,2022-07-03,4 weeks,6846.0,-0.201726,1711.0,...,1.000037,1711.063988,1.027927,2580.096035,0.993822,1458.930448,1.134003,-0.201929,0.139618,-0.24172
15458,10420,-2,"Akron, OH metro area",metro,2022-06-13,2022-07-10,4 weeks,888.0,0.00339,222.0,...,1.011533,224.560398,1.02532,286.064325,1.040836,224.820648,-0.25453,0.01611,-0.040053,0.060475
15516,24660,-2,"Greensboro, NC metro area",metro,2022-05-30,2022-06-26,4 weeks,915.0,-0.051813,228.0,...,1.002256,228.514391,1.019356,197.755119,1.05344,184.352,0.966902,-0.051808,-0.338612,-0.250602


In [None]:
isNumeric_lst = processed_data.select_dtypes(include=np.number).columns.tolist()
processed_data = processed_data[isNumeric_lst]
processed_data.head()

Unnamed: 0,region_id,region_type_id,total_homes_sold,total_homes_sold_yoy,average_homes_sold,average_homes_sold_yoy,total_homes_sold_with_price_drops,total_homes_sold_with_price_drops_yoy,average_homes_sold_with_price_drops,average_homes_sold_with_price_drops_yoy,...,average_adjustment_average_homes_sold,adjusted_average_homes_sold,average_adjustment_average_new_listings,adjusted_average_new_listings,average_adjustment_pending_sales,adjusted_pending_sales,adjusted_average_homes_delisted_yoy,adjusted_average_homes_sold_yoy,adjusted_average_new_listings_yoy,adjusted_pending_sales_yoy
2211,42340,-2,835.0,-0.091404,208.0,-0.091703,84.0,-0.045455,21.0,-0.045455,...,1.011774,210.448942,1.018813,213.950762,1.222194,197.995416,-0.428916,-0.081009,-0.00023,0.031226
6277,31140,-2,1611.0,-0.098489,402.0,-0.098655,204.0,0.186047,51.0,0.186047,...,1.001802,402.724409,1.002245,521.167386,1.001614,380.613154,0.921744,-0.09703,-0.027673,-0.200393
13906,38060,-2,6846.0,-0.201726,1711.0,-0.201959,1470.0,0.666667,367.0,0.668182,...,1.000037,1711.063988,1.027927,2580.096035,0.993822,1458.930448,1.134003,-0.201929,0.139618,-0.24172
15458,10420,-2,888.0,0.00339,222.0,0.004525,95.0,-0.103774,23.0,-0.115385,...,1.011533,224.560398,1.02532,286.064325,1.040836,224.820648,-0.25453,0.01611,-0.040053,0.060475
15516,24660,-2,915.0,-0.051813,228.0,-0.053942,85.0,-0.123711,21.0,-0.125,...,1.002256,228.514391,1.019356,197.755119,1.05344,184.352,0.966902,-0.051808,-0.338612,-0.250602


In [None]:
features = processed_data.copy().drop(['median_sale_price'], axis=1)
targets = processed_data['median_sale_price']

In [None]:
feature_scaler = StandardScaler()
features = feature_scaler.fit_transform(features)
target_scaler = StandardScaler()
targets = target_scaler.fit_transform(targets.to_numpy().reshape(-1, 1))
features.shape, targets.shape

((572, 95), (572, 1))

In [None]:
train_X, validation_X, train_Y, validation_Y = train_test_split(features, targets, test_size=0.3,random_state=42)
train_X.shape, train_Y.shape,  validation_X.shape, validation_Y.shape

((400, 95), (400, 1), (172, 95), (172, 1))

# PyTorch Implementation

In [None]:
# 70/30 train/validation split with a fixed seed.
# NOTE(review): this repeats the split from the preceding cell with identical arguments; one of the two is redundant.
train_X, validation_X, train_Y, validation_Y = train_test_split(features, targets, test_size=0.3,random_state=42)
train_X.shape, train_Y.shape,  validation_X.shape, validation_Y.shape

## Linear Regression

In [None]:
# Notebook-level training settings read by `train`: SGD learning rate and number of epochs.
lr = 0.01
epochs = 100
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Convert the four split arrays to float32 tensors on the selected device,
# in the order: train features, train targets, validation features, validation targets.
X_train, Y_train, X_val, Y_val = (
    torch.tensor(split_array, dtype=torch.float32).to(device)
    for split_array in (train_X, train_Y, validation_X, validation_Y)
)

In [None]:
def train(X, y, model, learning_rate=None, num_epochs=None):
  """Fit `model` to (X, y) with full-batch SGD on an MSE loss.

  Every 10th epoch the MSE loss and the R² of that epoch's predictions are
  printed. R² is computed inline as 1 - SS_res / SS_tot (averaged over output
  columns) so the log does not depend on which `r2_score` import is currently
  bound, nor on that function's argument order.

  Args:
    X: input tensor, passed directly to `model`.
    y: target tensor, compared against `model(X)`.
    model: an `nn.Module`; its parameters are updated in place.
    learning_rate: SGD step size. Defaults to the notebook-level `lr`.
    num_epochs: number of full-batch updates. Defaults to the notebook-level
      `epochs`.

  Returns:
    The predictions of the last forward pass (computed before the final
    optimizer step and still attached to the autograd graph), or None when
    `num_epochs` is 0.
  """
  if learning_rate is None:
    learning_rate = lr
  if num_epochs is None:
    num_epochs = epochs

  criterion = MSELoss()
  optimizer = SGD(model.parameters(), lr=learning_rate)

  y_pred = None
  for epoch in range(num_epochs):
    optimizer.zero_grad()
    y_pred = model(X)
    loss = criterion(y_pred, y)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
      # Metric only: keep it out of the autograd graph.
      with torch.no_grad():
        ss_res = ((y - y_pred) ** 2).sum(dim=0)
        ss_tot = ((y - y.mean(dim=0)) ** 2).sum(dim=0)
        r2 = (1.0 - ss_res / ss_tot).mean().item()
      print('epoch {}, mse loss {}, r2 {}'.format(epoch, loss.item(), r2))

  return y_pred

In [None]:
class LinearRegression(nn.Module):
  """Linear regression expressed as a single affine `nn.Linear` layer.

  Note: defining this class rebinds the name `LinearRegression`, which the
  imports cell also binds to the scikit-learn estimator.
  """

  def __init__(self, input_dim, output_dim):
    """Create the layer mapping `input_dim` features to `output_dim` outputs."""
    super().__init__()
    self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

  def forward(self, x):
    """Return the affine projection of `x`."""
    prediction = self.linear(x)
    return prediction

In [None]:
# Size the model from the tensors: one input per feature column, one output per target column.
linear_reg = LinearRegression(X_train.shape[1], Y_train.shape[1]).to(device)
# `train` returns the predictions of its last forward pass on the training data.
lr_pred = train(X_train, Y_train, linear_reg)

epoch 0, mse loss 1.2466484308242798, r2 -2.826551914215088
epoch 10, mse loss 1.0189741849899292, r2 -11.42470645904541
epoch 20, mse loss 0.9867429733276367, r2 -13.950567245483398
epoch 30, mse loss 0.9684101343154907, r2 -14.871831893920898
epoch 40, mse loss 0.9565632343292236, r2 -15.08073616027832
epoch 50, mse loss 0.9485692381858826, r2 -14.910381317138672
epoch 60, mse loss 0.9430433511734009, r2 -14.566278457641602
epoch 70, mse loss 0.9391549825668335, r2 -14.168608665466309
epoch 80, mse loss 0.9363759160041809, r2 -13.779415130615234
epoch 90, mse loss 0.9343602657318115, r2 -13.426374435424805


## MLP

In [None]:
class MLP(nn.Module):
  """Multi-layer perceptron with two equal-width hidden layers.

  Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear.
  A single Dropout module is shared by both dropout positions.

  Args:
    input_dim: number of input features.
    output_dim: number of outputs.
    hidden_dim: width of both hidden layers. Defaults to 100, the value that
      was previously hard-coded.
    dropout_p: dropout probability. Defaults to 0.5, the value that was
      previously hard-coded.
  """

  def __init__(self, input_dim, output_dim, hidden_dim=100, dropout_p=0.5):
    super(MLP, self).__init__()
    self.linear_1 = Linear(input_dim, hidden_dim)
    self.linear_2 = Linear(hidden_dim, hidden_dim)
    self.linear_3 = Linear(hidden_dim, output_dim)
    self.dropout = Dropout(p=dropout_p)

  def forward(self, x):
    """Return the network output for `x`; dropout is active only in training mode."""
    x = F.relu(self.linear_1(x))
    x = F.relu(self.linear_2(self.dropout(x)))
    return self.linear_3(self.dropout(x))

In [None]:
# Build the MLP with input/output widths taken from the training tensors and fit it.
mlp = MLP(X_train.shape[1], Y_train.shape[1]).to(device)
# NOTE(review): `train` never switches the model to eval mode, so dropout is active for the logged metrics.
train(X_train, Y_train, mlp)

epoch 0, mse loss 1.0199886560440063, r2 -34.76950454711914
epoch 10, mse loss 1.0144093036651611, r2 -37.31852340698242
epoch 20, mse loss 1.0113297700881958, r2 -37.815826416015625
epoch 30, mse loss 1.0085357427597046, r2 -39.22800064086914
epoch 40, mse loss 1.005237340927124, r2 -41.09366226196289
epoch 50, mse loss 1.0037004947662354, r2 -40.77359390258789
epoch 60, mse loss 1.0016684532165527, r2 -42.2321891784668
epoch 70, mse loss 0.999138355255127, r2 -42.6024055480957
epoch 80, mse loss 0.9980297088623047, r2 -42.71720886230469
epoch 90, mse loss 0.9952320456504822, r2 -45.68025588989258


### With PCA

In [None]:
# Sweep over PCA dimensionalities: project the training features, then fit a fresh linear model on the projection.
# NOTE(review): this loop trains LinearRegression, not the MLP, although it sits under the MLP heading.
n_components_lst = [10, 20, 30, 40, 50]
for n in n_components_lst:
    pca = PCA(n_components=n)
    # PCA is fitted on the NumPy training split; the result is moved to the training device.
    trans_X_train = torch.tensor(
        pca.fit_transform(train_X),
        dtype=torch.float32).to(device)

    # Input width equals the number of retained components.
    linear_reg = LinearRegression(trans_X_train.shape[1], Y_train.shape[1]).to(device)
    print("n_components = {}:\n".format(n))
    train(trans_X_train, Y_train, linear_reg)

n_components = 10:

epoch 0, mse loss 2.404616117477417, r2 -1.0803132057189941
epoch 10, mse loss 1.180554986000061, r2 -9.480432510375977
epoch 20, mse loss 1.067784309387207, r2 -18.46712875366211
epoch 30, mse loss 1.0192118883132935, r2 -26.49082374572754
epoch 40, mse loss 0.9916143417358398, r2 -31.266986846923828
epoch 50, mse loss 0.9749576449394226, r2 -31.969268798828125
epoch 60, mse loss 0.964658796787262, r2 -30.3239803314209
epoch 70, mse loss 0.9582075476646423, r2 -28.03130340576172
epoch 80, mse loss 0.9541332721710205, r2 -25.88436508178711
epoch 90, mse loss 0.9515454173088074, r2 -24.107995986938477
n_components = 20:

epoch 0, mse loss 1.6667400598526, r2 -0.8776556253433228
epoch 10, mse loss 1.1063032150268555, r2 -3.7326316833496094
epoch 20, mse loss 1.0334886312484741, r2 -5.512673854827881
epoch 30, mse loss 0.9973807334899902, r2 -6.942053318023682
epoch 40, mse loss 0.9766792058944702, r2 -8.080382347106934
epoch 50, mse loss 0.9636510610580444, r2 -8.9690

## Performance

# CuML Implementation

In [None]:
# Copy the scaled feature and target arrays into CuPy arrays for cuML.
# NOTE(review): `features` / `targets` are rebound by the "Simplified" preprocessing section, so this cell
# uses whichever preprocessing cell ran last. The recorded shapes below (59 feature columns) match the
# full preprocessing, not the simplified one (95 columns) — confirm the intended execution order.
cu_X = cp.array(features)
cu_y = cp.array(targets)

In [None]:
# 70/30 split with cuML's train_test_split and a fixed seed.
# NOTE: this rebinds train_X / validation_X / train_Y / validation_Y, replacing the splits
# created for the PyTorch section under the same names.
train_X, validation_X, train_Y, validation_Y = cuml.model_selection.train_test_split(cu_X, cu_y, test_size=0.3,random_state=42)
train_X.shape, train_Y.shape,  validation_X.shape, validation_Y.shape

((1509484, 59), (1509484, 1), (646921, 59), (646921, 1))

In [None]:
# Notebook-level result logs; each entry is [str(model), averaged metric].
mse_train_lst = []
r2_train_lst = []
mse_val_lst = []
r2_val_lst = []
# Number of fit/evaluate repetitions averaged by `performance`.
eval_num = 10


def performance(model, train_X, train_Y, validation_X, validation_Y):
    """Fit `model` `eval_num` times and report the averaged MSE and R².

    Each repetition refits the model on the training split, then scores it on
    both the training and the validation split with cuML's regression metrics.
    The per-split means are printed and appended to the notebook-level result
    lists together with `str(model)`.

    Returns:
        The fitted model's predictions on `train_X`.
    """
    metrics = cuml.metrics.regression
    train_mse_runs, train_r2_runs = [], []
    val_mse_runs, val_r2_runs = [], []

    for _ in range(eval_num):
        model.fit(train_X, train_Y)

        fitted_train = model.predict(train_X)
        train_mse_runs.append(metrics.mean_squared_error(train_Y, fitted_train))
        train_r2_runs.append(metrics.r2_score(train_Y, fitted_train))

        fitted_val = model.predict(validation_X)
        val_mse_runs.append(metrics.mean_squared_error(validation_Y, fitted_val))
        val_r2_runs.append(metrics.r2_score(validation_Y, fitted_val))

    def _host_mean(values):
        # Average on the device, then copy the scalar back to host memory.
        return cp.asnumpy(cp.mean(cp.asarray(values)))

    mse_train = _host_mean(train_mse_runs)
    r2_train = _host_mean(train_r2_runs)
    mse_val = _host_mean(val_mse_runs)
    r2_val = _host_mean(val_r2_runs)

    print("Train set:\nthe mse is {},\nthe r2 score is {}\n".format(mse_train, r2_train))
    mse_train_lst.append([str(model), mse_train])
    r2_train_lst.append([str(model), r2_train])

    print("Validation set:\nthe mse is {},\nthe r2 score is {},\n".format(mse_val, r2_val))
    mse_val_lst.append([str(model), mse_val])
    r2_val_lst.append([str(model), r2_val])

    return model.predict(train_X)

In [None]:
# Baseline: cuML linear regression with default settings.
cu_lr = cuLR()
lr_res = performance(cu_lr, train_X, train_Y, validation_X, validation_Y)

Train set:
the mse is 0.9140798907694124,
the r2 score is 0.09139677745950459

Validation set:
the mse is 0.8951013293286971,
the r2 score is 0.09212993049203644,



In [None]:
# cuML ridge regression with a very small regularisation strength.
# NOTE(review): alpha is passed as a one-element CuPy array rather than a plain float — confirm this is intended.
cu_rg = cuRG(alpha=cp.array([1e-5]))
rg_res = performance(cu_rg, train_X, train_Y, validation_X, validation_Y)

Train set:
the mse is 0.9141186078624843,
the r2 score is 0.09135829233814086

Validation set:
the mse is 0.8951668842880217,
the r2 score is 0.0920634403824494,



In [None]:
# A random forest with a single estimator, used as a stand-in for one decision tree.
# NOTE(review): no random_state is set here (unlike the 25-tree forest below), so results may vary between runs.
one_tree_cu_rf = cuRF(n_estimators=1)
dt_res = performance(one_tree_cu_rf, train_X, train_Y, validation_X, validation_Y)

  ret = func(*args, **kwargs)


Train set:
the mse is 0.12175291178482124,
the r2 score is 0.8789765652668935

Validation set:
the mse is 0.24263198880461653,
the r2 score is 0.7539068334239815,



In [None]:
# Seeded cuML random forest: 25 trees, each limited to depth 10.
cu_rf = cuRF(max_depth=10, n_estimators=25, random_state=42)
rf_res = performance(cu_rf, train_X, train_Y, validation_X, validation_Y)

  return func(**kwargs)
  ret = func(*args, **kwargs)


Train set:
the mse is 0.10321049870144625,
the r2 score is 0.8967800502912485

Validation set:
the mse is 0.10742240387943576,
the r2 score is 0.8926005350349557,



In [None]:
# Sweep over PCA dimensionalities and evaluate a single-tree forest on each projection.
n_components_lst = [20, 22, 25, 27, 30]
for n in n_components_lst:
    cu_pca = cuPCA(n_components=n)
    # Fit the projection on the training split only, then apply it unchanged to the validation split.
    trans_train_X = cu_pca.fit_transform(train_X)
    trans_val_X = cu_pca.transform(validation_X)
    
    # A fresh, unseeded single-estimator forest for every setting.
    one_tree_cu_rf = cuRF(n_estimators=1)
    print("n_components = {}:".format(n))
    performance(one_tree_cu_rf, trans_train_X, train_Y, trans_val_X, validation_Y)

n_components = 20:
Train set:
the mse is 0.16772258107581658,
the r2 score is 0.8332823211655802

Validation set:
the mse is 0.26537153481358844,
the r2 score is 0.730842904749864,

n_components = 22:
Train set:
the mse is 0.17192748006525482,
the r2 score is 0.8291026156378226

Validation set:
the mse is 0.2577492097873706,
the r2 score is 0.7385739632620378,

n_components = 25:
Train set:
the mse is 0.17230737572271498,
the r2 score is 0.828724996107973

Validation set:
the mse is 0.26418422327195323,
the r2 score is 0.7320471534494389,

n_components = 27:
Train set:
the mse is 0.1692116163110628,
the r2 score is 0.8318022074174458

Validation set:
the mse is 0.27572527092355353,
the r2 score is 0.7203414712095174,

n_components = 30:
Train set:
the mse is 0.17884501380281265,
the r2 score is 0.8222265279900712

Validation set:
the mse is 0.2878345348148913,
the r2 score is 0.7080594670492019,

