In [140]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

**1. Загрузка данных**

In [3]:
import kagglehub

path_calif = kagglehub.dataset_download("camnugent/california-housing-prices")
path_boston = kagglehub.dataset_download("vikrishnan/boston-house-prices")

df_calif = pd.read_csv(path_calif + '\housing.csv')
df_boston = pd.read_csv(path_boston + '\housing.csv', 
                        header = 0, 
                        names = ['CRIM', 'ZN', 'INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO', 'BK', 'LSTAT', 'MEDV'], 
                        delim_whitespace = True)

In [4]:
display(df_calif.shape, df_calif.info(), df_calif.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


(20640, 10)

None

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
display(df_boston.shape, df_boston.info(), df_boston.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     505 non-null    float64
 1   ZN       505 non-null    float64
 2   INDUS    505 non-null    float64
 3   CHAS     505 non-null    int64  
 4   NOX      505 non-null    float64
 5   RM       505 non-null    float64
 6   AGE      505 non-null    float64
 7   DIS      505 non-null    float64
 8   RAD      505 non-null    int64  
 9   TAX      505 non-null    float64
 10  PTRATIO  505 non-null    float64
 11  BK       505 non-null    float64
 12  LSTAT    505 non-null    float64
 13  MEDV     505 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.4 KB


(505, 14)

None

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BK,LSTAT,MEDV
0,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
1,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
2,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
3,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2
4,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222.0,18.7,394.12,5.21,28.7


In [6]:
# Заполение пропусков
df_calif['total_bedrooms'] = df_calif.groupby(by = 'ocean_proximity').transform(lambda x: x.fillna(x.median()))['total_bedrooms']

**2. Разделение выборок**

In [8]:
# Выделение целевой переменной и признаков
x_calif = df_calif.drop(columns = ['median_house_value'])
y_calif = df_calif['median_house_value']

x_boston = df_boston.drop(columns = ['MEDV'])
y_boston = df_boston['MEDV']

In [9]:
# Нормализация количественных признаков
x_calif_cont = x_calif.drop(columns = ['ocean_proximity'])
x_calif_cont = preprocessing.normalize(x_calif_cont)

x_boston = preprocessing.normalize(x_boston).astype(float)

In [10]:
# Кодирование признака ocean_proximity
ohe = preprocessing.OneHotEncoder()
x_calif_op = ohe.fit_transform(x_calif[['ocean_proximity']]).toarray()

x_calif = np.concatenate((x_calif_op, x_calif_cont), axis = 1).astype(float)

In [96]:
# Перевод в тензоры
x_calif = torch.tensor(x_calif, dtype = torch.float32)
y_calif = torch.tensor(y_calif, dtype = torch.float32)

x_boston = torch.tensor(x_boston, dtype = torch.float32)
y_boston = torch.tensor(y_boston, dtype = torch.float32)

  x_calif = torch.tensor(x_calif, dtype = torch.float32)
  y_calif = torch.tensor(y_calif, dtype = torch.float32)
  x_boston = torch.tensor(x_boston, dtype = torch.float32)
  y_boston = torch.tensor(y_boston, dtype = torch.float32)


In [98]:
x_calif.dtype, y_calif.dtype

(torch.float32, torch.float32)

In [100]:
# Обучающая и тестовая выборки
x_cte, x_ctr, y_cte, y_ctr = train_test_split(x_calif, y_calif, random_state = 42)
x_bte, x_btr, y_bte, y_btr = train_test_split(x_calif, y_calif, random_state = 42, test_size = 0.2)

**3. Нейрон**

In [102]:
# Датасет
dataset_c = TensorDataset(x_ctr, y_ctr)
dataset_b = TensorDataset(x_btr, y_btr)

# Дата-итератор
batch_size = 10
data_iter_c = DataLoader(dataset_c, batch_size, shuffle = True)
data_iter_b = DataLoader(dataset_b, batch_size, shuffle = True)

In [136]:
# Модель
model_c = torch.nn.Sequential(torch.nn.Linear(in_features = x_calif.shape[1], out_features = 1))
model_b = torch.nn.Sequential(torch.nn.Linear(in_features = x_boston.shape[1], out_features = 1))

#Ф-ция потерь
loss = torch.nn.MSELoss(reduction = 'mean')

#Оптимайзер
trainer_c = torch.optim.SGD(model_c.parameters(), lr = 0.001)
trainer_b = torch.optim.SGD(model_b.parameters(), lr = 0.001)

In [138]:
# Обучение. California
n_epochs = 100
for epoch in range(1, n_epochs + 1):
    for x, y in data_iter_c:
        trainer_c.zero_grad()
        l = loss(model_c(x).reshape(-1), y)
        l.backward()
        trainer_c.step()
    l = loss(model_c(x_ctr).reshape(-1), y_ctr)
   # if epoch % 10 == 0:
        #print('epoch %d, loss: %f' % (epoch, l.item()),'\nw', model_c[0].weight.data, '\nb', model_c[0].bias.data)
        #print()
        

In [142]:
# Обучение. Boston
n_epochs = 100
for epoch in tqdm(range(1, n_epochs + 1)):
    for x, y in data_iter_b:
        trainer_b.zero_grad()
        l = loss(model_b(x).reshape(-1), y)
        l.backward()
        trainer_b.step()
    l = loss(model_b(x_btr).reshape(-1), y_btr)
    #if epoch % 10 == 0:
        #print('epoch %d, loss: %f' % (epoch, l.item()),'\nw', model_b[0].weight.data, '\nb', model_b[0].bias.data)
        #print()

100%|██████████| 100/100 [00:18<00:00,  5.36it/s]


**4. Оценка качества**

In [172]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

pred_c = model_c(x_cte).detach().numpy()
pred_b = model_b(x_bte).detach().numpy()

RMSE_c = MSE(y_cte, pred_c, squared = False)
RMSE_b = MSE(y_bte, pred_b, squared = False)

r2_c = r2_score(y_cte, pred_c)
r2_b = r2_score(y_bte, pred_b)

In [174]:
print('California:', RMSE_c, r2_c)
print('Boston:', RMSE_b, r2_b)

California: 88465.97 0.4134646742384115
Boston: 88955.016 0.408054820354639
