In [27]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from torch.utils.data import TensorDataset, DataLoader
from torch import nn, optim
import torch as th

In [28]:
# Pin every random number generator the notebook relies on (NumPy, PyTorch CPU
# and, when CUDA is present, every GPU) to a single seed so runs are repeatable.
seed = 42
for _seed_rng in (np.random.seed, th.manual_seed):
    _seed_rng(seed)
if th.cuda.is_available():
    th.cuda.manual_seed_all(seed)

In [None]:
# Load the training and test splits; the paths are relative to the notebook's
# working directory.
data_train = pd.read_csv('./rent-prediction/train.csv')
data_test = pd.read_csv('./rent-prediction/test.csv')
# Preview the first three training rows.
data_train.head(3)

In [None]:
# Show the first five training rows (the default for head()).
data_train.head()

In [None]:
# Number of missing values in each training column.
data_train.isna().sum()

In [None]:
# (n_rows, n_columns) of the training frame.
data_train.shape

In [None]:
# Print each column's dtype and non-null count plus the frame's memory usage.
data_train.info()

In [None]:
# Minimum of every numeric training column - a quick way to spot implausibly
# small values.
data_train.select_dtypes(np.number).min()

In [35]:
# Room counts below 1 are treated as missing: the value is replaced by NaN.
# Each frame is filtered with its OWN boolean mask.  A mask built from
# `data_train` is aligned on index labels, so applying it to `data_test` would
# either raise (unalignable indexer) or blank out unrelated test rows.
data_train.loc[data_train['rooms_count'] < 1, 'rooms_count'] = np.nan
data_test.loc[data_test['rooms_count'] < 1, 'rooms_count'] = np.nan

In [None]:
# Flag rows that repeat an earlier row in every column except `ID`
# (duplicated() marks the second and later occurrences, not the first).
dup_ix = data_train.drop(columns=['ID']).duplicated()
# Display the flagged duplicate rows.
data_train[dup_ix]

In [37]:
# Keep only the rows that were not flagged as duplicates.  The explicit copy
# makes `data_train` an independent frame, so the in-place edits and column
# assignments made on it afterwards do not act on a slice of the original
# (which would trigger SettingWithCopyWarning).
data_train = data_train[~dup_ix].copy()

In [None]:
# Visualise where values are missing: the heatmap is drawn from the boolean
# isna() mask, so missing and present cells get contrasting colours.
sns.heatmap(data_train.isna(), cbar=False, cmap='viridis')
plt.show()
# Missing-value count per column, largest first.
data_train.isna().sum().sort_values(ascending=False)

In [None]:
# Number of distinct values per column (NaN is not counted by default).
data_train.nunique()

In [40]:
# Columns that hold exactly one distinct value in the training data are removed
# from both frames, in place.
_single_valued = data_train.nunique() == 1
drop = data_train.columns[_single_valued]
for _frame in (data_train, data_test):
    _frame.drop(columns=drop, inplace=True)

In [None]:
# Distinct-value counts again, now that the constant columns have been dropped.
data_train.nunique()

In [None]:
# Distinct-value counts restricted to the object-dtype columns.
data_train.select_dtypes(object).nunique()

In [None]:
# Print the distinct values of every column with at most 200 of them, with a
# dashed rule between consecutive columns.
_rule = '\n'*2+'-'*75+'\n'*2
_low_cardinality = data_train.columns[data_train.nunique() <= 200]
for column in _low_cardinality:
    _values = data_train[column].unique()
    print(column, _values, end=_rule)

In [44]:
# Impute missing values in every column the two frames share (iterating the
# test columns leaves the training-only target untouched).
#   * object columns  -> the placeholder category 'unknown11'
#   * numeric columns -> the TRAINING mean, for both frames, so train and test
#     are imputed with the same statistic and an all-NaN test column still
#     gets a finite value.
# Results are assigned back to the frame: `frame[col].fillna(..., inplace=True)`
# is chained assignment, which pandas deprecates and which does not modify the
# frame once Copy-on-Write is active.
for column in data_test.columns:
    col = data_train[column]
    if pd.api.types.is_object_dtype(col):
        data_train[column] = col.fillna('unknown11')
        data_test[column] = data_test[column].fillna('unknown11')
    elif pd.api.types.is_numeric_dtype(col):
        train_mean = col.mean()
        data_train[column] = col.fillna(train_mean)
        data_test[column] = data_test[column].fillna(train_mean)

In [None]:
# Composite "<location>_<district>" key, added to both frames.
for _frame in (data_train, data_test):
    _frame['city_district'] = _frame['location'].astype(str) + '_' + _frame['district'].astype(str)

# Mean training price over all rows; used as the prior for the smoothing below.
global_avg_price_train = data_train['price_per_month'].mean()

# Weight of the global mean, expressed as a number of pseudo-observations.
smoothing = 10


def _smoothed_price_stats(group_key):
    """Group the training prices by ``group_key`` and add a smoothed mean.

    Returns a frame indexed by group with the columns ``mean``, ``count`` and
    ``smoothed_avg``, where
    ``smoothed_avg = (mean * count + global_mean * smoothing) / (count + smoothing)``.
    """
    stats = data_train.groupby(group_key)['price_per_month'].agg(['mean', 'count'])
    weighted_sum = stats['mean'] * stats['count'] + global_avg_price_train * smoothing
    stats['smoothed_avg'] = weighted_sum / (stats['count'] + smoothing)
    return stats


city_district_stats = _smoothed_price_stats('city_district')
avg_price_city_district_train = city_district_stats['smoothed_avg']

city_stats = _smoothed_price_stats('location')
avg_price_city_train = city_stats['smoothed_avg']

def calculate_avg_price(city, district, avg_price_city_district, avg_price_city, global_avg_price):
    """Pick the most specific average price available for one listing.

    Args:
        city: Location name, or 'unknown11' when it was missing.
        district: District name, or 'unknown11' when it was missing.
        avg_price_city_district: Mapping with ``.get`` from
            "<city>_<district>" to an average price.
        avg_price_city: Mapping with ``.get`` from city to an average price.
        global_avg_price: Fallback used when no more specific value exists.

    Returns:
        The city-level average when the district is unknown, the global
        average when only the city is unknown, otherwise the city+district
        average; each lookup falls back to ``global_avg_price`` on a miss.
    """
    placeholder = 'unknown11'
    district_key = '_'.join((city, district))

    # The district check comes first, so an unknown district always resolves
    # at city level, even when the city is the placeholder too.
    if district == placeholder:
        return avg_price_city.get(city, global_avg_price)
    if city == placeholder:
        return global_avg_price
    return avg_price_city_district.get(district_key, global_avg_price)

def _row_avg_price(row):
    """Look up the smoothed training-set average price for one listing row."""
    return calculate_avg_price(
        row['location'], row['district'],
        avg_price_city_district_train, avg_price_city_train, global_avg_price_train
    )


_frames = (data_train, data_test)

# Both frames are encoded with the tables derived from the training prices.
# NOTE(review): those tables were computed from every training row, so each
# training row's feature includes its own target value - consider an
# out-of-fold encoding if validation scores look optimistic.
for _frame in _frames:
    _frame['avg_price_city_district'] = _frame.apply(_row_avg_price, axis=1)

# Any value that is still missing falls back to the global training mean.
for _frame in _frames:
    _frame['avg_price_city_district'] = _frame['avg_price_city_district'].fillna(global_avg_price_train)

# The composite key is dropped again, so it is not passed on as a feature.
for _frame in _frames:
    _frame.drop(['city_district'], inplace=True, axis=1)

for _frame in _frames:
    print(_frame.head())

In [46]:
# Name of the column the models are trained to predict.
target = 'price_per_month'

In [47]:
# Features are every training column except the row identifier and the target;
# the test frame only needs the identifier removed.
_non_feature_columns = ['ID', target]
X = data_train.drop(columns=_non_feature_columns)
y = data_train[target]
X_test_final = data_test.drop(columns=['ID'])

In [48]:
# Split the feature columns by dtype: numeric columns are standardised,
# object columns are one-hot encoded.
numeric_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include=object).columns

numeric_transformer = StandardScaler()
# handle_unknown='ignore' stops transform() from raising on categories that
# were not present when the encoder was fitted.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

_transformer_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=_transformer_steps)

In [49]:
# Fit the scaler and encoder on the training features only ...
X_train_transformed = preprocessor.fit_transform(X)

# ... and reuse the fitted transformers, without refitting, on the test features.
X_test_final_transformed = preprocessor.transform(X_test_final)

In [None]:
# (n_samples, n_features) after scaling and one-hot encoding.
X_train_transformed.shape

In [None]:
from sklearn.metrics import r2_score

# Re-seed at the top of the training cell so that re-running only this cell
# reproduces the same bootstrap samples and weight initialisation.
seed = 42
np.random.seed(seed)
th.manual_seed(seed)
if th.cuda.is_available():
    th.cuda.manual_seed_all(seed)

device = th.device('cuda' if th.cuda.is_available() else 'cpu')
print(f'Используется устройство: {device}')

# Number of independently trained networks in the bagging ensemble.
n_bagging_models = 6

def create_bootstrap_indices(n_samples):
    """Return ``n_samples`` row positions drawn from ``range(n_samples)`` with replacement."""
    return np.random.choice(n_samples, size=n_samples, replace=True)

trained_models = []

num_epochs = 2000  # upper bound on epochs per model
patience = 40      # epochs without an OOB R2 improvement before stopping early

# Dense tensors for the complete training set.  The loop below does not read
# this dataset: every model is trained on its own bootstrap sample.
dataset_train = TensorDataset(
    th.tensor(X_train_transformed.toarray()).float(),
    th.tensor(y.values).float()
)

n_train = X_train_transformed.shape[0]

for i in range(n_bagging_models):
    print(f'\nОбучение модели {i + 1} из {n_bagging_models}')

    # Rows drawn into the bootstrap sample train this model; rows that were
    # never drawn (out-of-bag, OOB) are its validation set.
    bootstrap_indices = create_bootstrap_indices(n_train)
    oob_indices = np.setdiff1d(np.arange(n_train), bootstrap_indices)

    X_train_bag = X_train_transformed[bootstrap_indices]
    y_train_bag = y.values[bootstrap_indices]

    if len(oob_indices) == 0:
        print("Все данные включены в бутстрэп-подвыборку. Пропуск модели.")
        continue

    X_val_oob = X_train_transformed[oob_indices]
    y_val_oob = y.values[oob_indices]

    dataset_train_bag = TensorDataset(
        th.tensor(X_train_bag.toarray()).float(),
        th.tensor(y_train_bag).float()
    )
    dataloader_train_bag = DataLoader(dataset_train_bag, batch_size=8, shuffle=True)

    dataset_val_oob = TensorDataset(
        th.tensor(X_val_oob.toarray()).float(),
        th.tensor(y_val_oob).float()
    )
    dataloader_val_oob = DataLoader(dataset_val_oob, batch_size=8, shuffle=False)

    # A fresh network per ensemble member: five hidden ReLU layers and a single
    # linear output unit for the regression target.
    model = nn.Sequential(
        nn.Linear(X_train_transformed.shape[1], 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.Linear(128, 64),
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, 1),
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    criterion = nn.MSELoss()

    # The deprecated `verbose` flag is not passed; learning-rate reductions are
    # logged explicitly after each scheduler step instead.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)

    best_val_loss = np.inf
    best_r2 = -np.inf
    epochs_no_improve = 0
    best_model_state = None
    best_epoch = 0

    for epoch in range(1, num_epochs + 1):

        # ---- training pass over the bootstrap sample ----
        model.train()
        train_losses = []
        for X_batch, y_batch in dataloader_train_bag:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            y_pred = model(X_batch).squeeze(dim=1)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)

        # ---- validation pass over the out-of-bag rows ----
        model.eval()
        val_losses = []
        all_preds = []
        all_targets = []

        with th.no_grad():
            for X_batch, y_batch in dataloader_val_oob:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)

                y_pred = model(X_batch).squeeze(dim=1)
                loss = criterion(y_pred, y_batch)
                val_losses.append(loss.item())

                all_preds.append(y_pred.cpu().numpy())
                all_targets.append(y_batch.cpu().numpy())

        avg_val_loss = np.mean(val_losses)
        all_preds = np.concatenate(all_preds)
        all_targets = np.concatenate(all_targets)
        r2 = r2_score(all_targets, all_preds)

        # The scheduler follows the OOB loss; early stopping follows the OOB R2.
        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(avg_val_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print(f'Epoch {epoch:03d} | learning rate reduced to {lr_after:.4e}')

        if r2 > best_r2:
            best_val_loss = avg_val_loss
            best_r2 = r2
            epochs_no_improve = 0
            # state_dict() hands back references to the live parameter tensors,
            # which the optimizer keeps updating in place.  Clone them so the
            # snapshot really holds the weights of this (best) epoch.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            best_epoch = epoch
        else:
            epochs_no_improve += 1

        if epoch % 10 == 0 or epoch == 1:
            print(f'Epoch {epoch:03d} | Train Loss: {avg_train_loss:.4f} | Val Loss (OOB): {avg_val_loss:.4f} | R2 (OOB): {r2:.4f}')

        if epochs_no_improve >= patience:
            print(f'\nРанняя остановка после {epoch} эпох без улучшений для модели {i + 1}.')
            break

    # Roll the network back to the epoch with the best OOB R2.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f'Лучшая модель {i + 1} загружена. Epoch: {best_epoch} | Val Loss (OOB): {best_val_loss:.4f} | R2 (OOB): {best_r2:.4f}')

    trained_models.append(model)

def ensemble_predict_final(models, X_test_final, device, batch_size=8):
    """Average the predictions of several models over an unlabeled feature matrix.

    Args:
        models: Iterable of callables (e.g. ``nn.Module``) that return one
            value per row, with a trailing dimension of size 1.
        X_test_final: Feature matrix exposing ``.toarray()``.
        device: Torch device each batch is moved to before the forward pass.
        batch_size: Number of rows per forward pass.

    Returns:
        1-D NumPy array holding one ensemble-mean prediction per input row.
    """
    features = th.tensor(X_test_final.toarray()).float()
    loader = DataLoader(TensorDataset(features), batch_size=batch_size, shuffle=False)

    averaged = []
    for (batch,) in loader:
        batch = batch.to(device)

        member_outputs = []
        for member in models:
            member.eval()
            with th.no_grad():
                member_outputs.append(member(batch).squeeze(dim=1).cpu().numpy())

        # Mean across ensemble members, kept per row.
        averaged.extend(np.mean(member_outputs, axis=0))

    return np.array(averaged)

def ensemble_predict(models, dataloader, device):
    """Average the predictions of several models over a labeled data loader.

    Args:
        models: Iterable of callables (e.g. ``nn.Module``) that return one
            value per row, with a trailing dimension of size 1.
        dataloader: Iterable yielding ``(features, targets)`` tensor pairs.
        device: Torch device each batch is moved to before the forward pass.

    Returns:
        Tuple ``(predictions, targets)`` of NumPy arrays in loader order.
    """
    averaged = []
    observed = []

    for features, labels in dataloader:
        features = features.to(device)
        labels = labels.to(device)

        member_outputs = []
        for member in models:
            member.eval()
            with th.no_grad():
                member_outputs.append(member(features).squeeze(dim=1).cpu().numpy())

        # Mean across ensemble members, kept per row.
        averaged.extend(np.mean(member_outputs, axis=0))
        observed.extend(labels.cpu().numpy())

    return np.array(averaged), np.array(observed)

In [None]:
def predict_test(models, X_test_transformed, device, batch_size=8):
    """Average the predictions of an ensemble over the transformed test matrix.

    Args:
        models: Non-empty iterable of trained models; each returns one value
            per input row (shape ``(batch, 1)`` or ``(batch,)``).
        X_test_transformed: Feature matrix exposing ``.toarray()``.
        device: Torch device each batch is moved to before the forward pass.
        batch_size: Number of rows per forward pass (defaults to 8).

    Returns:
        1-D NumPy array with one ensemble-mean prediction per input row.

    Raises:
        ValueError: If ``models`` is empty.  Averaging over no models would
            otherwise emit a single NaN per batch instead of one value per row.
    """
    models = list(models)
    if not models:
        raise ValueError('predict_test needs at least one model to average over.')

    # Switching to eval mode once is enough; it does not depend on the batch.
    for model in models:
        model.eval()

    features = th.tensor(X_test_transformed.toarray()).float()
    dataloader_test = DataLoader(TensorDataset(features), batch_size=batch_size, shuffle=False)

    test_preds = []
    with th.no_grad():
        for (X_batch,) in dataloader_test:
            X_batch = X_batch.to(device)
            # reshape(-1) keeps the output 1-D even for a batch of one row, so
            # no scalar special case is needed when collecting the results.
            preds = [model(X_batch).reshape(-1).cpu().numpy() for model in models]
            test_preds.extend(np.mean(preds, axis=0))

    return np.array(test_preds)

# Ensemble-mean predictions for the test rows.
test_preds = predict_test(trained_models, X_test_final_transformed, device)

submission_df = pd.DataFrame({
    'ID': data_test['ID'],
    'price_per_month': test_preds
})

# The file name lives in one variable so the confirmation message always names
# the file that was actually written.
submission_path = 'submission_bagging112.csv'
submission_df.to_csv(submission_path, index=False)
print(f'Предсказания сохранены в "{submission_path}".')