In [1]:
import os
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim

from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
def set_seed(seed: int = 42) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and configures cuDNN to prefer
    deterministic kernels.

    Args:
        seed: Value used to seed all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the currently selected one; this call
    # is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN auto-tuning for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # influences processes launched after this point, not the running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")


set_seed(42)

Random seed set as 42


# Загрузка датасета

In [3]:
# Load the air-quality table from a comma-separated file in the working directory.
df = pd.read_csv("air_quality.csv", sep=",")

# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,air_quality
0,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,1
1,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,1
2,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,1
3,27.1,39.1,6.1,6.3,13.5,5.3,1.15,11.1,551,1
4,26.5,70.7,6.9,16.0,21.9,5.6,1.01,12.7,303,1


In [4]:
# Summary statistics for every numeric column.
# NOTE(review): the output shows negative minima for PM10 and SO2 and a humidity
# maximum above 100 — presumably noise or synthetic data; confirm before modelling.
df.describe()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,air_quality
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,30.02902,70.05612,20.14214,30.21836,26.4121,10.01482,1.500354,8.4254,497.4238,0.7
std,6.720661,15.863577,24.554546,27.349199,8.895356,6.750303,0.546027,3.610944,152.754084,0.458303
min,13.4,36.0,0.0,-0.2,7.4,-6.2,0.65,2.5,188.0,0.0
25%,25.1,58.3,4.6,12.3,20.1,5.1,1.03,5.4,381.0,0.0
50%,29.0,69.8,12.0,21.7,25.3,8.0,1.41,7.9,494.0,1.0
75%,34.0,80.3,26.1,38.1,31.9,13.725,1.84,11.1,600.0,1.0
max,58.6,128.1,295.0,315.8,64.9,44.9,3.72,25.8,957.0,1.0


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(5000, 10)

# Подготовка данных

Разделение датасета на обучающую, валидационную и тестовую выборки со стратификацией

In [6]:
# Fraction of the data assigned to each split
train_size = 0.6
val_size = 0.2
test_size = 0.2

# Target column on one side, every other column as features on the other
y = df["air_quality"]
X = df.drop(columns=["air_quality"])

# Step 1: carve out the training set; the remainder is shared by validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, train_size=train_size, random_state=12, stratify=y
)

# Step 2: split the remainder; the test fraction is rescaled relative to the remainder
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=test_size / (test_size + val_size),
    random_state=12,
    stratify=y_temp,
)

# Report the resulting split sizes, one line per split
print(
    f"Всего ({1.0*100}%): {df.shape[0]}",
    f"Тренировачный ({train_size*100}%): {X_train.shape[0]}",
    f"Валидационный ({val_size*100}%): {X_val.shape[0]}",
    f"Тестовый ({test_size*100}%): {X_test.shape[0]}",
    sep="\n",
)

Всего (100.0%): 5000
Тренировачный (60.0%): 3000
Валидационный (20.0%): 1000
Тестовый (20.0%): 1000


Создание объектов для работы с данными

In [7]:
class CustomDataset(Dataset):
    """In-memory tabular dataset yielding ``(features, target)`` float32 tensors.

    Args:
        X: Feature table with one row per sample. Anything ``np.asarray`` can
            turn into a numeric array is accepted (pandas DataFrame, NumPy
            array, nested sequence of numbers).
        y: One target value per sample (pandas Series, NumPy array or sequence).

    Raises:
        ValueError: If ``X`` and ``y`` hold a different number of samples.
    """

    def __init__(self, X, y):
        # torch.tensor copies its input, so the dataset owns its data and the
        # explicit dtype replaces the legacy torch.Tensor(...) constructor.
        self.X = torch.tensor(np.asarray(X), dtype=torch.float32)
        # Targets are kept as an (N, 1) column so each item is a length-1 tensor.
        self.y = torch.tensor(np.asarray(y), dtype=torch.float32).reshape(-1, 1)
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {self.X.shape[0]} and {self.y.shape[0]}"
            )

    def __len__(self):
        """Return the number of samples."""
        return self.X.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.X[index], self.y[index]

In [8]:
# Wrap each (features, target) split in a CustomDataset, in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(features, targets)
    for features, targets in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [9]:
# Only the training loader shuffles; validation and test keep a fixed sample order.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader, test_dataloader = (
    DataLoader(split, batch_size=32, shuffle=False)
    for split in (val_dataset, test_dataset)
)

# Нейросетевая модель

In [10]:
class CustomModel(nn.Module):
    """Fully connected network with ReLU hidden activations and a sigmoid output.

    Args:
        inputs_size: Number of input features.
        hidden_size: Width of every hidden layer.
        out_size: Number of output units.
        hidden_count: Number of hidden-to-hidden layers placed between the
            input layer and the output layer.
    """

    def __init__(self, inputs_size, hidden_size, out_size, hidden_count):
        super().__init__()
        self.linear_in = nn.Linear(inputs_size, hidden_size)
        self.linear_hiddens = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(hidden_count)]
        )
        self.linear_out = nn.Linear(hidden_size, out_size)

    def forward(self, x):
        """Return sigmoid-activated outputs for a batch of feature rows."""
        activations = x
        # The input layer and every hidden layer are each followed by a ReLU.
        for layer in (self.linear_in, *self.linear_hiddens):
            activations = torch.relu(layer(activations))
        return torch.sigmoid(self.linear_out(activations))

In [11]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [12]:
# Size the input layer from the training features instead of hard-coding the
# column count, so the model stays in sync if feature columns are added or removed.
model = CustomModel(
    inputs_size=X_train.shape[1], hidden_size=100, out_size=1, hidden_count=4
)

# Код обучения и оценки модели

In [13]:
def _run_epoch(model, dataloader, criterion, optimizer=None, progress_desc=None):
    """Run one pass over ``dataloader`` and return the mean per-sample loss.

    Args:
        model: Network to run.
        dataloader: Iterable of ``(X_batch, y_batch)`` tensor pairs exposing a
            ``dataset`` attribute whose length is the number of samples.
        criterion: Loss callable applied as ``criterion(y_pred, y_batch)``.
        optimizer: When given, the model is put in training mode and updated
            after every batch; when ``None``, the model is put in eval mode
            and gradients are disabled.
        progress_desc: When given, the pass is wrapped in a tqdm progress bar
            with this description.

    Returns:
        Sum of batch losses weighted by batch size, divided by the dataset length.
    """
    is_training = optimizer is not None
    model.train(is_training)
    batches = dataloader if progress_desc is None else tqdm(dataloader, desc=progress_desc)
    total_loss = 0.0
    with torch.set_grad_enabled(is_training):
        for X_batch, y_batch in batches:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            if is_training:
                optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            if is_training:
                loss.backward()
                optimizer.step()
            # Weight by batch size so a smaller final batch does not skew the mean.
            total_loss += loss.item() * X_batch.size(0)
    return total_loss / len(dataloader.dataset)


def train_loop(model, train_dataloader, val_dataloader, epochs=20, lr=0.001, log_every=100):
    """Train ``model`` with SGD on binary cross-entropy and report progress.

    The model is moved to the notebook-level ``device``. Every epoch performs
    one training pass followed by one validation pass. Progress bars and the
    loss summary are shown for the first epoch and for every ``log_every``-th
    epoch.

    Args:
        model: Network whose outputs are probabilities suitable for ``nn.BCELoss``.
        train_dataloader: Batches used to update the model.
        val_dataloader: Batches used only to measure the loss.
        epochs: Number of passes over the training data.
        lr: Learning rate of the SGD optimizer.
        log_every: Reporting interval in epochs; must be at least 1.

    Raises:
        ValueError: If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        epoch_number = epoch + 1
        should_log = epoch == 0 or epoch_number % log_every == 0
        # Progress bars are shown only on logged epochs to keep the output short.
        train_desc = f"Epoch {epoch_number}/{epochs} [Training]" if should_log else None
        val_desc = f"Epoch {epoch_number}/{epochs} [Validation]" if should_log else None

        train_loss = _run_epoch(model, train_dataloader, criterion, optimizer, train_desc)
        val_loss = _run_epoch(model, val_dataloader, criterion, progress_desc=val_desc)

        if should_log:
            print(
                f"Epoch [{epoch_number}/{epochs}] - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}"
            )
    print("Обучение завершено!")

In [14]:
def evaluate_model(model, test_dataloader, threshold=0.5):
    """Collect thresholded predictions and true labels over a dataloader.

    Args:
        model: Network returning one score per sample; it is switched to eval mode.
        test_dataloader: Iterable of ``(inputs, labels)`` tensor batches.
        threshold: Scores strictly greater than this value are predicted as 1.0,
            all others as 0.0.

    Returns:
        Tuple ``(predictions, labels)`` of NumPy arrays with the batches
        concatenated along the first axis. Both arrays are empty when the
        dataloader yields no batches.
    """
    model.eval()
    # Send inputs to wherever the model's weights live, so evaluation also
    # works on a model that was never moved to another device.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    batch_predictions = []
    batch_labels = []
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            outputs = model(inputs.to(model_device))
            batch_predictions.append((outputs > threshold).float().cpu().numpy())
            batch_labels.append(labels.cpu().numpy())

    if not batch_predictions:
        return np.array([]), np.array([])
    return np.concatenate(batch_predictions), np.concatenate(batch_labels)

# Обучение модели и проверка качества

In [15]:
%%time
train_loop(model, train_dataloader, val_dataloader, 1000)

Epoch 1/1000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1/1000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1/1000] - Train Loss: 0.6604 - Val Loss: 0.6446


Epoch 100/1000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 100/1000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [100/1000] - Train Loss: 0.3121 - Val Loss: 0.2853


Epoch 200/1000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 200/1000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [200/1000] - Train Loss: 0.2745 - Val Loss: 0.2598


Epoch 300/1000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 300/1000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [300/1000] - Train Loss: 0.2424 - Val Loss: 0.2404


Epoch 400/1000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 400/1000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [400/1000] - Train Loss: 0.2284 - Val Loss: 0.2719


Epoch 500/1000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 500/1000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [500/1000] - Train Loss: 0.2114 - Val Loss: 0.2384


Epoch 600/1000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 600/1000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [600/1000] - Train Loss: 0.2148 - Val Loss: 0.2340


Epoch 700/1000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 700/1000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [700/1000] - Train Loss: 0.1916 - Val Loss: 0.2015


Epoch 800/1000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 800/1000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [800/1000] - Train Loss: 0.2015 - Val Loss: 0.1927


Epoch 900/1000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 900/1000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [900/1000] - Train Loss: 0.1519 - Val Loss: 0.2807


Epoch 1000/1000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1000/1000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1000/1000] - Train Loss: 0.1497 - Val Loss: 0.1856
Обучение завершено!
CPU times: user 33min 9s, sys: 727 ms, total: 33min 10s
Wall time: 2min 22s


In [19]:
# Score the trained model on the held-out test split and print per-class metrics.
# NOTE(review): this cell's execution count jumps from 15 to 19 — re-run the
# notebook on a fresh kernel to confirm the reported numbers.
y_pred, y_true = evaluate_model(model, test_dataloader)

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.88      0.91      0.90       300
         1.0       0.96      0.95      0.95       700

    accuracy                           0.94      1000
   macro avg       0.92      0.93      0.93      1000
weighted avg       0.94      0.94      0.94      1000

