In [None]:
import torch
import torch.nn as nn
import torch.nn.init as init
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from torch.utils.data import DataLoader, TensorDataset


In [None]:
# Location of the raw table; kept in one named constant so it is easy to repoint.
DATA_PATH = '/content/data.csv'

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
# `data` stays defined as an alias for any cell that still refers to it.
data = pd.read_csv(DATA_PATH)
df = data

In [None]:
class DataPreprocessor:
    """Label-encode categorical columns and standardize numeric feature columns.

    Parameters
    ----------
    cat_cols : list of str
        Columns that are label-encoded (one ``LabelEncoder`` per column).
    target_col : str, default 'target'
        Name of the label column. It is left untouched by the feature scaler.

    Attributes set by ``fit_transform``
    -----------------------------------
    label_encoders : dict mapping column name -> fitted ``LabelEncoder``
    scaler : ``StandardScaler`` fitted on the numeric feature columns
    num_cols : list of the numeric feature column names that were scaled
    """

    # Token used for missing values and for categories unseen during fitting.
    MISSING_TOKEN = 'MISSING'

    def __init__(self, cat_cols, target_col='target'):
        self.cat_cols = cat_cols
        self.target_col = target_col
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.num_cols = []

    def _clean_categorical(self, series):
        """Return the column as strings; NaN and the literal '0' become MISSING_TOKEN."""
        as_text = series.astype(object).where(series.notna(), self.MISSING_TOKEN).astype(str)
        # The literal '0' is treated as "missing" as well, which keeps the encoding
        # identical to the fillna(0) -> '0' -> 'MISSING' convention.
        return as_text.replace('0', self.MISSING_TOKEN)

    def fit_transform(self, df):
        """Fit the encoders and the scaler on ``df`` and return the transformed copy.

        The input frame is not modified. Categorical columns are replaced by integer
        codes, numeric feature columns are standardized, remaining NaNs become 0.
        """
        df = df.copy()

        # Encode categoricals first so that missing values in them never pass
        # through the numeric fillna below.
        for col in self.cat_cols:
            values = self._clean_categorical(df[col])
            le = LabelEncoder()
            # Always register MISSING_TOKEN so that missing or unseen categories can be
            # encoded later even if the fitting data had no missing value in this column.
            le.fit(list(values.unique()) + [self.MISSING_TOKEN])
            df[col] = le.transform(values)
            self.label_encoders[col] = le

        df = df.fillna(0)

        excluded = set(self.cat_cols) | {self.target_col}
        self.num_cols = [col for col in df.columns if col not in excluded]
        if len(self.num_cols) > 0:
            df[self.num_cols] = self.scaler.fit_transform(df[self.num_cols])

        return df

    def transform(self, df):
        """Apply the already fitted encoders and scaler to ``df`` and return a copy.

        Categories that were not seen by ``fit_transform`` are mapped to MISSING_TOKEN.
        """
        df = df.copy()

        for col in self.cat_cols:
            le = self.label_encoders[col]
            values = self._clean_categorical(df[col])
            values = values.where(values.isin(le.classes_), self.MISSING_TOKEN)
            df[col] = le.transform(values)

        df = df.fillna(0)

        if len(self.num_cols) > 0:
            df[self.num_cols] = self.scaler.transform(df[self.num_cols])

        return df


In [None]:

class ImprovedModel(nn.Module):
    """Feed-forward regressor over numeric features plus embedded categorical codes.

    Parameters
    ----------
    num_features : int
        Width of the numeric input ``x_num``.
    cat_dims : sequence of int
        Number of distinct codes per categorical column; one embedding table is
        created per entry. May be empty.
    embedding_dim : int, default 4
        Size of every embedding vector.
    """

    # Negative slope shared by the LeakyReLU layers and the Kaiming initialization,
    # so the init gain matches the activation that is actually used.
    NEGATIVE_SLOPE = 0.1

    def __init__(self, num_features, cat_dims, embedding_dim=4):
        super().__init__()
        self.embeddings = nn.ModuleList(
            [nn.Embedding(dim, embedding_dim) for dim in cat_dims]
        )
        for emb in self.embeddings:
            init.xavier_normal_(emb.weight)

        input_size = num_features + len(cat_dims) * embedding_dim
        slope = self.NEGATIVE_SLOPE

        # Layer order (and therefore the state_dict keys fc.0 ... fc.11) is unchanged.
        self.fc = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            nn.LeakyReLU(slope),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.Dropout(0.2),
            nn.LeakyReLU(slope),

            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(slope),

            nn.Linear(32, 1)
        )

        self._init_weights()

    def _init_weights(self):
        """Kaiming-initialize every Linear weight and zero its bias."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                init.kaiming_normal_(
                    m.weight,
                    a=self.NEGATIVE_SLOPE,
                    mode='fan_in',
                    nonlinearity='leaky_relu',
                )
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x_num, x_cat):
        """Return one prediction per row as a 1-D tensor of length ``batch``.

        ``x_num`` is the float feature matrix; ``x_cat[:, i]`` holds the integer codes
        fed to the i-th embedding table.
        """
        if len(self.embeddings) > 0:
            embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
            x = torch.cat([x_num, *embedded], dim=1)
        else:
            # No categorical columns: torch.cat on an empty list would raise.
            x = x_num

        # Squeeze only the feature axis. A bare squeeze() would also drop the batch
        # axis for a single-row batch and return a 0-d tensor.
        return self.fc(x).squeeze(-1)


In [None]:
TARGET = 'oil_property_value'
BATCH_SIZE = 32
EPOCHS = 200

# Categorical feature columns; the label column is never treated as a feature.
CAT_COLS = [
    col
    for col in df.select_dtypes(include=['object', 'category']).columns
    if col != TARGET
]

# Keep the label out of the feature preprocessor. Otherwise its StandardScaler also
# standardizes TARGET, y_scaler below can no longer map predictions back to the
# original units, and the feature scaler expects a TARGET column at inference time.
preprocessor = DataPreprocessor(CAT_COLS)
processed_df = preprocessor.fit_transform(df.drop(columns=[TARGET]))

X_num = processed_df.drop(CAT_COLS, axis=1).values.astype(np.float32)
X_cat = processed_df[CAT_COLS].values.astype(np.int64)
# Missing labels are filled with 0, matching the fill used for the features.
# NOTE(review): dropping rows without a label may be preferable - confirm with the data owner.
y = df[TARGET].fillna(0).values.astype(np.float32)

y_scaler = StandardScaler()
y = y_scaler.fit_transform(y.reshape(-1, 1)).flatten()
# inference() reads the label scaler from the preprocessor, so attach it there too.
preprocessor.y_scaler = y_scaler

# NOTE(review): both scalers are fitted on the full dataset before the split, so
# validation statistics leak into training; fit on the training rows only if an
# unbiased validation score is required.
X_num_train, X_num_val, X_cat_train, X_cat_val, y_train, y_val = train_test_split(
    X_num, X_cat, y, test_size=0.2, random_state=42)


In [None]:
def _as_dataset(*arrays):
    """Copy each NumPy array into a tensor and bundle them in one TensorDataset."""
    return TensorDataset(*(torch.tensor(arr) for arr in arrays))


# Each item is a (numeric features, categorical codes, scaled target) triple.
train_dataset = _as_dataset(X_num_train, X_cat_train, y_train)
val_dataset = _as_dataset(X_num_val, X_cat_val, y_val)

In [None]:

# Seed PyTorch so weight initialization and batch shuffling are repeatable
# (the train/validation split already uses random_state=42).
torch.manual_seed(42)

# BatchNorm1d cannot compute batch statistics from a single sample in training mode
# and raises. Drop the trailing batch only when it would contain exactly one row.
drop_singleton_batch = (
    len(train_dataset) > BATCH_SIZE and len(train_dataset) % BATCH_SIZE == 1
)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=drop_singleton_batch,
)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# One embedding table per categorical column, sized by the number of encoder classes.
cat_dims = [len(preprocessor.label_encoders[col].classes_) for col in CAT_COLS]
model = ImprovedModel(num_features=X_num.shape[1], cat_dims=cat_dims)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=1e-4)
# Halve the learning rate after 5 epochs without improvement of the monitored loss.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)


In [None]:

criterion = nn.SmoothL1Loss()

best_val_loss = float('inf')
best_state = None  # snapshot of the weights at the best validation loss
patience_counter = 0
patience = 10  # epochs without improvement before training stops

for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for x_num, x_cat, y_batch in train_loader:
        optimizer.zero_grad()
        # reshape(-1) keeps predictions 1-D even for a single-row batch.
        preds = model(x_num, x_cat).reshape(-1)
        loss = criterion(preds, y_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        train_loss += loss.item()

    # ---- single validation pass (loss and MAE together) ----
    model.eval()
    val_loss = 0.0
    total_abs_error = 0.0
    with torch.no_grad():
        # Reuse the batch names from training; the global y_val array is not touched.
        for x_num, x_cat, y_batch in val_loader:
            preds = model(x_num, x_cat).reshape(-1)
            val_loss += criterion(preds, y_batch).item()
            total_abs_error += torch.abs(preds - y_batch).sum().item()

    # Compute the per-epoch averages.
    train_loss_epoch = train_loss / len(train_loader)
    val_loss_epoch = val_loss / len(val_loader)
    val_mae = total_abs_error / len(val_loader.dataset)

    scheduler.step(val_loss_epoch)

    print(f'Epoch {epoch+1:03d} | '
          f'Train Loss: {train_loss_epoch:.4f} | '
          f'Val Loss: {val_loss_epoch:.4f} | '
          f'Val MAE: {val_mae:.4f}')

    # ---- early stopping ----
    if val_loss_epoch < best_val_loss:
        best_val_loss = val_loss_epoch
        patience_counter = 0
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch+1:03d} '
                  f'(best Val Loss: {best_val_loss:.4f})')
            break

# Leave the model holding the best weights seen on the validation set.
if best_state is not None:
    model.load_state_dict(best_state)



Epoch 001 | Train Loss: 0.0045 | Val Loss: 0.0044 | Val MAE: 0.0646
Epoch 002 | Train Loss: 0.0045 | Val Loss: 0.0043 | Val MAE: 0.0622
Epoch 003 | Train Loss: 0.0045 | Val Loss: 0.0046 | Val MAE: 0.0651
Epoch 004 | Train Loss: 0.0045 | Val Loss: 0.0044 | Val MAE: 0.0647
Epoch 005 | Train Loss: 0.0045 | Val Loss: 0.0041 | Val MAE: 0.0626
Epoch 006 | Train Loss: 0.0045 | Val Loss: 0.0037 | Val MAE: 0.0591
Epoch 007 | Train Loss: 0.0045 | Val Loss: 0.0039 | Val MAE: 0.0588
Epoch 008 | Train Loss: 0.0045 | Val Loss: 0.0043 | Val MAE: 0.0627
Epoch 009 | Train Loss: 0.0045 | Val Loss: 0.0043 | Val MAE: 0.0624
Epoch 010 | Train Loss: 0.0045 | Val Loss: 0.0043 | Val MAE: 0.0639
Epoch 011 | Train Loss: 0.0045 | Val Loss: 0.0044 | Val MAE: 0.0657
Epoch 012 | Train Loss: 0.0045 | Val Loss: 0.0045 | Val MAE: 0.0674
Epoch 013 | Train Loss: 0.0045 | Val Loss: 0.0040 | Val MAE: 0.0632
Epoch 014 | Train Loss: 0.0045 | Val Loss: 0.0044 | Val MAE: 0.0643
Epoch 015 | Train Loss: 0.0045 | Val Loss: 0.004

In [None]:
# Save only the weights: the loading cell restores them with model.load_state_dict(...),
# which expects a state dict rather than a pickled module object.
torch.save(model.state_dict(), 'mdl.pth')

In [None]:
def inference(model, preprocessor, new_data):
    """Predict the target, in its original units, for new rows.

    Parameters
    ----------
    model : the trained network, called as ``model(X_num, X_cat)``.
    preprocessor : object exposing the fitted ``label_encoders`` and ``scaler``;
        its ``y_scaler`` attribute is used when present.
    new_data : anything accepted by ``pd.DataFrame`` (it is not modified).

    Returns
    -------
    1-D NumPy array with one prediction per input row.

    Raises
    ------
    ValueError
        If a categorical column holds a missing or unseen value but its encoder
        has no 'MISSING' class to map it to.

    Notes
    -----
    Uses the notebook-level ``CAT_COLS`` and ``TARGET``, and falls back to the
    notebook-level ``y_scaler`` when the preprocessor carries none. The feature
    scaler must have been fitted on the feature columns only (without TARGET),
    otherwise ``scaler.transform`` rejects the column set.
    """
    # Work on a private copy so the caller's data is never modified.
    new_df = pd.DataFrame(new_data).copy()
    # Exported rows may still carry the label column; it is not a model input.
    new_df = new_df.drop(columns=[TARGET], errors='ignore')
    new_df = new_df.fillna(0)

    for col in CAT_COLS:
        le = preprocessor.label_encoders[col]
        known = set(le.classes_)
        values = new_df[col].astype(str).replace('0', 'MISSING')
        # Categories never seen during fitting fall back to the MISSING class.
        values = values.where(values.isin(known), 'MISSING')
        if 'MISSING' not in known and (values == 'MISSING').any():
            raise ValueError(
                f"Column '{col}' contains missing or unseen categories, but its "
                f"encoder was fitted without a 'MISSING' class."
            )
        new_df[col] = le.transform(values)

    num_cols = new_df.columns[~new_df.columns.isin(CAT_COLS)]
    if len(num_cols) > 0:
        new_df[num_cols] = preprocessor.scaler.transform(new_df[num_cols])

    X_num = torch.tensor(new_df[num_cols].values.astype(np.float32))
    X_cat = torch.tensor(new_df[CAT_COLS].values.astype(np.int64))

    model.eval()
    with torch.no_grad():
        pred = model(X_num, X_cat)

    # Prefer a scaler attached to the preprocessor; otherwise use the one that the
    # training cell created at notebook level.
    target_scaler = getattr(preprocessor, 'y_scaler', None)
    if target_scaler is None:
        target_scaler = y_scaler
    return target_scaler.inverse_transform(pred.numpy().reshape(-1, 1)).flatten()

# Rows to score, as a column -> values mapping (fill in before running).
# Data layout (columns):
#   blend_id, comp1_smiles, comp1_type, comp1_mass, comp1_LogP, comp1_TPSA,
#   comp1_MolWt, comp1_Van_Der_Waals_volume, comp1_Fraction_non_rotatable_bonds,
#   comp1_num_atoms, ..., comp20_LogP, comp20_TPSA, comp20_MolWt,
#   comp20_Van_Der_Waals_volume, comp20_Fraction_non_rotatable_bonds,
#   comp20_num_atoms, comp20_Degree_of_branching, comp20_Labute_asa,
#   comp20_Mol_mr, oil_property_value
new_data = {}

# Load from the same relative path the notebook saves to ('mdl.pth'), so the round
# trip does not depend on the working directory being /content.
model.load_state_dict(torch.load('mdl.pth'))
model.eval()
predictions = inference(model, preprocessor, new_data)
print(f"Predictions: {predictions}")