In [1]:
import re

# Основные библиотеки
import pandas as pd
import numpy as np

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Модуль для визуализации прогресса
from tqdm.notebook import tqdm

# TensorBoard
from torch.utils.tensorboard import SummaryWriter

# Машинное обучение
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from NN_project.source.modelclassifier import ModelClassifier

import logging
import warnings

# Keep the notebook output readable: ignore Python warnings and let the
# `transformers` logger emit errors only.
logging.getLogger("transformers").setLevel(logging.ERROR)
warnings.filterwarnings('ignore')

# Device on which the models of this notebook are placed.
device = torch.device("cpu")

# Options used to build the ModelClassifier and to locate its weights file.
TextModelClassifierOption = dict(
    model='sentence-transformers/LaBSE',
    n_classes=2,
    max_len=512,
    path='LaBSE_CrossEntropyLoss_two.pt',
)

print(device)

cpu


In [2]:
# Build the annotation classifier from TextModelClassifierOption and restore
# its saved weights, mapping every tensor onto `device`.
# NOTE(review): torch.load unpickles the checkpoint - only load files that come
# from a trusted source.
tms = ModelClassifier(
    device=device,
    max_len=TextModelClassifierOption['max_len'],
    n_classes=TextModelClassifierOption['n_classes'],
    model_name=TextModelClassifierOption['model'],
)
tms.load_state_dict(
    torch.load(TextModelClassifierOption['path'], map_location=device)
)

<All keys matched successfully>

In [52]:
class Cleaner:
    """Turn raw book records into the numeric features used downstream.

    Calling the instance engineers the columns listed in ``_select_featers``;
    :meth:`transform` then standardises the continuous columns and expands
    pairwise interaction terms.

    Parameters
    ----------
    reference_path : str, optional
        CSV file with the cleaned reference dataset on which the scaler and
        the interaction expansion are fitted. It must contain a ``myPrice``
        column, which is dropped before fitting.
    """

    def __init__(self, reference_path="cleaned_labirint_dataset.csv"):
        # Raw columns that are not model inputs. Kept for backward
        # compatibility only: __call__ selects its output columns by name, so
        # the input no longer has to contain every one of these.
        self._drop_list = [
            'Unnamed: 0', 'id', 'imgUrl', 'allPrice', 'sale', 'annotation', 'isbn',
            'bookName', 'datePublisher', 'da', 'db', 'dc', 'weight', 'age', 'bookGenres',
            'decoration', 'typeObject', 'illustrations', 'groupOfType', 'underGroup',
            'genres', 'authors', 'publisher', 'series', 'sound_module', 'myPrice'
        ]

        # Feature name -> regex searched in the lower-cased 'decoration' text.
        self._patterns = {
            'embossing_gold': r'тиснение золотом',
            'embossing_silver': r'тиснение серебром',
            'embossing_colored': r'тиснение цветное',
            'embossing_volume': r'тиснение объемное',
            'partial_lacquer': r'частичная лакировка',
            'puffy_cover': r'пухлая обложка',
            'bookmark_ribbon': r'ляссе',
            'super_cover': r'супер',
            'edge_trim_gold': r'обрез золотой',
            'edge_trim_silver': r'обрез серебряный',
            'edge_trim_colored': r'обрез цветной',
            'slipcase_close': r'футляр закрытый',
            'slipcase_open': r'футляр открытый',
            'stickers': r'с наклейками',
            'puzzles': r'с пазлами',
            'movable_elements': r'с подвижными элементами',
            'volume_panorama': r'с объемной панорамой',
            'sound_module': r'со звуковым модулем',
            'toy': r'с игрушкой',
            'magnet': r'с магнитами',
            'glitter': r'глиттер',
            'flocking': r'флокинг',
            'soft_touch': r'покрытие софттач',
            'cutouts': r'вырубка',
            'textile_inserts': r'текстильные и пластиковые вставки'
        }

        # Feature name -> regex searched in the lower-cased 'illustrations' text.
        self._patterns_ill = {
            'black_white': r'черно-белые',
            'color': r'цветные'
        }

        # Paper type -> integer code.
        self._quality = {
            'Газетная': 0, 'Офсет': 1, 'Крафт': 2, 'Типографская': 3, 'Мелованная': 4, 'Картон': 5,
            'Ламинированные': 6, 'Рисовая': 7, 'Дизайнерская бумага': 8, 'Синтетическая': 9, 'ПВХ': 10,
            'Рафлаглосс': 11, 'Ткань': 12
        }

        # First token of the cover description -> integer code.
        self._cover = {
            'обл': 0, 'Лист': 1, 'Пакет': 2, 'Blister': 3, 'Jewel-box': 4,
            'Amarey': 5, 'Blu-Ray': 6, 'карт': 7, 'Обл.': 8, '7Б': 9,
            '7А': 10, '7Бц': 11, 'Инт': 12, 'Box': 13
        }

        # Columns (and their order) returned by __call__.
        self._select_featers = [
            'pages', 'volume', 'covers', 'pageType', 'rateSize', 'foreign_language',
            'rate', 'black_white', 'color', 'slipcase_open', 'partial_lacquer', 'bookmark_ribbon',
            'super_cover', 'embossing_volume', 'embossing_colored', 'embossing_gold', 'edge_trim_colored',
            'edge_trim_gold'
        ]

        # Columns standardised by transform().
        self._continuous_cols = ["pages", "rateSize", "volume"]

        # The scaler and the interaction expansion are fitted lazily on the
        # first transform() call and then reused.
        self._reference_path = reference_path
        self._scaler = None
        self._poly = None

    def _add_binary_features(self, df_init, patterns):
        """Add one 0/1 column per pattern found in ``df_init['decoration']``.

        ``df_init`` is modified in place and returned. Missing values count as
        "no match".
        """
        decoration = df_init['decoration']
        for feature, pattern in patterns.items():
            df_init[feature] = decoration.str.contains(pattern, regex=True, na=False).astype(int)
        return df_init

    def _add_binary_features_ill(self, df_init, patterns_ill):
        """Add one 0/1 column per pattern found in ``df_init['illustrations']``.

        Values are converted to lower-cased strings before matching.
        ``df_init`` is modified in place and returned.
        """
        lowered = df_init['illustrations'].astype(str).str.lower()
        for feature, pattern in patterns_ill.items():
            df_init[feature] = lowered.str.contains(pattern, regex=True, na=False).astype(int)
        return df_init

    def __call__(self, table):
        """Return a new frame holding the engineered ``_select_featers`` columns.

        ``table`` must provide the columns ``rate``, ``pages``, ``da``, ``db``,
        ``dc``, ``decoration``, ``typeObject``, ``illustrations``, ``pageType``,
        ``covers`` and ``rateSize``. Any other column is ignored and the
        caller's frame is left untouched.
        """
        # Work on a copy so the caller's DataFrame is never mutated.
        table = table.copy()

        table['rate'] = table['rate'].fillna(0.0).round()
        table['pages'] = table['pages'].fillna(0.0)

        # NOTE(review): the medians come from the frame being cleaned, so a
        # dimension that is missing in every row stays NaN.
        for side in ('da', 'db', 'dc'):
            table[side] = table[side].fillna(table[side].median())
        table['volume'] = (table['da'] * table['db'] * table['dc'] * 10**(-3)).round(1)

        table['decoration'] = table['decoration'].fillna('Без декораций').str.lower()
        table = self._add_binary_features(table, self._patterns)

        table['typeObject'] = table['typeObject'].fillna('Книги')
        table['foreign_language'] = (table['typeObject'] == 'Книги на иностранном языке').astype(int)

        table['illustrations'] = table['illustrations'].fillna('черно-белые')
        table = self._add_binary_features_ill(table, self._patterns_ill)

        # Unknown paper types fall back to the same code as missing ones, the
        # way unknown covers already do; otherwise a NaN would reach transform().
        default_quality = self._quality['Газетная']
        table['pageType'] = (
            table['pageType'].fillna('Газетная').map(self._quality).fillna(default_quality)
        )

        # Only the first space-separated token of the cover description is coded.
        cover_token = table['covers'].fillna('обл - мягкий переплет').str.strip().str.split(' ').str[0]
        table['covers'] = cover_token.map(self._cover).fillna(0)

        # Selecting by name makes a separate drop of the raw columns unnecessary
        # and does not fail when some of them are absent from the input.
        return table[self._select_featers]

    def _fit_reference(self):
        """Fit the scaler and the interaction expansion once on the reference CSV."""
        if self._scaler is not None and self._poly is not None:
            return
        df_clean = pd.read_csv(self._reference_path).drop(columns=["myPrice"])

        scaler = StandardScaler()
        df_clean[self._continuous_cols] = scaler.fit_transform(df_clean[self._continuous_cols])

        poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
        poly.fit(df_clean)

        self._scaler, self._poly = scaler, poly

    def transform(self, dataset):
        """Scale the continuous columns and expand pairwise interactions.

        The scaler and the expansion are fitted on the reference CSV the first
        time this method runs and cached on the instance afterwards, so later
        calls do not re-read the file. ``dataset`` itself is not modified.

        Returns the output of ``PolynomialFeatures.transform`` for the scaled
        copy of ``dataset``.
        """
        self._fit_reference()
        dataset = dataset.copy()
        dataset[self._continuous_cols] = self._scaler.transform(dataset[self._continuous_cols])
        return self._poly.transform(dataset)

In [54]:
# Disabled smoke test for Cleaner; it relies on the `test` dict that is defined in a later cell.
#cleaner = Cleaner()
#b = cleaner(pd.DataFrame(test))
#cleaner.transform(b)

In [45]:
class AdvancedPriceRegressor(nn.Module):
    """Fully connected regressor with a linear shortcut from input to output.

    The main path is ``input_dim -> 512 -> 256 -> 128 -> 1``; the first two
    hidden stages apply batch normalisation, ReLU and dropout (p=0.2), the
    third applies ReLU only. A separate ``nn.Linear(input_dim, 1)`` is applied
    to the raw input and added to the output of the main path.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Stage 1: input_dim -> 512
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)

        # Stage 2: 512 -> 256
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)

        # Stage 3: 256 -> 128 (no normalisation, no dropout)
        self.fc3 = nn.Linear(256, 128)
        self.relu3 = nn.ReLU()

        # Output head of the main path.
        self.fc_out = nn.Linear(128, 1)

        # Linear shortcut applied directly to the input.
        self.residual = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the main-path output plus the linear shortcut of ``x``."""
        shortcut = self.residual(x)

        hidden = self.dropout1(self.relu1(self.bn1(self.fc1(x))))
        hidden = self.dropout2(self.relu2(self.bn2(self.fc2(hidden))))
        hidden = self.relu3(self.fc3(hidden))

        return self.fc_out(hidden) + shortcut

# Restore the trained price regressor and switch it to inference mode.
# NOTE(review): 171 presumably equals the 18 Cleaner features plus their 153
# pairwise interaction terms produced by Cleaner.transform - confirm.
model = AdvancedPriceRegressor(input_dim=171)
model.to(device)
model.load_state_dict(
    torch.load("best_model_0_758.pt", map_location=device)
)
model.eval();

In [91]:
class MicroService:
    """Predict with the regressor from raw book records.

    Pipeline: each annotation is normalised to a fixed length and scored by
    the annotation classifier, the score is written to the ``rate`` and
    ``rateSize`` columns, the frame goes through the cleaner
    (``cleaner(frame)`` then ``cleaner.transform(...)``) and the resulting
    matrix is fed to the regressor.

    Args:
        cleaner: callable with a ``transform`` method (see ``Cleaner``).
        annotationclassifier: object whose ``predict(text)`` result is used as
            a truth value.
        bookregressor: callable mapping a float32 tensor to predictions.
            NOTE(review): it is not switched to eval mode here - the caller is
            expected to have done so.
        device: torch device on which the input tensor is created.
    """

    # Character length every annotation is padded / cut to before classification.
    _ANNOTATION_LEN = 512

    def __init__(self, cleaner, annotationclassifier, bookregressor, device):
        self._cleaner = cleaner
        self._annotationclassifier = annotationclassifier
        self._bookregressor = bookregressor
        self._device = device

    def _str_transform(self, string, size):
        """Strip newlines/tabs and return the text as exactly ``size`` characters.

        Shorter texts are right-padded with spaces, longer ones are cut.
        ``None`` and NaN are treated as an empty text.
        """
        if string is None or (isinstance(string, float) and string != string):
            string = ''
        string = str(string)
        for trash in ('\n', '\t'):
            string = string.replace(trash, '')
        # Pad first, then cut to `size` (previously the cut was hard-coded to 512).
        return string.ljust(size)[:size]

    def __call__(self, dataframe: pd.DataFrame):
        """Return the regressor output for every row of ``dataframe``.

        ``dataframe`` must contain an ``annotation`` column plus whatever the
        cleaner requires. It is not modified. The result is the tensor
        returned by the regressor, computed with autograd disabled.
        """
        annotation = [
            self._str_transform(text, self._ANNOTATION_LEN)
            for text in dataframe['annotation']
        ]
        # Only the classifier pass is slow enough to deserve a progress bar.
        rate = [
            10 if self._annotationclassifier.predict(text) else 0
            for text in tqdm(annotation)
        ]
        rateSize = [9 if r == 10 else 0 for r in rate]

        # assign() returns a new frame, so the caller's frame stays untouched.
        dataframe = dataframe.assign(rate=rate, rateSize=rateSize)

        clean_dataframe = self._cleaner(dataframe)
        X_poly = self._cleaner.transform(clean_dataframe)
        X = torch.tensor(X_poly, dtype=torch.float32, device=self._device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            pred = self._bookregressor(X)
        return pred

In [92]:
# Wire the full pipeline: cleaner + annotation classifier + price regressor.
ms = MicroService(Cleaner(), tms, model, device)

# One hand-written book record used as an end-to-end example.
test = {
    # Fields that describe the example book.
    'pages' : [160],
    'pageType' : ['Офсет'],
    'weight' : [165],
    'da' : [215],
    'db' : [144],
    'dc' : [8],
    'covers' : ['обл - мягкий переплет (крепление скрепкой или клеем)'],
    'decoration' : ['Частичная лакировка'],
    'illustrations' : ['Без иллюстраций'],
    'annotation' : [r'''
    Эта книга - настоящее лекарство от нищеты. 
    Если вы всерьез решили стать богатым, она поможет вам заглянуть в суть финансовых
    проблем и добиться реального успеха. Обсуждаемые автором принципы универсальны и неизменны.
    Они докажут вам свою действенность так же, как доказали ее многим другим людям,
    став ключом к стабильному финансовому прогрессу и процветанию.'''],
    # Remaining columns of the raw schema, filled with placeholder values.
    **{column: [0] for column in ('allPrice', 'sale', 'myPrice', 'Unnamed: 0', 'id')},
    **{column: [''] for column in (
        'typeObject', 'groupOfType', 'underGroup', 'genres', 'bookName', 'imgUrl',
        'age', 'authors', 'publisher', 'series', 'bookGenres', 'isbn',
    )},
    **{column: [0] for column in ('rate', 'rateSize', 'datePublisher')},
}

y = ms(pd.DataFrame(test))

# NOTE(review): the prediction is exponentiated, presumably because the
# regressor was trained on log-price - confirm.
np.exp(float(y[0][0]))

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

241.51044613051232