In [1]:
import os
import sys

In [2]:
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [3]:
import pandas as pd
from src.data.tokenizer import Vin2ParamTokenizer
from src.data.dataset import Vin2ParamDataset
from src.train.Vin2ParamTrainer import Vin2ParamTrainer
from src.model import Vin2ParamGRU

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

from sklearn.model_selection import train_test_split

In [4]:
data_train = pd.read_csv("../data/vins_train.csv")

In [5]:
data_train.head()

Unnamed: 0,VIN,CarBrand,CarModel,YearManufactured,EngineType,EngineVolume,Color
0,NVKZTHZ0X18TM05845,ВАЗ (LADA),2115 Samara,2009,Бензин,1.6,Серый
1,HNVKZXWFJ22J206195,ВАЗ (LADA),2112,2003,Бензин,1.6,Чёрный
2,HYQS2DXM922I900038,ВАЗ (LADA),21099,2003,Бензин,1.4,Серебряный
3,HNVKZF1KZ636X08104,ВАЗ (LADA),2110,1998,Бензин,1.6,Зелёный
4,EBLI88GR11J203825,Opel,Meriva,2008,Бензин,1.6,Чёрный


In [6]:
data_train = data_train[["VIN", "CarBrand", "CarModel", "Color"]]

In [7]:
data_train.drop_duplicates(inplace=True)

In [8]:
sos_token = "!"
eos_token = "@"

tokenizer = Vin2ParamTokenizer(sos_token, eos_token)

In [9]:
tokenizer.fit_vin(data_train)

In [10]:
tokenizer.tokenize_vin(data_train)

In [11]:
tokenizer.fit_label(data_train)

In [12]:
tokenizer.tokenize_label(data_train)

In [13]:
train_data, val_data = train_test_split(data_train, test_size=0.05, shuffle=True, stratify=data_train["CarBrand"])

In [14]:
train_dataset = Vin2ParamDataset(train_data["VIN"].values, train_data[["CarBrand", "CarModel", "Color"]].values)
val_dataste = Vin2ParamDataset(val_data["VIN"].values, val_data[["CarBrand", "CarModel", "Color"]].values)

In [15]:
def vin_collate_fn(batch):
    batch_vins, batch_labels = list(map(lambda x: x[0], batch)), list(map(lambda x: x[1], batch))
    vins = torch.tensor(pad_sequence(batch_vins, batch_first=True, padding_value=0))
    lbs = torch.stack(batch_labels, dim=0)
    return vins, lbs

In [16]:
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True, collate_fn=vin_collate_fn)
val_dataloader = DataLoader(val_dataste, batch_size=512, collate_fn=vin_collate_fn)

In [17]:
carbrand_n = data_train['CarBrand'].nunique()
carmodel_n = data_train['CarModel'].nunique()
color_n = data_train['Color'].nunique()

In [18]:
carbrand_n, carmodel_n, color_n

(4, 142, 16)

In [19]:
model = Vin2ParamGRU(len(tokenizer.vocab), 50, 0, 200, 0.1, carbrand_n, carmodel_n, color_n)

In [20]:
trainer = Vin2ParamTrainer(model, train_dataloader, val_dataloader)

In [None]:
trainer.train(2)

  vins = torch.tensor(pad_sequence(batch_vins, batch_first=True, padding_value=0))


In [None]:
modelX = Vin2ParamGRU(len(tokenizer.vocab), 60, 0, 350, 0.2, carbrand_n, carmodel_n, color_n)



In [None]:
trainerX = Vin2ParamTrainer(modelX, train_dataloader, val_dataloader)

In [None]:
trainerX.train(2)

  vins = torch.tensor(pad_sequence(batch_vins, batch_first=True, padding_value=0))


epoch: 0 | step: 0 | train_loss: 9.120433807373047 | val_loss: 9.042512893676758 | brand_acc: 0.7901785714285714 | model_acc: 0.015066964285714286 | color_acc: 0.10379464285714286
epoch: 0 | step: 50 | train_loss: 6.168956756591797 | val_loss: 6.233157702854702 | brand_acc: 0.7901785714285714 | model_acc: 0.07672991071428571 | color_acc: 0.16183035714285715
epoch: 0 | step: 100 | train_loss: 6.0895586013793945 | val_loss: 5.933405944279262 | brand_acc: 0.7901785714285714 | model_acc: 0.13783482142857142 | color_acc: 0.27566964285714285
epoch: 0 | step: 150 | train_loss: 4.807186603546143 | val_loss: 4.77488340650286 | brand_acc: 0.7901785714285714 | model_acc: 0.27957589285714285 | color_acc: 0.427734375
epoch: 0 | step: 200 | train_loss: 2.7971065044403076 | val_loss: 2.835294553211757 | brand_acc: 0.8663504464285714 | model_acc: 0.6462053571428571 | color_acc: 0.6498325892857143
epoch: 0 | step: 250 | train_loss: 1.3907334804534912 | val_loss: 1.4534093652452742 | brand_acc: 0.926897

In [None]:
model.eval()
modelX.eval()
brand_acc = 0
model_acc = 0
color_acc = 0
for inp_val, label_val in val_dataloader:
    with torch.no_grad():
        out = model(inp_val)
        outX = modelX(inp_val)
    out = (out[0] + outX[0]) / 2.0, (out[1] + outX[1]) / 2.0, (out[2] + outX[2]) / 2.0

    brand_out = torch.argmax(out[0], dim=-1)
    model_out = torch.argmax(out[1], dim=-1)
    color_out = torch.argmax(out[2], dim=-1)

    brand_acc += torch.sum(brand_out == label_val[:, 0]).item()
    model_acc += torch.sum(model_out == label_val[:, 1]).item()
    color_acc += torch.sum(color_out == label_val[:, 2]).item()

val_size = len(val_dataloader) * 512
brand_acc /= val_size
model_acc /= val_size
color_acc /= val_size
print(brand_acc, model_acc, color_acc)

  vins = torch.tensor(pad_sequence(batch_vins, batch_first=True, padding_value=0))


0.9400111607142857 0.9162946428571429 0.9213169642857143
