# Парсер

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lxml.html as l
import requests

from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import torchvision.transforms.functional as TF
from torchvision.transforms import transforms
from torchvision.models import resnet50
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from torchvision import models

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
CARS_BROWSE_LINK = 'https://cars.usnews.com/cars-trucks/browse?sort=Alphabetical'
CARS_CLASS = 'Image__PictureImage-sc-412cjc-1 ilGKlg Image-sc-412cjc-2 DetailCardCarFinder__ProductImage-sc-18gh3vl-10 kQDDcT lnNlgK'

HEADER = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Sec-Ch-Ua': '"Not A(Brand";v="99", "Opera";v="107", "Chromium";v="121"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': "Windows",
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 OPR/107.0.0.0',
}

SAMPLE_SIZE = 500

def clone(element):
    return l.fromstring(l.tostring(element))

def parse_cars():
    data = []
    page = 1
    ind = 1
    while len(data) < SAMPLE_SIZE:
        current_link = CARS_BROWSE_LINK + "&page=" + str(page)
        request = requests.get(current_link, headers=HEADER)
        document = l.fromstring(request.text)

        for card in document.find_class(CARS_CLASS):
            card = clone(card)
            image = requests.get(card.get("src"), headers=HEADER)
            name = 'dataset_cars/output_image' + str(ind) + '.png'
            with open(name, 'wb') as file:
                file.write(image.content)
            ind += 1
            entry = [name, card.get("src"), "car"]
            data.append(entry)
            if len(data) >= SAMPLE_SIZE:
                break
        page += 1
        print(len(data))
    return data

def save_data_cars():
    data = parse_cars()
    df = pd.DataFrame(data, columns=['Name', 'Link', 'Class'])
    df.to_csv('cars_data.csv', index=False)

save_data_cars()

In [39]:
def get_motorcycle_link(n):
    return "https://spb.x-m.su/mototehnika/motocikly?PAGEN_2=" + str(n)
SITE='https://spb.x-m.su'

MOTORCYCLE_SAMPLE_SIZE = 300

def parse_motorcycles():
    data = []
    page = 1
    ind = 1
    while len(data) < MOTORCYCLE_SAMPLE_SIZE:
        current_link = get_motorcycle_link(page)
        request = requests.get(current_link)
        document = l.fromstring(request.text)
        
        for card in document.find_class("catalog_section_list_img_slider"):
            link = SITE + card.getchildren()[0].getchildren()[0].get("data-src")
            image = requests.get(link)
            name = 'dataset_motorcycles/output_image' + str(ind) + '.png'
            with open(name, 'wb') as file:
                file.write(image.content)
            ind += 1
            entry = [name, link, "motorcycle"]
            data.append(entry)
            if len(data) >= MOTORCYCLE_SAMPLE_SIZE:
                break
        page += 1
        print(len(data))
    return data

def save_data_motorcycles():
    data = parse_motorcycles()
    df = pd.DataFrame(data, columns=['Name', 'Link', 'Class'])
    df.to_csv('motorcycles_data.csv', index=False)

save_data_motorcycles()

20
40
60
80
100
120
140
160
180
200
220
240
260
280
300


In [45]:
def one_hot(s):
    if s == "car":
        return 0
    return 1
    
def get_tensor(name):
    image = Image.open(name)
    image = image.resize((224, 224))
    if image.mode == 'RGBA':
        image = image.convert('RGB')
    tensor = TF.to_tensor(image)
    tensor = tensor / 255
    return transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))(tensor)

class TrainDataset(Dataset):
    def __init__(self, data):
        self.tensors = list(map(get_tensor, data["Name"]))
        self.classes = list(map(one_hot, data["Class"]))

    def __len__(self):
        return len(self.classes)

    def __getitem__(self, index):
        return self.tensors[index], self.classes[index]
    
class TestDataset(Dataset):
    def __init__(self, data):
        self.names = list(data["Name"])
        self.classes = list(map(one_hot, data["Class"]))

    def __len__(self):
        return len(self.classes)

    def __getitem__(self, index):
        im = Image.open(self.names[index])
        im = im.resize((224, 224))
        tensor = TF.pil_to_tensor(im)
        tensor = tensor / 255
        tensor = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))(tensor)
        return tensor, self.classes[index]
    
def delete_bad_objects(data, indexes):
    for ind in indexes:
        data = data[~data["Name"].str.contains('output_image' + str(ind) + '.png')]
    return data

In [50]:
DATA1 = pd.read_csv("cars_data.csv")
DATA2 = pd.read_csv("motorcycles_data.csv")
DATA2 = delete_bad_objects(DATA2, [164, 293, 296])
DATA = pd.concat([DATA1, DATA2], ignore_index=True)

data_train, data_test = train_test_split(DATA, test_size=0.2)
train_dataset = TrainDataset(data_train)
test_dataset = TrainDataset(data_test)

BATCH_SIZE = 100
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [51]:
class CarClassifier(nn.Module):
    def __init__(self, classes):
        super(CarClassifier, self).__init__()
        self.model = resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.model.parameters():
            param.requires_grad = False
        self.fc1 = nn.Sequential(
            nn.Linear(1000, 32),
            nn.ReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(32, classes)
        )
    def forward(self, x):
        return self.fc2(self.fc1(self.model(x)))
    
CLASSES_SIZE = 2
MODEL = CarClassifier(CLASSES_SIZE)
MODEL.to(DEVICE)

CarClassifier(
  (model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
      

In [53]:
def train(model, epochs, lr):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_hist = []
    acc_hist = []

    model.train()
    for epoch in range(1, epochs + 1):
        running_loss = 0
        running_acc = 0
        
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            y_pred = model(x) 
            _, predicted = torch.max(y_pred.data, 1)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            running_acc += accuracy_score(y, predicted) 
            loss_hist.append(loss.item())
            acc_hist.append(accuracy_score(y, predicted))
        
        epoch_loss = running_loss / len(train_loader)
        epoch_acc = running_acc / len(train_loader)
        
        print(f'Epoch {epoch}')
        print(f'Train Loss: {epoch_loss}, train accuracy: {epoch_acc}')


train(MODEL, epochs=15, lr=0.0001)

Epoch 1
Train Loss: 0.6081145746367318, train accuracy: 0.8182625482625482
Epoch 2
Train Loss: 0.5542648094041007, train accuracy: 0.8961389961389961
Epoch 3
Train Loss: 0.5128189197608403, train accuracy: 0.9041312741312743
Epoch 4
Train Loss: 0.4729328879288265, train accuracy: 0.9102702702702702
Epoch 5
Train Loss: 0.43299668601581026, train accuracy: 0.9274131274131274
Epoch 6
Train Loss: 0.39970878618104116, train accuracy: 0.9294208494208495
Epoch 7
Train Loss: 0.36223351529666353, train accuracy: 0.9447104247104247
Epoch 8
Train Loss: 0.33583122917584013, train accuracy: 0.9518532818532819
Epoch 9
Train Loss: 0.3054905150617872, train accuracy: 0.9537065637065636
Epoch 10
Train Loss: 0.2865484356880188, train accuracy: 0.9547104247104246
Epoch 11
Train Loss: 0.2672461760895593, train accuracy: 0.9498455598455599
Epoch 12
Train Loss: 0.2503055738551276, train accuracy: 0.9465637065637065
Epoch 13
Train Loss: 0.23037887045315333, train accuracy: 0.9622779922779923
Epoch 14
Train L

In [55]:
MODEL.eval()
def test_model(model, loader):
    correct = 0
    correct_f_score = 0
    with torch.no_grad():
        for data in loader:
            x, y = data
            x, y = x.to(DEVICE), y.to(DEVICE)
            outputs = model(x)
            
            _, predicted = torch.max(outputs.data, 1)
            
            correct += accuracy_score(y, predicted)
            correct_f_score += f1_score(y, predicted, average='weighted')

    print(f'Accuracy of the model: {correct / len(loader)}')
    print(f'F-score of the model: {correct_f_score / len(loader)}')


print("Test")
test_model(MODEL, test_loader)
print("Train")
test_model(MODEL, train_loader)

Test
Accuracy of the model: 0.9766666666666666
F-score of the model: 0.9766048439903445
Train
Accuracy of the model: 0.9718532818532817
F-score of the model: 0.9716671164604113
