# Парсер

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lxml.html as l
import requests

import nltk
from nltk.corpus import stopwords
from collections import Counter
from pymystem3 import Mystem
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Master\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
BROWSE_LINK = 'https://www.banki.ru/services/responses/bank/mts-bank/?page='
WEBSITE = 'https://www.banki.ru'

HEADER = {
    'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7' ,
    'Accept-Encoding' : 'gzip, deflate, br' ,
    'Accept-Language': 'en-US,en;q=0.9' ,
    'Sec-Ch-Ua' : '"Not A(Brand";v="99", "Opera";v="107", "Chromium";v="121"' ,
    'Sec-Ch-Ua-Mobile': '?0' ,
    'Sec-Ch-Ua-Platform': "Windows" ,
    'Sec-Fetch-Dest': 'document' ,
    'Sec-Fetch-Mode': 'navigate' ,
    'Sec-Fetch-Site': 'same-origin' ,
    'Sec-Fetch-User': '?1' ,
    'Upgrade-Insecure-Requests': '1' ,
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 OPR/107.0.0.0' ,
}

SAMPLE_SIZE = 1000

def clone(element):
    return l.fromstring(l.tostring(element))

def parse():
    data = []
    page = 1
    while len(data) < SAMPLE_SIZE:
        current_link = BROWSE_LINK + str(page) + "&is_countable=on"
        request = requests.get(current_link, headers = HEADER)
        document = l.fromstring(request.text)
        for card in document.find_class("la8a5ef73"):
            card = clone(card)
            links = card.xpath('//a/@href')
            if links[0].find("/services/responses/bank/response") != -1:
                instance = parse_instance(links[0])
                data.append(instance)
            if len(data) >= SAMPLE_SIZE:
                break
        page += 1
        print("len: " + str(len(data)))
    return data

def parse_instance(link):
    instance = []
    link = WEBSITE + link
    request = requests.get(link, headers=HEADER)
    document = l.fromstring(request.text)

    text = document.find_class("lb1789875 markdown-inside markdown-inside--list-type_circle-fill")[0].text_content()
    instance.append(text)
    score = document.find_class("rating-grade")[0].text_content();
    instance.append(score)
    return instance

def save_data():
    data = parse()
    df = pd.DataFrame(data, columns=['Review', 'Score'])
    df.to_csv('text_data.csv', index=False)

save_data()

In [2]:
STOPWORDS = stopwords.words('russian')
MYSTEM = Mystem()
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def text_words(text):
    return [word for word in MYSTEM.lemmatize(text) if not word.isspace() and not word in STOPWORDS]

def process_texts(texts):
    rus_alph = set(list('йцукенгшщзхъфывапролджэёячсмитьбю'))
    drop_not_letters = lambda s: ''.join(list(filter(lambda c: c in rus_alph or c == ' ', s)))
    texts = [drop_not_letters(comm.lower().replace("\n", " ").replace(".", " ").replace(",", " ")) for comm in texts]
    texts = list(map(text_words, texts))
    return texts

def count_all_words(texts):
    all_words = []
    for text in texts:
        all_words.extend(text)
    counts = Counter(all_words)
    return counts

def unwrap(counter):
    return list(counter.keys()), list(counter.values())

def embeddings_vocab(words):
    return {words[i] : i for i in range(len(words))}

def create_embeddings(texts_, vocabulary):
    texts = [text for text in texts_]
    length = max(list(map(len, texts)))
    for i in range(len(texts)):
        texts[i] = [vocabulary.get(word) for word in texts[i]]
        texts[i].extend([len(vocabulary) for _ in range(length - len(texts[i]))])
    return texts

def process_score(score):
    def binary(val):
        if val <= 3:
            return 0
        return 1
    return list(map(binary, score))

def fix_data(texts, score):
    processed_texts = process_texts(texts)
    processed_score = process_score(score)
    skip_long = [(processed_texts[i], processed_score[i]) for i in range(len(texts)) if len(processed_texts[i]) < 180]
    return [x[0] for x in skip_long], [x[1] for x in skip_long]


In [3]:
DATA = pd.read_csv("text_data.csv")
TEXTS = DATA["Review"].tolist()
SCORE = DATA["Score"].tolist()

NEW_TEXTS, NEW_SCORE = fix_data(TEXTS, SCORE)

[WORDS, COUNTS] = unwrap(count_all_words(NEW_TEXTS))

VOCABULARY = embeddings_vocab(WORDS)
EMBEDDINGS = create_embeddings(NEW_TEXTS, VOCABULARY)
VOCABULARY_SIZE = len(WORDS) + 1

In [5]:
class CustomDataset(Dataset):
    def __init__(self, texts, score):
        self.texts = texts
        self.score = score

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        return torch.tensor(self.texts[index]), self.score[index]


class ScoreClassification(nn.Module):
    def __init__(self, vocabulary_size, embeddings_dim, hidden_dim, classes_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocabulary_size, embeddings_dim, padding_idx=0)
        self.lstm = nn.LSTM(embeddings_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, classes_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        output, (hidden, cell) = self.lstm(x)
        x = self.linear(hidden[-1])
        return x


def train(model, epochs, lr):
    model.to(DEVICE)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    model.train()
    for epoch in range(1, epochs + 1):
        total_loss = 0
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        average_train_loss = total_loss / len(train_loader)
        if epoch % 10 == 0:
            print(f'Epoch {epoch}')
            print(f'Train Loss: {average_train_loss}')

texts_train, texts_test, score_train, score_test = train_test_split(EMBEDDINGS, NEW_SCORE, test_size=0.2, random_state=1)
train_dataset = CustomDataset(texts_train, score_train)
test_dataset = CustomDataset(texts_test, score_test)

BATCH_SIZE = 128
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

HIDDEN_DIM = 300
CLASSES_SIZE = 2

MODEL = ScoreClassification(VOCABULARY_SIZE, len(EMBEDDINGS), HIDDEN_DIM, CLASSES_SIZE)
train(MODEL, epochs=200, lr=0.05)

Epoch 10
Train Loss: 0.47334613899389905
Epoch 20
Train Loss: 0.3916773498058319
Epoch 30
Train Loss: 0.3460238426923752
Epoch 40
Train Loss: 0.36917820076147717
Epoch 50
Train Loss: 0.318291408320268
Epoch 60
Train Loss: 0.3186407958467801
Epoch 70
Train Loss: 0.2930892010529836
Epoch 80
Train Loss: 0.2683047999938329
Epoch 90
Train Loss: 0.27024005353450775
Epoch 100
Train Loss: 0.24579465637604395
Epoch 110
Train Loss: 0.2762375424305598
Epoch 120
Train Loss: 0.22375448048114777
Epoch 130
Train Loss: 0.20041662951310477
Epoch 140
Train Loss: 0.23873620728651682
Epoch 150
Train Loss: 0.22707791378100714
Epoch 160
Train Loss: 0.31766921033461887
Epoch 170
Train Loss: 0.199457714955012
Epoch 180
Train Loss: 0.15615902344385782
Epoch 190
Train Loss: 0.12730027735233307
Epoch 200
Train Loss: 0.132468710343043


In [6]:
MODEL.eval()

ScoreClassification(
  (embeddings): Embedding(5268, 937, padding_idx=0)
  (lstm): LSTM(937, 300, batch_first=True)
  (linear): Linear(in_features=300, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def test_model(model, test_loader):
    correct = 0
    total = 0
    correct_f_score = 0
    with torch.no_grad():
        for data in test_loader:
            x, y = data
            x, y = x.to(DEVICE), y.to(DEVICE)
            outputs = model(x)
            _, predicted = torch.max(outputs.data, 1)
            total += y.size(0)
            correct += accuracy_score(y, predicted)
            correct_f_score += f1_score(y, predicted, average='weighted')
            
    print(f'Accuracy of the model: {100 * correct / total}%')
    print(f'F-score of the model: {100 * correct_f_score / total}%')

print("Test")
test_model(MODEL, test_loader)
print("Train")
test_model(MODEL, train_loader)

Test
Accuracy of the model: 0.8837544326241136%
F-score of the model: 0.8795065011820331%
Train
Accuracy of the model: 0.7624271199519849%
F-score of the model: 0.7617321735221108%


In [19]:
print(DEVICE)

cpu
