In [1]:
!pip install bs4
!pip install openpyxl
!pip install fake_useragent
!pip install loguru
!pip install pymorphy2
!pip install flair
!pip install transformers sentencepiece
!pip install sacremoses
!pip install jq
!pip install slovnet

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2
Collecting fake_useragent
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Downloading fake_useragent-1.5.1-py3-none-any.whl (17 kB)
Installing collected packages: fake_useragent
Successfully installed fake_useragent-1.5.1
Collecting loguru
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.2
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl.metadata (3.6 kB)
Collecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy2-dict

In [2]:
#  Импорт библиотек
import aiohttp
import asyncio
import pandas as pd
import numpy as np
import pickle
import pymorphy2
from urllib.parse import quote_plus
from bs4 import BeautifulSoup as bs
from datetime import date, timedelta, datetime
from fake_useragent import UserAgent
from loguru import logger
from abc import ABC, abstractmethod
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import random_projection
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_distances

import os
import gc
import nltk
from flair.data import Sentence
from flair.nn import Classifier
from torch.utils.data import DataLoader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import OrderedDict
from transformers import MPNetPreTrainedModel, MPNetModel, AutoTokenizer
import torch
import math
from navec import Navec
from slovnet import NER

In [3]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Парсеры

In [4]:
# Интерфейс для всех парсеров
class IParser(ABC):
  def __init__(self):
    self.data = pd.DataFrame()

  @abstractmethod
  async def _get_header(self):
    """ Метод для получения header """

  @abstractmethod
  async def _get_url(self):
    """ Метод для получения url """

  @abstractmethod
  async def _get_data(self, size: int):
    """ Метод для получения urls """

  async def _get_data_titles(self, url:str, title:str, text:str, date:str):
    return {
          url: 'url',
          title: 'title',
          text: 'text',
          date: 'date'
          }

  async def _fill_data(self, frames, data_titles):
    data = pd.concat(frames)#.drop_duplicates(subset=[data_titles['url']])
    data.rename(columns = data_titles, inplace = True)
    data = data.drop_duplicates(subset=['url'])
    data['publication_date'] = pd.to_datetime(data['date'], unit='s')
    data['company'] = company
    self.data = data[['url', 'title', 'text', 'publication_date', 'company']]

  async def start(self):
    logger.info(f'{self.__class__.__name__} start')
    await self._get_data()
    logger.info(f'{self.__class__.__name__} end')

In [5]:
# Парсер Лента
class Lenta(IParser):
    async def _get_header(self):
      pass

    async def _get_url(self):
      url = 'https://lenta.ru/search/v2/process?'\
          + 'query={}&'\
          + 'from={}&'\
          + 'size={}&'\
          + 'sort=2&'\
          + 'title_only=0&'\
          + 'domain=1&'\
          + 'modified%2Cformat=yyyy-MM-dd&'\
          + 'modified%2Cfrom={}&'\
          + 'modified%2Cto={}'\

      return url

    async def _get_data(self, size: int = 500):
      page = 0
      frames = []
      url = await self._get_url()

      async with aiohttp.ClientSession() as session:
        while True:
          current_url = url.format(company,page, size, date_start.strftime("%Y-%m-%d"), date_end.strftime("%Y-%m-%d"))
          async with session.get(current_url) as response:
            if response.status != 200:
              logger.error(f'{self.__class__.__name__} status code {response.status}, url = {current_url}')
              await asyncio.sleep(10)
              continue
            data = await response.json()

          if len(data['matches']) == 0:
            break

          frames.append(pd.DataFrame(data['matches']))
          page += size

      try:
        # data = pd.concat(frames).drop_duplicates(subset=['url'])
        # data['publication_date'] = pd.to_datetime(data['pubdate'], unit='s')

        data_titles = await self._get_data_titles('url', 'title', 'text', 'pubdate')
        await self._fill_data(frames, data_titles)
      except:
        logger.info(f'{self.__class__.__name__}: нет новостей')
        pass

In [6]:
# Парсер Rbc
class Rbc(IParser):
    async def _get_header(self):
      pass

    async def _get_url(self):
      url = 'https://www.rbc.ru/search/ajax/?' +\
        'query={}&' +\
        'dateFrom={}&' +\
        'dateTo={}&' +\
        'page={}'
        # 'material={0}'.format(param_dict['material'])
        # 'offset={0}&'.format(param_dict['offset']) +\
        # 'limit={0}&'.format(param_dict['limit']) +\

      return url

    async def _get_data(self, size: int = 1):
      page = 0
      date_from = date_start
      date_to = date_end
      frames = []
      url = await self._get_url()

      async with aiohttp.ClientSession() as session:
        while True:
          current_url = url.format(company, date_from.strftime('%d.%m.%Y'), date_to.strftime('%d.%m.%Y'), page)
          async with session.get(current_url) as response:
            data = await response.json()
            if response.status != 200:
              logger.error(f'{self.__class__.__name__} status code {response.status}, url = {current_url}')
              await asyncio.sleep(10)
              continue

          if not data['moreExists']:
            # logger.info(f'Rbc: {date_from} - {date_to} done {rbc_page} pages')
            # print(f'rbc {date_from} - {date_to}, {rbc_page} done, {url}')
            break

          search_table = pd.DataFrame(data['items'])
          search_table[['overview', 'text']] = await asyncio.gather(*(self._get_article_data(v, session) for v in search_table['fronturl']))


            # if 'publish_date_t' in search_table.columns:
            #     search_table.sort_values('publish_date_t', ignore_index=True)
          frames.append(search_table)

          page += size
          if page == 100:
            date_to = datetime.strptime(search_table['publish_date'].iat[-1][:10], '%Y-%m-%d').date()
            page = 0
            logger.info(f'{self.__class__.__name__}: новые даты {date_from} - {date_to}')

      try:
        # data = pd.concat(frames).drop_duplicates(subset=['fronturl'])
        # data['publish_date'] = pd.to_datetime(data['publish_date_t'], unit='s')
        # data['site'] = self.__class__.__name__
        # data.rename(columns = {'fronturl':'url'}, inplace = True)
        # self.data = data[['url', 'title', 'text', 'publish_date', 'site']]
        data_titles = await self._get_data_titles('fronturl', 'title', 'text', 'publish_date_t')
        await self._fill_data(frames, data_titles)
      except:
        logger.info(f'{self.__class__.__name__}: нет новостей')
        pass


    async def _get_article_data(self, url: str, session):
        """
        Возвращает описание и текст статьи по ссылке
        """
        # r = rq.get(url)
        # async with aiohttp.ClientSession() as session:
        async with session.get(url) as r:
          soup = bs(await r.text(), features="lxml") # features="lxml" чтобы не было warning
        div_overview = soup.find('div', {'class': 'article__text__overview'})
        if div_overview:
            overview = div_overview.text.replace('<br />','\n').strip()
        else:
            overview = None
        p_text = soup.find_all('p')
        if p_text:
            text = ' '.join(map(lambda x:
                                x.text.replace('<br />','\n').strip(),
                                p_text))
        else:
            text = None

        return overview, text

# Удаление мусора

In [7]:
class RepeatRemover():
    morph = pymorphy2.MorphAnalyzer()
    morph_dct = {}

    def preprocess_text(self, text):
        text = text.lower()
        text = text.replace('\r', ' ').replace('\n', ' ')
        text  = self.only_valid_symb(text)
        norm_text = self.only_valid_forms(text)
        return norm_text


    def only_valid_symb(self, text: str)-> str:
        valid = set('йцукенгшщзхъфывапролджэячсмитьбюё ')
        txt =  ''.join(x for x in text if x in valid)
        while '  ' in txt:
            txt = txt.replace('  ', ' ')
        return txt.strip()

    def only_valid_forms(self, text)->str:
        filter_forms = {'ADJF', 'ADJS', 'COMP', 'NUMR', 'NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ'}
        lst = []
        for word in text.split():
            tag = self.morph_parse(word)
            if any([x in tag.tag for x in filter_forms]):
                continue
            lst.append(tag.normal_form)
        return ' '.join(lst)

    def morph_parse(self, word:str)->str:
        if word not in self.morph_dct:
            tag = self.morph.parse(word)[0]
            self.morph_dct[word] = tag
        return self.morph_dct[word]

    def start(self, articles):
        data = articles.to_dict('records')
        _text = [self.preprocess_text(x['text']) for x in data if type(x['text']) is str and x['text']!='']
        vectorized = TfidfVectorizer(min_df=2).fit_transform(_text)

        __X = np.ceil(vectorized.toarray())
        dist = pairwise_distances(__X, metric='cosine')
        np.fill_diagonal(dist, 1)
        pairs = set()
        for x, y in np.argwhere(dist < 0.15):
            pairs.add(tuple(sorted([x, y])))

        clones = set()
        for pair in pairs:
            copy_1 = data[pair[0]]
            copy_2 = data[pair[1]]
            if abs((copy_1['publication_date'] - copy_2['publication_date']).days) < 7:
                clones.add(pair[1])

        data_array = np.array(data)
        data_array_unique = np.delete(data_array, list(clones))
        data_unique = data_array_unique.tolist()
        return pd.DataFrame(data_unique)


# Классификация с помощью ESGify

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [9]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Definition of ESGify class because of custom,sentence-transformers like, mean pooling function and classifier head
class ESGify(MPNetPreTrainedModel):
    """Model for Classification ESG risks from text."""

    def __init__(self,config): #tuning only the head
        """
        """
        super().__init__(config)
        # Instantiate Parts of model
        self.mpnet = MPNetModel(config,add_pooling_layer=False)
        self.id2label =  config.id2label
        self.label2id =  config.label2id
        self.classifier = torch.nn.Sequential(OrderedDict([('norm',torch.nn.BatchNorm1d(768)),
                                                ('linear',torch.nn.Linear(768,512)),
                                                ('act',torch.nn.ReLU()),
                                                ('batch_n',torch.nn.BatchNorm1d(512)),
                                                ('drop_class', torch.nn.Dropout(0.2)),
                                                ('class_l',torch.nn.Linear(512 ,47))]))


    def forward(self, input_ids, attention_mask):
         # Feed input to mpnet model
        outputs = self.mpnet(input_ids=input_ids,
                             attention_mask=attention_mask)

        # mean pooling dataset and eed input to classifier to compute logits
        logits = self.classifier( mean_pooling(outputs['last_hidden_state'],attention_mask))

        # apply sigmoid
        logits  = 1.0 / (1.0 + torch.exp(-logits))
        return logits

In [10]:
def get_texts_with_masks(texts):
    navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
    ner = NER.load('slovnet_ner_news_v1.tar')
    ner.navec(navec)

    stop_words = set(stopwords.words('russian'))
    tag_list = ['FAC','LOC','ORG','PER']
    texts_with_masks = []
    for example_sent in texts:
        filtered_sentence = []
        word_tokens = word_tokenize(example_sent)
        # converts the words in word_tokens to lower case and then checks whether
        #they are present in stop_words or not
        for w in word_tokens:
            if w.lower() not in stop_words:
                filtered_sentence.append(w)
        # make a sentence
        sent = ' '.join(filtered_sentence)
        markup = ner(sent)
        k = 0
        new_string = ''
        start_t = 0
        for span in markup.spans:
              if span.type in tag_list:
                if span.start>start_t :
                    new_string+=sent[start_t:span.start]
                start_t = span.stop
                new_string+= f'<{span.type}>'
        new_string+=sent[start_t:-1]
        texts_with_masks.append(new_string)
    return texts_with_masks

In [11]:
def get_classes(df, texts_with_masks, model, tokenizer):
    weights = pd.read_excel('trashs_esgify_ru.xlsx')
    weights_dict = {}
    for row in weights.index:
        weights_dict[weights['Unnamed: 0'][row]] =  weights['trash'][row]

    for e in range(0, len(df), 10):
        last_elem = e + 10
        if e + 10 > len(df):
            last_elem = len(df)
        text_slice = texts_with_masks[e:last_elem]
        to_model = tokenizer.batch_encode_plus(
                          text_slice,
                          add_special_tokens=True,
                          max_length=512,
                          return_token_type_ids=False,
                          padding="max_length",
                          truncation=True,
                          return_attention_mask=True,
                          return_tensors='pt',
                    )
        to_model.to(device)
        results = model(**to_model)
        results.to(device)
        # for i in range(len(results)):
        #     print('-------------')
        #     for j in torch.topk(results, k=10).indices.tolist()[i]:
        #         print(f"{model.id2label[j]}: {np.round(results[i][j].item(), 3)}")
        for i in range(len(df[e:last_elem])):
            classes = ''
            for j in range(47):
                label = model.id2label[j]
                if results[i][j].item() >= weights_dict[label]:
                    classes += f'{label};'
            if len(classes) == 0:
                classes = 'Not Relevant to ESG;'
            classes = classes[:len(classes) - 1]
            df.loc[e + i, "classes"] = classes
        gc.collect()
        torch.cuda.empty_cache()
    return df

In [12]:
def classify(df):
    model = ESGify.from_pretrained('ai-lab/ESGify')
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained('ai-lab/ESGify')

    for i in range(len(df)):
        if type(df['text'][i]) == float or not bool(df['text'][i].replace(' ', '')):
            df.drop(i, inplace=True)
            continue
        if len(df['text'][i]) > 5000:
            df.loc[i, "text"] = df['text'][i][:5000]
    ru_texts = list(df['text'])
    texts_with_masks = get_texts_with_masks(ru_texts)
    df['classes'] = ''
    df = get_classes(df, ru_texts, model, tokenizer)
    df = df.loc[df['classes'] != "Not Relevant to ESG"]
    return df

# Начало работы

In [13]:
# Настройка
company = 'сбербанк'
date_start = date(year=2024, month=9, day=1)
date_end = date.today()

In [14]:
# Данные для работы с header (пока не используется)
query = quote_plus(company)
ua = UserAgent()

In [15]:
# Это для ассинхронного запуска
parsers = [Rbc(),Lenta()]

async def main():
  tasks = []
  for parser in parsers:
    tasks.append(parser.start())

  await asyncio.gather(*tasks)

In [16]:
# Запускаем асинхронную работу парсеров
await main()

[32m2024-10-23 07:29:13.765[0m | [1mINFO    [0m | [36m__main__[0m:[36mstart[0m:[36m35[0m - [1mRbc start[0m
[32m2024-10-23 07:29:13.767[0m | [1mINFO    [0m | [36m__main__[0m:[36mstart[0m:[36m35[0m - [1mLenta start[0m
[32m2024-10-23 07:29:19.740[0m | [1mINFO    [0m | [36m__main__[0m:[36mstart[0m:[36m37[0m - [1mLenta end[0m
[32m2024-10-23 07:30:20.981[0m | [1mINFO    [0m | [36m__main__[0m:[36mstart[0m:[36m37[0m - [1mRbc end[0m


In [17]:
len(parsers[1].data)

112

In [18]:
parsers[1].data.head()

Unnamed: 0,url,title,text,publication_date,company
0,https://lenta.ru/news/2024/10/22/v-app-store-p...,В App Store появилась новая официальная версия...,Фото: Ute Grabowsky / Globallookpress.com Олег...,2024-10-22 07:26:00,сбербанк
1,https://lenta.ru/news/2024/10/18/agenta/,Сбер запустил в Минфине России AI-агента,Александр Ведяхин Фото: Алексей Даничев / РИА ...,2024-10-18 12:35:00,сбербанк
2,https://lenta.ru/news/2024/10/18/servis/,В Сбере оценили сервис «цифровой профиль гражд...,Александр Ведяхин Фото: Эмин Джафаров / Коммер...,2024-10-18 11:23:06,сбербанк
3,https://lenta.ru/news/2024/10/18/zaklyuchennyh/,В Корпорации МСП назвали объем заключенных МСП...,Фото: Shutterstock / Fotodom Татьяна Романова ...,2024-10-18 09:30:00,сбербанк
4,https://lenta.ru/news/2024/10/17/vystupil/,Сбер выступил участником новых пилотных проект...,Фото: Ирина Бужор / Коммерсантъ Алена Шаповало...,2024-10-17 14:05:00,сбербанк


In [19]:
# Тут начало удаления повторок
df = pd.concat([parser.data for parser in parsers], ignore_index=True)

In [20]:
len(df)

452

In [21]:
remover = RepeatRemover()
df_clear = remover.start(df)

In [22]:
len(df_clear)

441

In [23]:
df_clear.head()

Unnamed: 0,url,title,text,publication_date,company
0,https://companies.rbc.ru/news/cPKE2axEFI/unive...,Университеты становятся центрами притяжения и ...,За последнее десятилетие университеты прошли т...,2024-10-22 15:12:53,сбербанк
1,https://www.rbc.ru/finances/22/10/2024/67177f5...,В Думе сообщили об интересе китайских банков к...,Банки из Китая и Вьетнама подают сигналы о жел...,2024-10-22 11:24:24,сбербанк
2,https://realty.rbc.ru/news/671574cb9a7947c3fee...,«Эффект шринкфляции»: спрос на строительство ч...,"Получайте рассылку с новостями, которые касают...",2024-10-22 11:12:55,сбербанк
3,https://www.rbc.ru/finances/22/10/2024/6717717...,Минфин оценил будущее IPO «Дом.РФ» в ₽15 млрд,Госкорпорация «Дом.РФ» может привлечь на IPO (...,2024-10-22 10:52:55,сбербанк
4,https://www.rbc.ru/finances/22/10/2024/671760a...,"В Минфине указали, что госкомпании «расслабили...",В 2025 году компании с государственным участие...,2024-10-22 09:44:56,сбербанк


Далее для корректной работы нужный файлы: trashs_esgify_ru.xlsx, slovnet_ner_news_v1.tar и navec_news_v1_1B_250K_300d_100q.tar

In [24]:
# Начало классфикации
df_with_classes = classify(df_clear)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/5.48k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/437M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

In [25]:
len(df_with_classes)

440

In [26]:
df_with_classes.head()

Unnamed: 0,url,title,text,publication_date,company,classes
0,https://companies.rbc.ru/news/cPKE2axEFI/unive...,Университеты становятся центрами притяжения и ...,За последнее десятилетие университеты прошли т...,2024-10-22 15:12:53,сбербанк,Environmental Management;Soil and Groundwater ...
1,https://www.rbc.ru/finances/22/10/2024/67177f5...,В Думе сообщили об интересе китайских банков к...,Банки из Китая и Вьетнама подают сигналы о жел...,2024-10-22 11:24:24,сбербанк,Soil and Groundwater Impact
2,https://realty.rbc.ru/news/671574cb9a7947c3fee...,«Эффект шринкфляции»: спрос на строительство ч...,"Получайте рассылку с новостями, которые касают...",2024-10-22 11:12:55,сбербанк,Soil and Groundwater Impact
3,https://www.rbc.ru/finances/22/10/2024/6717717...,Минфин оценил будущее IPO «Дом.РФ» в ₽15 млрд,Госкорпорация «Дом.РФ» может привлечь на IPO (...,2024-10-22 10:52:55,сбербанк,Environmental Management;Soil and Groundwater ...
4,https://www.rbc.ru/finances/22/10/2024/671760a...,"В Минфине указали, что госкомпании «расслабили...",В 2025 году компании с государственным участие...,2024-10-22 09:44:56,сбербанк,Environmental Management;Soil and Groundwater ...


In [27]:
df_with_classes.to_csv(f'with_classes_{company}.csv')