In [1]:
!pip install bs4
!pip install openpyxl
!pip install fake_useragent
!pip install loguru
!pip install pymorphy2
!pip install flair
!pip install transformers sentencepiece
!pip install sacremoses
!pip install jq
!pip install slovnet
!pip install googletrans==3.1.0a0

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2
Collecting fake_useragent
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Downloading fake_useragent-1.5.1-py3-none-any.whl (17 kB)
Installing collected packages: fake_useragent
Successfully installed fake_useragent-1.5.1
Collecting loguru
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.2
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl.metadata (3.6 kB)
Collecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy2-dict

In [2]:
#  Импорт библиотек
import aiohttp
import asyncio
import pandas as pd
import numpy as np
import pickle
import pymorphy2
import re
import requests
import uuid
import json
from urllib.parse import quote_plus
from bs4 import BeautifulSoup as bs
from datetime import date, timedelta, datetime, timezone
from fake_useragent import UserAgent
from loguru import logger
from abc import ABC, abstractmethod
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import random_projection
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_distances

import os
import gc
import nltk
from flair.data import Sentence
from flair.nn import Classifier as Classifier1
from torch.utils.data import DataLoader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import OrderedDict
from transformers import MPNetPreTrainedModel, MPNetModel, AutoTokenizer
import torch
import math
from navec import Navec
from slovnet import NER
from googletrans import Translator

In [3]:
# Для работы с переменными среды
from google.colab import userdata

In [4]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# ИНН

In [5]:
# Функция для определения имени компании по ИНН
def inn_to_company(inn: str):
  url_inn = f'https://egrul.itsoft.ru/{inn}.json'
  inn_request = requests.get(url_inn)
  inn_json = inn_request.json()
  full_company_name = inn_json['СвЮЛ']['СвНаимЮЛ']['СвНаимЮЛСокр']['@attributes']['НаимСокр']
  company_name = re.search(r'\".+\"', full_company_name)
  return company_name.group(0).replace('"', '').lower()

# Парсеры

In [6]:
# Интерфейс для всех парсеров
class IParser(ABC):
  def __init__(self):
    self.data = pd.DataFrame()

  @abstractmethod
  async def _get_header(self):
    """ Метод для получения header """

  @abstractmethod
  async def _get_url(self):
    """ Метод для получения url """

  @abstractmethod
  async def _get_data(self, size: int):
    """ Метод для получения urls """

  async def _get_data_titles(self, url:str, title:str, text:str, date:str):
    return {
          url: 'url',
          title: 'title',
          text: 'text',
          date: 'date'
          }

  async def _fill_data(self, frames, data_titles):
    data = pd.concat(frames)
    data.rename(columns = data_titles, inplace = True)
    data = data.drop_duplicates(subset=['url'])
    data['publication_date'] = pd.to_datetime(data['date'], unit='s')
    data['company'] = company
    data['inn'] = inn
    self.data = data[['url', 'title', 'text', 'publication_date', 'inn','company']].dropna(axis=0, how='any')

  async def start(self):
    logger.info(f'{self.__class__.__name__} start')
    await self._get_data()
    logger.info(f'{self.__class__.__name__} end')

In [7]:
# Парсер Лента
class Lenta(IParser):
    async def _get_header(self):
      pass

    async def _get_url(self):
      url = 'https://lenta.ru/search/v2/process?'\
          + 'query={}&'\
          + 'from={}&'\
          + 'size={}&'\
          + 'sort=2&'\
          + 'title_only=0&'\
          + 'domain=1&'\
          + 'modified%2Cformat=yyyy-MM-dd&'\
          + 'modified%2Cfrom={}&'\
          + 'modified%2Cto={}'\

      return url

    async def _get_data(self, size: int = 500):
      page = 0
      frames = []
      url = await self._get_url()

      async with aiohttp.ClientSession() as session:
        while True:
          current_url = url.format(company,page, size, date_start.strftime("%Y-%m-%d"), date_end.strftime("%Y-%m-%d"))
          async with session.get(current_url) as response:
            if response.status != 200:
              logger.error(f'{self.__class__.__name__} status code {response.status}, url = {current_url}')
              await asyncio.sleep(10)
              continue
            data = await response.json()

          if len(data['matches']) == 0:
            break

          frames.append(pd.DataFrame(data['matches']))
          page += size

      try:
        data_titles = await self._get_data_titles('url', 'title', 'text', 'pubdate')
        await self._fill_data(frames, data_titles)
      except:
        logger.info(f'{self.__class__.__name__}: нет новостей')
        pass

In [8]:
# Парсер Rbc
class Rbc(IParser):
    async def _get_header(self):
      pass

    async def _get_url(self):
      url = 'https://www.rbc.ru/search/ajax/?' +\
        'query={}&' +\
        'dateFrom={}&' +\
        'dateTo={}&' +\
        'page={}'

      return url

    async def _get_data(self, size: int = 1):
      page = 0
      date_from = date_start
      date_to = date_end
      frames = []
      url = await self._get_url()

      async with aiohttp.ClientSession() as session:
        while True:
          current_url = url.format(company, date_from.strftime('%d.%m.%Y'), date_to.strftime('%d.%m.%Y'), page)
          async with session.get(current_url) as response:
            data = await response.json()
            if response.status != 200:
              logger.error(f'{self.__class__.__name__} status code {response.status}, url = {current_url}')
              await asyncio.sleep(10)
              continue

          if not data['moreExists']:
            break

          search_table = pd.DataFrame(data['items'])

          text = []
          for v in search_table['fronturl']:
              text.append(await self._get_article_data(v, session))

          search_table['text'] = text

          frames.append(search_table)

          page += size
          logger.info(f"{page}")
          if page == 100:
            date_to = datetime.strptime(search_table['publish_date'].iat[-1][:10], '%Y-%m-%d').date()
            page = 0
            logger.info(f'{self.__class__.__name__}: новые даты {date_from} - {date_to}')

      try:
        data_titles = await self._get_data_titles('fronturl', 'title', 'text', 'publish_date_t')
        await self._fill_data(frames, data_titles)
      except:
        logger.info(f'{self.__class__.__name__}: нет новостей')
        pass


    async def _get_article_data(self, url: str, session):
        """
        Возвращает описание и текст статьи по ссылке
        """
        async with session.get(url) as r:
          soup = bs(await r.text(), features="lxml") # features="lxml" чтобы не было warning
        p_text = soup.find_all('p')
        if p_text:
            text = ' '.join(map(lambda x:
                                x.text.replace('<br />','\n').strip(),
                                p_text))
        else:
            text = None

        return text

In [9]:
# Парсер Регнум
class Regnum(IParser):
    async def _get_header(self):
      pass

    async def _get_url(self):
      url = 'https://regnum.ru/api/search/materials?' +\
        'page={}&' +\
        'searchText={}&' +\
        'typeIds=0&' +\
        'typeIds=1&' +\
        'typeIds=2&' +\
        'typeIds=3&' +\
        'typeIds=6&' +\
        'dateFrom={}&' +\
        'dateTo={}&' +\
        'order=desc'

      return url

    async def _get_data(self, size: int = 1):
        page = 0
        frames = []
        url = await self._get_url()

        async with aiohttp.ClientSession() as session:
          while True:
            current_url = url.format(page, company, date_start.strftime("%d-%m-%Y"), date_end.strftime("%d-%m-%Y"))
            async with session.get(current_url) as response:
              if response.status != 200:
                logger.error(f'{self.__class__.__name__} status code {response.status}, url = {current_url}')
                await asyncio.sleep(10)
                continue
              data = await response.json()

            if data['pagination']['next'] == 0:
              break

            search_table = pd.DataFrame(data['hits'])

            text = []
            for v in search_table['url']:
              text.append(await self._get_article_data(v, session))

            search_table['text'] = text

            frames.append(search_table)
            page += size

        try:
          data_titles = await self._get_data_titles('url', 'header', 'text', 'date')
          await self._fill_data(frames, data_titles)
        except:
          logger.info(f'{self.__class__.__name__}: нет новостей')
          pass


    async def _get_article_data(self, url: str, session):
        """
        Возвращает описание и текст статьи по ссылке
        """
        async with session.get(f'https://regnum.ru{url}') as r:
          soup = bs(await r.text(), features="lxml") # features="lxml" чтобы не было warning
        p_text = soup.find_all('p')
        if p_text:
            text = ' '.join(map(lambda x:
                                x.text.replace('<br />','\n').strip(),
                                p_text))
        else:
            text = None
        return text

# Удаление мусора

In [10]:
class RepeatRemover:
    morph = pymorphy2.MorphAnalyzer()
    morph_dct = {}

    def preprocess_text(self, text):
        text = text.lower()
        text = text.replace('\r', ' ').replace('\n', ' ')
        text  = self.only_valid_symb(text)
        norm_text = self.only_valid_forms(text)
        return norm_text


    def only_valid_symb(self, text: str)-> str:
        valid = set('йцукенгшщзхъфывапролджэячсмитьбюё ')
        txt =  ''.join(x for x in text if x in valid)
        while '  ' in txt:
            txt = txt.replace('  ', ' ')
        return txt.strip()

    def only_valid_forms(self, text)->str:
        filter_forms = {'ADJF', 'ADJS', 'COMP', 'NUMR', 'NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ'}
        lst = []
        for word in text.split():
            tag = self.morph_parse(word)
            if any([x in tag.tag for x in filter_forms]):
                continue
            lst.append(tag.normal_form)
        return ' '.join(lst)

    def morph_parse(self, word:str)->str:
        if word not in self.morph_dct:
            tag = self.morph.parse(word)[0]
            self.morph_dct[word] = tag
        return self.morph_dct[word]

    def start(self, articles):
        data = articles.to_dict('records')
        _text = [self.preprocess_text(x['text']) for x in data if type(x['text']) is str and x['text']!='']
        vectorized = TfidfVectorizer(min_df=2).fit_transform(_text)

        __X = np.ceil(vectorized.toarray())
        dist = pairwise_distances(__X, metric='cosine')
        np.fill_diagonal(dist, 1)
        pairs = set()
        for x, y in np.argwhere(dist < 0.15):
            pairs.add(tuple(sorted([x, y])))

        clones = set()
        for pair in pairs:
            copy_1 = data[pair[0]]
            copy_2 = data[pair[1]]
            if abs((copy_1['publication_date'] - copy_2['publication_date']).days) < 7:
                clones.add(pair[1])

        data_array = np.array(data)
        data_array_unique = np.delete(data_array, list(clones))
        data_unique = data_array_unique.tolist()
        return pd.DataFrame(data_unique)


# Проверка вхождений

In [11]:
async def is_company_in_text(text ,pattern):
  return bool(re.search(pattern,text))

async def get_occurrences_df(df, pattern):
  df['is_company_in_text'] = await asyncio.gather(*(is_company_in_text(v, pattern) for v in df['text']))
  return df[df['is_company_in_text']]

# LLM

In [12]:
def gpt_response(question):
    url = "http://5.39.220.103:5009/ask"

    data = {
        "messages": [
            {"role": "system", "content":
                                       f"Пиши только одно слово - да или нет."
                                       },
            {"role": "user", "content": f"{question}"}
        ]
    }

    response = requests.post(url, json=data)

    if response.status_code == 200:
        response_data = response.json()
        return response_data['response']
    else:
        return f"Error: {response.status_code}, {response.text}"

# GigaChat

In [13]:
class GigaChat:
    def __init__(self, auth):
        self.auth_token = auth
        self.access_token = ''
        self.token_expires_at = 0
        self.__update_token()

    def __update_token(self, scope='GIGACHAT_API_PERS'):
        rq_uid = str(uuid.uuid4())
        url = "https://ngw.devices.sberbank.ru:9443/api/v2/oauth"

        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': 'application/json',
            'RqUID': rq_uid,
            'Authorization': f'Basic {self.auth_token}'
        }

        payload = {
            'scope': scope
        }
        response = requests.post(url, headers=headers, data=payload, verify=False)

        try:
            response = requests.post(url, headers=headers, data=payload, verify=False)
            self.access_token = response.json()['access_token']
            self.token_expires_at = int(response.json()['expires_at'])
        except requests.RequestException as e:
            print(f"Ошибка: {str(e)}")
            self.access_token = ''
            self.token_expires_at = 0

    def get_response(self, question):
        if int(self.token_expires_at) < datetime.now(timezone.utc).timestamp() * 1000:
            self.__update_token()
        url = "https://gigachat.devices.sberbank.ru/api/v1/chat/completions"
        data = [
            {
                'role': 'system',
                'content': 'Пиши только одно слово - да или нет.'
            }
        ]

        # Подготовка данных запроса в формате JSON
        payload = json.dumps({
          "model": "GigaChat",
          "messages": [
            {
              "role": "system",
              "content": "Пиши только одно слово - да или нет."
            },
            {
              "role": "user",
              "content": f"{question}"
            }
          ],
          "stream": False,
          "update_interval": 0
        })

        # Заголовки запроса
        headers = {
            'Content-Type': 'application/json',
            'Accept': 'application/json',
            'Authorization': f'Bearer {self.access_token}'
        }

        # Выполнение POST-запроса и возвращение ответа
        try:
            response = requests.request("POST", url, headers=headers, data=payload, verify=False)
            response_data = response.json()['choices'][0]['message']['content']
            return response_data
        except requests.RequestException as e:
            return f"Произошла ошибка: {str(e)}"

# Классификация с помощью ESGify

In [14]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Definition of ESGify class because of custom,sentence-transformers like, mean pooling function and classifier head
class ESGify(MPNetPreTrainedModel):
    """Model for Classification ESG risks from text."""

    def __init__(self,config): #tuning only the head
        """
        """
        super().__init__(config)
        # Instantiate Parts of model
        self.mpnet = MPNetModel(config,add_pooling_layer=False)
        self.id2label =  config.id2label
        self.label2id =  config.label2id
        self.classifier = torch.nn.Sequential(OrderedDict([('norm',torch.nn.BatchNorm1d(768)),
                                                ('linear',torch.nn.Linear(768,512)),
                                                ('act',torch.nn.ReLU()),
                                                ('batch_n',torch.nn.BatchNorm1d(512)),
                                                ('drop_class', torch.nn.Dropout(0.2)),
                                                ('class_l',torch.nn.Linear(512 ,47))]))


    def forward(self, input_ids, attention_mask):
         # Feed input to mpnet model
        outputs = self.mpnet(input_ids=input_ids,
                             attention_mask=attention_mask)

        # mean pooling dataset and eed input to classifier to compute logits
        logits = self.classifier( mean_pooling(outputs['last_hidden_state'],attention_mask))

        # apply sigmoid
        logits  = 1.0 / (1.0 + torch.exp(-logits))
        return logits

In [15]:
class Classifier:
    def __init__(self, device):
        self.model = ESGify.from_pretrained('ai-lab/ESGify')
        self.model.to(device)
        self.tokenizer = AutoTokenizer.from_pretrained('ai-lab/ESGify')

        weights = pd.read_excel('trashs_esgify_ru.xlsx')
        self.weights_dict = {}
        for row in weights.index:
            self.weights_dict[weights['Unnamed: 0'][row]] =  weights['trash'][row]

        navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
        self.ner = NER.load('slovnet_ner_news_v1.tar')
        self.ner.navec(navec)

        self.stop_words = set(stopwords.words('russian'))
        self.tag_list = ['FAC','LOC','ORG','PER']

    def get_texts_with_masks(self, texts):
        texts_with_masks = []
        for example_sent in texts:
            filtered_sentence = []
            word_tokens = word_tokenize(example_sent)
            # converts the words in word_tokens to lower case and then checks whether
            #they are present in stop_words or not
            for w in word_tokens:
                if w.lower() not in self.stop_words:
                    filtered_sentence.append(w)
            # make a sentence
            sent = ' '.join(filtered_sentence)
            markup = self.ner(sent)
            k = 0
            new_string = ''
            start_t = 0
            for span in markup.spans:
                  if span.type in self.tag_list:
                    if span.start>start_t :
                        new_string+=sent[start_t:span.start]
                    start_t = span.stop
                    new_string+= f'<{span.type}>'
            new_string+=sent[start_t:-1]
            texts_with_masks.append(new_string)
        return texts_with_masks

    def convert_texts(self, texts):
        stop_words = set(stopwords.words('english'))
        tagger = Classifier1.load('flair/ner-english-ontonotes')
        tag_list = ['FAC','LOC','ORG','PERSON']
        texts_with_masks = []
        for example_sent in texts:
            filtered_sentence = []
            word_tokens = word_tokenize(example_sent)
            # converts the words in word_tokens to lower case and then checks whether
            #they are present in stop_words or not
            for w in word_tokens:
                if w.lower() not in stop_words:
                    filtered_sentence.append(w)
            # make a sentence
            sentence = Sentence(' '.join(filtered_sentence))
            # run NER over sentence
            tagger.predict(sentence)
            sent = ' '.join(filtered_sentence)
            k = 0
            new_string = ''
            start_t = 0
            for i in sentence.get_labels():
                info = i.to_dict()
                val = info['value']
                if info['confidence']>0.8 and val in tag_list :

                    if i.data_point.start_position>start_t :
                        new_string+=sent[start_t:i.data_point.start_position]
                    start_t = i.data_point.end_position
                    new_string+= f'<{val}>'
            new_string+=sent[start_t:-1]
            texts_with_masks.append(new_string)
        return texts_with_masks

    def get_classes(self, df, texts_with_masks):
        for e in range(0, len(df), 10):
            last_elem = e + 10
            if e + 10 > len(df):
                last_elem = len(df)
            text_slice = texts_with_masks[e:last_elem]
            to_model = self.tokenizer.batch_encode_plus(
                              text_slice,
                              add_special_tokens=True,
                              max_length=512,
                              return_token_type_ids=False,
                              padding="max_length",
                              truncation=True,
                              return_attention_mask=True,
                              return_tensors='pt',
                        )
            to_model.to(device)
            results = self.model(**to_model)
            results.to(device)
            for i in range(len(results)):
                print('-------------')
                for j in torch.topk(results, k=10).indices.tolist()[i]:
                    print(f"{self.model.id2label[j]}: {np.round(results[i][j].item(), 3)}")
            for i in range(len(df[e:last_elem])):
                classes = ''
                for j in range(47):
                    label = self.model.id2label[j]
                    if results[i][j].item() >= self.weights_dict[label]:
                        classes += f'{label};'
                if len(classes) == 0:
                    classes = 'Not Relevant to ESG;'
                classes = classes[:len(classes) - 1]
                df.loc[e + i, "classes"] = classes
            gc.collect()
            torch.cuda.empty_cache()
        return df

    def start(self, init_df):
        df = init_df.copy()
        for i in range(len(df)):
            if type(df['text'][i]) == float or not bool(df['text'][i].replace(' ', '')):
                df.drop(i, inplace=True)
                continue
            if len(df['text'][i]) > 5000:
                df.loc[i, "text"] = df['text'][i][:5000]
        ru_texts = list(df['text'])
        # transl = Translator()
        # translations = transl.translate(ru_texts, dest='en')
        # texts = []
        # for translation in translations:
        #     texts.append(translation.text)
        # texts_with_masks = self.convert_texts(texts)
        texts_with_masks = self.get_texts_with_masks(ru_texts)
        df['classes'] = ''
        df = self.get_classes(df, texts_with_masks)
        df = df.loc[df['classes'] != "Not Relevant to ESG"]
        return df

# Начало работы

In [16]:
# Настройка
inn = '7727576505'
date_start = date(year=2024, month=1, day=1)
date_end = date.today()

In [17]:
# Получаем название компании по ИНН
company = inn_to_company(inn)
company

'сибур'

In [18]:
# Паттерн для поиска вхождений компании в тексте
company_pattern = f'{company[:-1]}\w'+'{,2}'
pattern = re.compile(f"(?i){company_pattern}")
pattern

re.compile(r'(?i)сибу\w{,2}', re.IGNORECASE|re.UNICODE)

In [19]:
# Данные для работы с header (пока не используется)
query = quote_plus(company)
ua = UserAgent()

In [20]:
# Это для ассинхронного запуска
parsers = [
    # Rbc(),
    # Lenta(),
    Regnum()
]

async def main():
  tasks = []
  for parser in parsers:
    tasks.append(parser.start())

  await asyncio.gather(*tasks)

In [21]:
# Запускаем асинхронную работу парсеров
await main()

[32m2024-11-07 13:59:05.946[0m | [1mINFO    [0m | [36m__main__[0m:[36mstart[0m:[36m36[0m - [1mRegnum start[0m
[32m2024-11-07 14:00:31.722[0m | [1mINFO    [0m | [36m__main__[0m:[36mstart[0m:[36m38[0m - [1mRegnum end[0m


In [22]:
# Смотрим кол-во статей для каждого парсера
for parser in parsers:
  print(f'{parser.__class__.__name__}: кол-во статей {len(parser.data)}')

Regnum: кол-во статей 15


In [23]:
parsers[0].data.head()

Unnamed: 0,url,title,text,publication_date,inn,company
0,/news/3920408,«Газпром» стал самой убыточной российской комп...,"Москва, 3 октября, 2024, 08:47 — ИА Регнум. Ко...",2024-10-03 05:47:00,7727576505,сибур
1,/news/3913895,Газпромбанк на полях ВЭФ провел сессию по фина...,"Владивосток, 5 сентября, 2024, 15:31 — ИА Регн...",2024-09-05 12:31:20,7727576505,сибур
2,/news/3912353,Вдова бизнесмена Когана вошла в десятку богате...,"Москва, 30 августа, 2024, 12:28 — ИА Регнум. В...",2024-08-30 09:28:00,7727576505,сибур
3,/news/3910830,Пётр Вагин прокомментировал свое избрание на д...,"Тобольск, Тюменская область, 23 августа, 2024,...",2024-08-23 12:41:00,7727576505,сибур
4,/news/3902733,Газпромбанк компенсировал часть собственного у...,"Москва, 18 июля, 2024, 11:14 — ИА Регнум. Газп...",2024-07-18 08:14:00,7727576505,сибур


In [24]:
# Тут начало удаления повторок
df = pd.concat([parser.data for parser in parsers], ignore_index=True)
print(f'перваночальное кол-во статей: {len(df)}')

перваночальное кол-во статей: 15


In [25]:
remover = RepeatRemover()
df_clear = remover.start(df)
print(f'кол-во статей после удаления повторок: {len(df_clear)}') #10 сек 422 статьи

кол-во статей после удаления повторок: 15


In [26]:
df_clear.head()

Unnamed: 0,url,title,text,publication_date,inn,company
0,/news/3920408,«Газпром» стал самой убыточной российской комп...,"Москва, 3 октября, 2024, 08:47 — ИА Регнум. Ко...",2024-10-03 05:47:00,7727576505,сибур
1,/news/3913895,Газпромбанк на полях ВЭФ провел сессию по фина...,"Владивосток, 5 сентября, 2024, 15:31 — ИА Регн...",2024-09-05 12:31:20,7727576505,сибур
2,/news/3912353,Вдова бизнесмена Когана вошла в десятку богате...,"Москва, 30 августа, 2024, 12:28 — ИА Регнум. В...",2024-08-30 09:28:00,7727576505,сибур
3,/news/3910830,Пётр Вагин прокомментировал свое избрание на д...,"Тобольск, Тюменская область, 23 августа, 2024,...",2024-08-23 12:41:00,7727576505,сибур
4,/news/3902733,Газпромбанк компенсировал часть собственного у...,"Москва, 18 июля, 2024, 11:14 — ИА Регнум. Газп...",2024-07-18 08:14:00,7727576505,сибур


Удаление статей, который не относятся к деятельности компании

In [27]:
df_occurrences = await get_occurrences_df(df_clear, pattern)
print(f'кол-во статей после проверки вхождения: {len(df_occurrences)}')

кол-во статей после проверки вхождения: 15


In [28]:
df_occurrences.head()

Unnamed: 0,url,title,text,publication_date,inn,company,is_company_in_text
0,/news/3920408,«Газпром» стал самой убыточной российской комп...,"Москва, 3 октября, 2024, 08:47 — ИА Регнум. Ко...",2024-10-03 05:47:00,7727576505,сибур,True
1,/news/3913895,Газпромбанк на полях ВЭФ провел сессию по фина...,"Владивосток, 5 сентября, 2024, 15:31 — ИА Регн...",2024-09-05 12:31:20,7727576505,сибур,True
2,/news/3912353,Вдова бизнесмена Когана вошла в десятку богате...,"Москва, 30 августа, 2024, 12:28 — ИА Регнум. В...",2024-08-30 09:28:00,7727576505,сибур,True
3,/news/3910830,Пётр Вагин прокомментировал свое избрание на д...,"Тобольск, Тюменская область, 23 августа, 2024,...",2024-08-23 12:41:00,7727576505,сибур,True
4,/news/3902733,Газпромбанк компенсировал часть собственного у...,"Москва, 18 июля, 2024, 11:14 — ИА Регнум. Газп...",2024-07-18 08:14:00,7727576505,сибур,True


Закомментированный код предназначен для работы гигачата. По умолчанию используется llama.

In [29]:
morph = pymorphy2.MorphAnalyzer()

# auth = userdata.get('auth')
# giga_chat = GigaChat(auth)

df = df_occurrences.reset_index(drop=True)
df_unknown = pd.DataFrame(columns=['url',	'title',	'text',	'publication_date',	'inn',	'company'])
for i in range(len(df)):
    if type(df['text'][i]) == float:
        df.drop(i, inplace=True)
        continue
    text = df['text'][i]

    if len(text) > 12000:
      text = text[:12000]
    question = f"относиться ли данный текст к юридическому лицу {company}? Текст: {text}"
    answer = gpt_response(question).lower()
    # answer = giga_chat.get_response(question).lower()
    # if 'да' not in answer[:3] and 'нет' not in answer[:4]:
    #     df_unknown.loc[len(df_unknown)] = df.loc[i]
    if "да" not in answer:
      df.drop(i, inplace=True)
      continue
print(len(df)) #21мин25сек - 340

7


In [30]:
df = df.reset_index(drop=True)

In [31]:
df.head()

Unnamed: 0,url,title,text,publication_date,inn,company,is_company_in_text
0,/news/3902733,Газпромбанк компенсировал часть собственного у...,"Москва, 18 июля, 2024, 11:14 — ИА Регнум. Газп...",2024-07-18 08:14:00,7727576505,сибур,True
1,/news/3895237,«Всероссийский день поля» пройдет на Ставрополье,"Ставрополь, 13 июня, 2024, 09:35 — ИА Регнум. ...",2024-06-13 06:35:00,7727576505,сибур,True
2,/news/3890946,Выставку-форум «Россия» посетили 14 млн гостей,"Москва, 24 мая, 2024, 17:52 — ИА Регнум. Между...",2024-05-24 14:52:00,7727576505,сибур,True
3,/news/3872138,СИБУР предложил повысить привлекательность тех...,"Москва, 6 марта, 2024, 17:36 — ИА Регнум. Рост...",2024-03-06 14:36:00,7727576505,сибур,True
4,/news/3869144,Первый глава Татарстана Шаймиев присоединился ...,"Казань, 22 февраля, 2024, 14:37 — ИА Регнум. П...",2024-02-22 11:37:00,7727576505,сибур,True


In [32]:
df.to_csv(f'without_class_{inn}_{company}.csv')

Далее для корректной работы нужный файлы: trashs_esgify_ru.xlsx, slovnet_ner_news_v1.tar и navec_news_v1_1B_250K_300d_100q.tar

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [36]:
# Начало классфикации
classifier = Classifier(device)
df_with_classes = classifier.start(df)

-------------
Soil and Groundwater Impact: 0.893
Responsible Investment & Greenwashing: 0.622
Not Relevant to ESG: 0.536
Corporate Governance: 0.534
Wastewater Management: 0.463
Environmental Management: 0.401
Surface Water Pollution: 0.401
Discrimination: 0.379
Indigenous People: 0.338
Natural Resources: 0.246
-------------
Environmental Management: 0.823
Soil and Groundwater Impact: 0.785
Corporate Governance: 0.774
Wastewater Management: 0.648
Values and Ethics: 0.581
Natural Resources: 0.581
Surface Water Pollution: 0.58
Not Relevant to ESG: 0.538
Employee Health and Safety: 0.469
Responsible Investment & Greenwashing: 0.46
-------------
Soil and Groundwater Impact: 0.856
Not Relevant to ESG: 0.601
Corporate Governance: 0.586
Environmental Management: 0.566
Values and Ethics: 0.541
Responsible Investment & Greenwashing: 0.484
Surface Water Pollution: 0.469
Wastewater Management: 0.461
Employee Health and Safety: 0.326
Discrimination: 0.319
-------------
Soil and Groundwater Impact:

In [37]:
len(df_with_classes)

7

In [38]:
df_with_classes.head()

Unnamed: 0,url,title,text,publication_date,inn,company,is_company_in_text,classes
0,/news/3902733,Газпромбанк компенсировал часть собственного у...,"Москва, 18 июля, 2024, 11:14 — ИА Регнум. Газп...",2024-07-18 08:14:00,7727576505,сибур,True,Environmental Management;Soil and Groundwater ...
1,/news/3895237,«Всероссийский день поля» пройдет на Ставрополье,"Ставрополь, 13 июня, 2024, 09:35 — ИА Регнум. ...",2024-06-13 06:35:00,7727576505,сибур,True,Corporate Governance;Environmental Management;...
2,/news/3890946,Выставку-форум «Россия» посетили 14 млн гостей,"Москва, 24 мая, 2024, 17:52 — ИА Регнум. Между...",2024-05-24 14:52:00,7727576505,сибур,True,Environmental Management;Soil and Groundwater ...
3,/news/3872138,СИБУР предложил повысить привлекательность тех...,"Москва, 6 марта, 2024, 17:36 — ИА Регнум. Рост...",2024-03-06 14:36:00,7727576505,сибур,True,Environmental Management;Soil and Groundwater ...
4,/news/3869144,Первый глава Татарстана Шаймиев присоединился ...,"Казань, 22 февраля, 2024, 14:37 — ИА Регнум. П...",2024-02-22 11:37:00,7727576505,сибур,True,Environmental Management;Soil and Groundwater ...


In [39]:
df_with_classes.to_csv(f'with_classes_{company}.csv')