# 1 — Сбор ссылок на дела по номеру решения

В этой части работы реализуем поиск дел на сайте «ГАС Правосудие» по номерам решения из карточки блокировки Роскомсвободы. Сначала импортируем нужные библиотеки:

In [None]:
from selenium import webdriver as wb
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import pandas as pd
import time
from math import ceil
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

Загружаем данные с сайта Роскомсвободы. Нас интересуют столбцы: ```record_id, decisionNumber, authority_name, decisionDate```. Поиск на сайте «ГАС Правосудие» ведётся по полю ```decisionNumber```.

In [None]:
# Load the input table; the search loops below read its record_id,
# decisionNumber, authority_name and decisionDate columns.
data = pd.read_csv('data_chel.csv')

In [None]:
# Display the loaded table as the notebook cell output.
data

Как работает поиск дела: по номеру решения из карточки блокировки Роскомсвободы и региону составляем поисковый запрос на сайте «ГАС Правосудие». Затем среди результатов поиска ищем дела, в которых совпадает либо только суд, либо и суд, и дата вынесения решения (дата есть не у всех дел на сайте, поэтому это не блокирующее условие):

In [None]:
def extract_info(row):
    """Turn one search-result row into a record if it matches the current decision.

    The function reads the module-level names ``court``, ``date``,
    ``record_id`` and ``case``, which the calling loop sets for the decision
    being looked up.

    Args:
        row: a parsed search-result element exposing ``.text`` and
            ``.find(...)`` (a BeautifulSoup tag in the calling loop).

    Returns:
        A dict with the keys ``record_id``, ``case``, ``case_url``, ``court``,
        ``decisionDate`` and ``category`` when the court name in the row starts
        with ``court`` and, if the row shows a decision date, that date equals
        ``date``. ``decisionDate`` is ``False`` when the row shows no date and
        ``category`` is ``''`` when the row shows no category.
        ``False`` when the row is the "nothing found" placeholder, lacks the
        court name or the case link, or does not match.
    """
    if row.text == 'Ничего не найдено':
        return False

    court_node = row.find(attrs={"data-comment": "Наименование суда"})
    link_node = row.find('a', class_='resultHeader openCardLink')
    if court_node is None or link_node is None:
        # Not a regular result card: there is nothing to compare or link to.
        return False

    # Collapse double spaces so the prefix comparison with ``court`` works.
    court_gas = court_node.text.replace('  ', ' ')
    if not court_gas.startswith(court):
        return False

    date_node = row.find(attrs={"data-comment": "Дата решения"})
    if date_node is None:
        # The date is optional on the site; the court match alone is enough.
        date_gas = False
    else:
        date_gas = date_node.text
        if date_gas != date:
            return False

    category_node = row.find(attrs={"data-comment": "Статья или категория"})
    category = category_node.text if category_node is not None else ''

    return {'record_id': record_id, 'case': case, 'case_url': link_node['href'],
            'court': court_gas, 'decisionDate': date_gas, 'category': category}

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Selenium 4 expects the chromedriver path wrapped in a Service object;
# passing the path positionally to webdriver.Chrome() is deprecated there
# and no longer accepted by later 4.x releases.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

Код проходит по всем номерам решений из загруженного датасета и ищет нужные дела. Если нужное дело не находится на первых двух страницах выдачи или если поиск не дает результатов, решение пропускается.

In [None]:
# Chelyabinsk region
# Matches are collected in a plain list and turned into a DataFrame at the end:
# DataFrame.append() was removed in pandas 2.0.
found_rows = []
df = pd.DataFrame()

try:
    # iterrows() does not depend on the index being 0..n-1, unlike loc[i].
    for _, rec in tqdm(data.iterrows(), total=len(data)):
        case = rec.decisionNumber
        if pd.isna(case):
            continue
        # extract_info() reads these module-level names.
        record_id = rec.record_id
        court = rec.authority_name
        date = rec.decisionDate

        # Search URL without the closing brace, so the same prefix serves both
        # the first page and the "start" offset of the second page.
        base_link = f'https://bsr.sudrf.ru/bigs/portal.html#%7B%22type%22:%22MULTIQUERY%22,%22multiqueryRequest%22:%7B%22queryRequests%22:%5B%7B%22type%22:%22Q%22,%22request%22:%22%7B%5C%22mode%5C%22:%5C%22EXTENDED%5C%22,%5C%22typeRequests%5C%22:%5B%7B%5C%22fieldRequests%5C%22:%5B%7B%5C%22name%5C%22:%5C%22case_doc_subject_rf%5C%22,%5C%22operator%5C%22:%5C%22EX%5C%22,%5C%22query%5C%22:%5C%22%D0%A7%D0%B5%D0%BB%D1%8F%D0%B1%D0%B8%D0%BD%D1%81%D0%BA%D0%B0%D1%8F%20%D0%BE%D0%B1%D0%BB%D0%B0%D1%81%D1%82%D1%8C%5C%22,%5C%22fieldName%5C%22:%5C%22case_doc_subject_rf_cat%5C%22%7D,%7B%5C%22name%5C%22:%5C%22case_user_doc_number_cat%5C%22,%5C%22operator%5C%22:%5C%22SW%5C%22,%5C%22query%5C%22:%5C%222%D0%B0-{case}%5C%22,%5C%22fieldName%5C%22:%5C%22case_user_doc_number_cat%5C%22%7D%5D,%5C%22mode%5C%22:%5C%22AND%5C%22,%5C%22name%5C%22:%5C%22common%5C%22,%5C%22typesMode%5C%22:%5C%22AND%5C%22%7D%5D%7D%22,%22operator%22:%22AND%22,%22queryRequestRole%22:%22CATEGORIES%22%7D%5D%7D,%22sorts%22:%5B%7B%22field%22:%22score%22,%22order%22:%22desc%22%7D%5D,%22simpleSearchFieldsBundle%22:%22default%22,%22noOrpho%22:false'
        link = base_link + '%7D'

        driver.get(link)
        time.sleep(15)  # fixed pause to let the page render its results
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        rows = soup.find_all('div', class_='bgs-result')

        # A page that did not render has no counter; treat it as "no results"
        # instead of failing on None.
        counter = soup.find('div', class_='resultCount')
        result_number = int(counter.get('data-total', 0)) if counter is not None else 0

        matched = False
        # The last 'bgs-result' element is skipped, as in the original code.
        # NOTE(review): presumably it is not a real result card - confirm.
        for row in rows[:-1]:
            extract = extract_info(row)
            if extract:
                found_rows.append(extract)
                matched = True

        # Nothing matched on the first page and there are more results:
        # check the second page as well.
        if not matched and result_number > 10:
            page = 10
            new_link = f'{base_link},%22start%22:{page}%7D'
            driver.get(new_link)
            time.sleep(15)
            soup_new = BeautifulSoup(driver.page_source, 'html.parser')
            rows_new = soup_new.find_all('div', class_='bgs-result')
            for row_new in rows_new[:-1]:
                extract_new = extract_info(row_new)
                if extract_new:
                    found_rows.append(extract_new)
finally:
    # Keep whatever was collected even if the run is interrupted.
    df = pd.DataFrame(found_rows)

In [None]:
# Vladimir region
# Matches are collected in a plain list and turned into a DataFrame at the end:
# DataFrame.append() was removed in pandas 2.0.
found_rows = []
df = pd.DataFrame()

try:
    # iterrows() does not depend on the index being 0..n-1, unlike loc[i].
    for _, rec in tqdm(data.iterrows(), total=len(data)):
        case = rec.decisionNumber
        if pd.isna(case):
            continue
        # extract_info() reads these module-level names.
        record_id = rec.record_id
        court = rec.authority_name
        date = rec.decisionDate

        # Search URL without the closing brace, so the same prefix serves both
        # the first page and the "start" offset of the second page.
        base_link = f'https://bsr.sudrf.ru/bigs/portal.html#%7B%22type%22:%22MULTIQUERY%22,%22multiqueryRequest%22:%7B%22queryRequests%22:%5B%7B%22type%22:%22Q%22,%22request%22:%22%7B%5C%22mode%5C%22:%5C%22EXTENDED%5C%22,%5C%22typeRequests%5C%22:%5B%7B%5C%22fieldRequests%5C%22:%5B%7B%5C%22name%5C%22:%5C%22case_doc_subject_rf%5C%22,%5C%22operator%5C%22:%5C%22EX%5C%22,%5C%22query%5C%22:%5C%22%D0%B2%D0%BB%D0%B0%D0%B4%D0%B8%D0%BC%D0%B8%D1%80%D1%81%D0%BA%D0%B0%D1%8F%20%D0%BE%D0%B1%D0%BB%D0%B0%D1%81%D1%82%D1%8C%5C%22,%5C%22fieldName%5C%22:%5C%22case_doc_subject_rf_cat%5C%22%7D,%7B%5C%22name%5C%22:%5C%22case_user_doc_number_cat%5C%22,%5C%22operator%5C%22:%5C%22SW%5C%22,%5C%22query%5C%22:%5C%222%D0%B0-{case}%5C%22,%5C%22fieldName%5C%22:%5C%22case_user_doc_number_cat%5C%22%7D%5D,%5C%22mode%5C%22:%5C%22AND%5C%22,%5C%22name%5C%22:%5C%22common%5C%22,%5C%22typesMode%5C%22:%5C%22AND%5C%22%7D%5D%7D%22,%22operator%22:%22AND%22,%22queryRequestRole%22:%22CATEGORIES%22%7D%5D%7D,%22sorts%22:%5B%7B%22field%22:%22score%22,%22order%22:%22desc%22%7D%5D,%22simpleSearchFieldsBundle%22:%22default%22,%22noOrpho%22:false'
        link = base_link + '%7D'

        driver.get(link)
        time.sleep(15)  # fixed pause to let the page render its results
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        rows = soup.find_all('div', class_='bgs-result')

        # A page that did not render has no counter; treat it as "no results"
        # instead of failing on None.
        counter = soup.find('div', class_='resultCount')
        result_number = int(counter.get('data-total', 0)) if counter is not None else 0

        matched = False
        # The last 'bgs-result' element is skipped, as in the original code.
        # NOTE(review): presumably it is not a real result card - confirm.
        for row in rows[:-1]:
            extract = extract_info(row)
            if extract:
                found_rows.append(extract)
                matched = True

        # Nothing matched on the first page and there are more results:
        # check the second page as well.
        if not matched and result_number > 10:
            page = 10
            new_link = f'{base_link},%22start%22:{page}%7D'
            driver.get(new_link)
            time.sleep(15)
            soup_new = BeautifulSoup(driver.page_source, 'html.parser')
            rows_new = soup_new.find_all('div', class_='bgs-result')
            for row_new in rows_new[:-1]:
                extract_new = extract_info(row_new)
                if extract_new:
                    found_rows.append(extract_new)
finally:
    # Keep whatever was collected even if the run is interrupted.
    df = pd.DataFrame(found_rows)

Результат работы этого кода — промежуточный. В таблице есть ссылка на дело с полной информацией и данные из превью поиска. Тексты решений получаются на следующем этапе работы.

In [None]:
# Save the intermediate table of matched case links (without the index column).
# NOTE(review): the file name is Vladimir-specific, while both region cells
# above write their results into the same `df` - rename when saving another region.
df.to_csv('vladimir_pol_blocks_links.csv', index=False)

# 2 — Сбор текстов решений суда

Во второй части работы с помощью ссылок, собранных на предыдущем шаге, собираем текстовые решения и другую информацию по делу: ФИО судьи, категорию дела, регион и т.д.

Импортируем датасет из предыдущего шага: 

In [None]:
# Load the case links saved in part 1; this rebinds `data` to the links table.
data = pd.read_csv('vladimir_pol_blocks_links.csv')

In [None]:
# Preview the first 10 rows as the notebook cell output.
data.head(10)

Функция, которая собирает из карточки дела нужную информацию:

In [None]:
def extract_info(test_soup):
    """Collect case metadata and the decision text from a parsed case card.

    The function reads the module-level names ``record_id`` (set by the calling
    loop) and ``driver`` (the Selenium driver currently showing the same card,
    used to read the decision text from the page's first iframe).

    Args:
        test_soup: parsed HTML of the case card; must contain the
            ``div#bookmark0`` block with at least nine ``span.one-value``
            cells and two ``a.paramLink.one-value`` links.

    Returns:
        A dict with the keys ``record_id``, ``case_number``,
        ``unique_identifier``, ``case_type``, ``instance``, ``category``,
        ``region``, ``result``, ``court_first_instance``, ``link_to_source``,
        ``judge``, ``plaintiff``, ``interested_party`` and ``text``.
        ``plaintiff`` and ``interested_party`` are ``''`` when the participants
        table has fewer than four cells; ``text`` is ``''`` when the page has
        no iframe.

    Raises:
        IndexError: if the card has fewer value cells or links than expected.
    """
    info = test_soup.find('div', {'id': 'bookmark0'})
    cells = info.find_all('span', class_='one-value')

    # The first nine value cells are read in this fixed order.
    field_names = ('case_number', 'unique_identifier', 'case_type', 'instance',
                   'category', 'region', 'result', 'court_first_instance',
                   'link_to_source')
    record = {'record_id': record_id}
    record.update((name, cells[i].text) for i, name in enumerate(field_names))

    record['judge'] = info.find_all('a', class_='paramLink one-value')[1].text

    members = info.find_all('td', class_='one-table-value')
    # Cells 1 and 3 are read, so at least four cells are required. The original
    # check `len(members) == 3` always failed with IndexError on members[3].
    # NOTE(review): assumes the table is laid out as role/name pairs - confirm.
    if len(members) > 3:
        record['plaintiff'] = members[1].text
        record['interested_party'] = members[3].text
    else:
        record['plaintiff'] = ''
        record['interested_party'] = ''

    all_text = ''
    iframes = driver.find_elements(By.TAG_NAME, 'iframe')
    if iframes:
        driver.switch_to.frame(iframes[0])
        try:
            frame_soup = BeautifulSoup(driver.page_source, 'html.parser')
        finally:
            # Do not leave the driver focused inside the iframe.
            driver.switch_to.default_content()
        # Every paragraph is prefixed with a space, as in the original output.
        all_text = ''.join(f' {paragraph.text}' for paragraph in frame_soup.find_all('p'))
    record['text'] = all_text

    return record

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Selenium 4 expects the chromedriver path wrapped in a Service object;
# passing the path positionally to webdriver.Chrome() is deprecated there
# and no longer accepted by later 4.x releases.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [None]:
# Case records are collected in a plain list and turned into a DataFrame at
# the end: DataFrame.append() was removed in pandas 2.0.
case_rows = []
df = pd.DataFrame()

try:
    # iterrows() does not depend on the index being 0..n-1, unlike loc[i].
    for _, rec in tqdm(data.iterrows(), total=len(data)):
        record_id = rec.record_id  # read by extract_info()
        link = rec.case_url
        driver.get(link)
        # NOTE(review): the second get() is kept from the original code;
        # presumably it forces the page to load the new card - confirm.
        driver.get(link)
        time.sleep(15)  # fixed pause to let the card render
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        case_rows.append(extract_info(soup))
finally:
    # Keep whatever was collected even if the run is interrupted.
    df = pd.DataFrame(case_rows)

Результат работы этого кода — датасет с номерами дел и полной информацией по ним, в том числе с текстовыми решениями, если они были в карточке дела.

In [None]:
# Save the final table with case details and decision texts (no index column).
# The 'utf-8-sig' encoding prepends a UTF-8 byte order mark to the file.
df.to_csv('vladimir_pol_blocks_texts.csv', encoding='utf-8-sig', index=False)