In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd

In [2]:
import os
import pickle

from tqdm import tqdm

# CODE

In [3]:
class Article:
    """One publication taken from a Finam news listing.

    ``link``, ``date`` and ``author`` are read from the listing's anchor
    element at construction time. ``title`` and ``text`` start as empty
    strings and are assigned later by ``FinamNewsParser.collect_news`` once
    the article page itself has been opened.
    """

    def __init__(self, elem) -> None:
        """Read link, date and author from a listing anchor element.

        Args:
            elem: Object exposing ``get_attribute('href')`` and
                ``find_elements(By.TAG_NAME, 'span')`` (a Selenium element
                when called from ``FinamNewsParser``).
        """
        self.link = elem.get_attribute('href')
        span_elems = elem.find_elements(By.TAG_NAME, 'span')
        # The first <span> is stored as the date and the second as the author.
        # Both are optional: an anchor without spans used to raise IndexError
        # here, which aborted the whole listing for that ticker.
        self.date = span_elems[0].text if span_elems else ''
        self.author = span_elems[1].text if len(span_elems) > 1 else ''
        self.text = ''
        self.title = ''

class FinamNewsParser:
    """Scraper for the publications feed of a MOEX ticker on finam.ru.

    Owns a headless Chrome driver created in ``__init__``. Call ``close()``
    (or use the instance as a context manager) when done so the browser
    process is released.
    """

    def __init__(self) -> None:
        """Start a headless Chrome session with a fixed desktop user agent."""
        user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'

        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument(f'user-agent={user_agent}')

        self.driver = webdriver.Chrome(options=options)

    def close(self) -> None:
        """Quit the browser; calling it again after the first time is a no-op."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def __enter__(self):
        """Return the parser itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Release the browser on leaving the ``with`` block; never suppresses errors."""
        self.close()

    def collect_news(self, ticker, start = None, end = None, maxCount = None):
        """Collect the publications listed for ``ticker`` and fetch their content.

        Args:
            ticker: Value substituted into the listing URL
                ``https://www.finam.ru/quote/moex/{ticker}/publications/``.
            start: Start of the period, appended to the URL as
                ``date/{start}/{end}``. Used only together with ``end``.
            end: End of the period. When either ``start`` or ``end`` is
                missing, the plain listing page is read and no additional
                items are loaded.
            maxCount: Upper bound on the number of articles returned and
                opened. ``None`` (or a negative value) means no limit.

        Returns:
            list[Article]: Articles in listing order. ``title`` and ``text``
            stay empty for articles whose page could not be parsed.

        Errors raised while opening pages or locating the listing container
        are not handled here and propagate to the caller.
        """
        template_url = 'https://www.finam.ru/quote/moex/{}/publications/'
        url = template_url.format(ticker)

        if not start or not end:
            self.driver.get(url)
        else:
            url +=  "{}/{}/{}".format('date',  start, end)
            self.driver.get(url)
            self._load_all_publications()

        print("Getting news from:  {}".format(url))
        links_section = self.driver.find_element(By.ID, 'finfin-local-plugin-block-item-publication-list-filter-date-content')
        a_elems = links_section.find_elements(By.TAG_NAME, 'a')

        # Trim before building Article objects so links beyond the limit are
        # neither parsed nor opened. Negative limits keep the historic
        # "no limit" behaviour.
        if maxCount is not None and maxCount >= 0:
            a_elems = a_elems[:maxCount]

        # Link/date/author are copied out of the listing elements now, before
        # the driver navigates away from the listing page.
        articles = [Article(elem) for elem in a_elems]

        for article in articles:
            self._fill_article(article)

        return articles

    def _load_all_publications(self) -> None:
        """Keep triggering the page's "load more" handler until it stops being available."""
        load_more_locator = (
            By.XPATH,
            '//span[(starts-with(@class, "pointer")) and (contains(@class, "cl-blue"))]',
        )
        while True:
            try:
                # Wait up to 20 s for the "load more" control, then invoke the
                # page's own loader through JavaScript.
                WebDriverWait(self.driver, 20).until(EC.element_to_be_clickable(load_more_locator))
                self.driver.execute_script("finfin.local.plugin_block_item_publication_list_filter_date.loadMore(this);")
            except Exception:
                # Any failure (normally the wait timing out once the control
                # is gone) ends the loading phase. KeyboardInterrupt is not an
                # Exception subclass, so the user can still abort the run.
                break

    def _fill_article(self, article) -> None:
        """Open ``article.link`` and fill ``title``/``text``; parse failures leave them empty."""
        self.driver.get(article.link)
        try:
            title_section = self.driver.find_element(By.TAG_NAME, 'h1')
            article.title = title_section.text

            text_section = self.driver.find_element(
                By.XPATH,
                '//div[(starts-with(@class, "finfin-local-plugin-publication-item-item-")) and (contains(@class, "-text"))]'
            )

            paragraphs = [p_elem.text for p_elem in text_section.find_elements(By.TAG_NAME, 'p')]
            if paragraphs:
                article.text = ' '.join(paragraphs)
        except Exception:
            # Best effort: an article whose page has an unexpected layout is
            # kept with whatever fields were filled before the failure.
            pass

# Сбор новостей

In [4]:
# %%time 

# parser = FinamNewsParser()
# news = parser.collect_news('sber', start='2022-12-01', end='2023-12-01')

In [5]:
# len(news)

In [6]:
# Load the ticker index from disk and preview up to the first 40 rows.
df_files = pd.read_parquet(path='tickers_files.parquet')
df_files.head(n=40)

Unnamed: 0,ticker,file
0,ABIO,ABIO_from_2022_12_20_to_2023_12_20.csv
1,ABRD,ABRD_from_2022_12_20_to_2023_12_20.csv
2,AFKS,AFKS_from_2022_12_20_to_2023_12_20.csv
3,AFLT,AFLT_from_2022_12_20_to_2023_12_20.csv
4,AGRO,AGRO_from_2022_12_20_to_2023_12_20.csv
5,AKRN,AKRN_from_2022_12_20_to_2023_12_20.csv
6,ALRS,ALRS_from_2022_12_20_to_2023_12_20.csv
7,AMEZ,AMEZ_from_2022_12_20_to_2023_12_20.csv
8,APTK,APTK_from_2022_12_20_to_2023_12_20.csv
9,AQUA,AQUA_from_2022_12_20_to_2023_12_20.csv


In [None]:
%%time 

# Automated news scraping: one pickle file with the collected articles per ticker.

df_files = pd.read_parquet('tickers_files.parquet')
parser = FinamNewsParser()

NEWS_DIR = './news_data/'
# Create the output directory up front. Without it every open() below fails
# only after the ticker has already been scraped, and the ticker ends up in
# BLACK_LIST for a reason unrelated to parsing.
os.makedirs(NEWS_DIR, exist_ok=True)

# Tickers for which collecting or saving the news failed.
BLACK_LIST = []

try:
    # NOTE(review): rows before index 30 are skipped — presumably handled in an
    # earlier run; confirm before re-running from scratch.
    for ticker in tqdm(df_files.values[30:]):
        symbol = ticker[0]
        # NOTE(review): SBER is excluded here — presumably collected separately; confirm.
        if symbol == 'SBER':
            continue
        try:
            news = parser.collect_news(symbol.lower(), start='2022-12-01', end='2023-12-01')

            with open(NEWS_DIR + symbol.lower() + '.pickle', 'wb') as f:
                pickle.dump(news, f)
        except Exception as exc:
            # Catch Exception rather than everything so that interrupting the
            # kernel stops the run instead of blacklisting the current ticker;
            # the reason is printed so failures can be diagnosed afterwards.
            BLACK_LIST.append(symbol)
            print(symbol + ' НОВОСТИ НЕ РАСПАРСИЛО', repr(exc))
finally:
    # Always release the headless browser, including on interruption.
    parser.driver.quit()

print(BLACK_LIST)

  0%|                                                                                          | 0/200 [00:00<?, ?it/s]

Getting news from:  https://www.finam.ru/quote/moex/dsky/publications/date/2022-12-01/2023-12-01


  0%|▍                                                                             | 1/200 [06:15<20:46:56, 375.96s/it]

Getting news from:  https://www.finam.ru/quote/moex/dvec/publications/date/2022-12-01/2023-12-01


  1%|▊                                                                              | 2/200 [06:57<9:50:39, 178.99s/it]

Getting news from:  https://www.finam.ru/quote/moex/dzrdp/publications/date/2022-12-01/2023-12-01


  2%|█▏                                                                             | 3/200 [07:27<6:04:28, 111.01s/it]

Getting news from:  https://www.finam.ru/quote/moex/dzrd/publications/date/2022-12-01/2023-12-01


  2%|█▌                                                                              | 4/200 [07:56<4:17:50, 78.93s/it]

Getting news from:  https://www.finam.ru/quote/moex/eelt/publications/date/2022-12-01/2023-12-01


  2%|██                                                                              | 5/200 [08:41<3:35:51, 66.42s/it]

Getting news from:  https://www.finam.ru/quote/moex/elfv/publications/date/2022-12-01/2023-12-01


  3%|██▍                                                                             | 6/200 [09:19<3:04:16, 56.99s/it]

Getting news from:  https://www.finam.ru/quote/moex/enpg/publications/date/2022-12-01/2023-12-01


  4%|██▊                                                                             | 7/200 [10:47<3:36:01, 67.16s/it]

Getting news from:  https://www.finam.ru/quote/moex/etln/publications/date/2022-12-01/2023-12-01


  4%|███▏                                                                           | 8/200 [13:42<5:24:09, 101.30s/it]

Getting news from:  https://www.finam.ru/quote/moex/fees/publications/date/2022-12-01/2023-12-01


  4%|███▌                                                                           | 9/200 [16:13<6:12:11, 116.92s/it]

Getting news from:  https://www.finam.ru/quote/moex/fesh/publications/date/2022-12-01/2023-12-01
