In [1]:
## crawler
from bs4 import BeautifulSoup
from urllib import parse
import requests
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

## database save
import pandas as pd
import sqlite3

import datetime

In [2]:
def get_article_info(main_url = "https://tass.ru/", scroll_count = 25, load_wait = 20, scroll_wait = 15):
    """Collect article ids and links from a TASS listing page.

    Opens ``main_url`` in Chrome through Selenium, scrolls to the bottom,
    clicks the button inside ``#infinite_listing`` (presumably the
    "load more" control -- confirm against the live page) and then keeps
    scrolling ``scroll_count`` times before reading every element with the
    CSS class ``tass_pkg_link-v5WdK``.

    Args:
        main_url: Page to open in the browser.
        scroll_count: Number of extra scroll-to-bottom rounds after the click.
        load_wait: Seconds to sleep after the initial page load.
        scroll_wait: Seconds to sleep after each scroll / click.

    Returns:
        A pair ``(article_ids, article_links)`` of equally long lists.
        A link is kept when its last path segment is an integer and its
        second-to-last segment is not ``proisshestviya``; the matching id is
        ``"<section>-<number>"``. Links appear in page order, de-duplicated.

    Raises:
        Whatever Selenium raises while driving the page (for example when the
        button is missing). The browser session is shut down in every case.
    """
    ## getting html source
    options = webdriver.ChromeOptions()
    #options.add_argument('headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(main_url)
        time.sleep(load_wait)

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_wait)
        driver.find_element(By.CSS_SELECTOR, '#infinite_listing button').click()
        time.sleep(scroll_wait)

        ## scroll down to the bottom
        for _ in range(scroll_count):
            ActionChains(driver).key_down(Keys.END).perform()
            ActionChains(driver).key_up(Keys.END).perform()

            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_wait)

        link_tag_list = driver.find_elements(By.CLASS_NAME, 'tass_pkg_link-v5WdK')
        # Read the hrefs while the session is still alive; the elements are
        # unusable once the driver has quit.
        hrefs = [tag.get_attribute('href') for tag in link_tag_list]
    finally:
        # quit() ends the whole WebDriver session, also when a step above failed.
        driver.quit()

    # Drop anchors without an href and de-duplicate while keeping page order,
    # so repeated runs over the same page yield the same ordering.
    article_links = list(dict.fromkeys(href for href in hrefs if href))

    ## content parsing
    article_id_filtered = []
    article_links_filtered = []

    for link in article_links:
        parts = link.split('/')
        if len(parts) < 2:
            # No section segment in front of the token; not an article URL.
            continue

        section, article_token = parts[-2], parts[-1]
        try:
            int(article_token)
        except ValueError:
            # Last segment is not a numeric article number.
            continue

        # exclude the 'proisshestviya' section
        if section != 'proisshestviya':
            article_id_filtered.append('-'.join(parts[-2:]))
            article_links_filtered.append(link)

    return article_id_filtered, article_links_filtered

def get_article_contents(article_links, timeout = 30):
    """Download each article and extract its title and paragraph texts.

    Args:
        article_links: Iterable of article URLs. The second-to-last path
            segment selects the body selector: ``nauka`` pages use
            ``div.text-content p``, every other section uses ``article p``.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking the crawl indefinitely.

    Returns:
        A list with one ``(title, paragraphs)`` tuple per input link, in input
        order. ``title`` is the text of the first ``div h1`` match, or ``''``
        when the page has none; ``paragraphs`` is a list of paragraph strings.
        One entry is always produced per link so the result stays aligned
        with the id list built from the same links.
    """
    article_body_list = []

    for link in article_links:
        html_source = requests.get(link, timeout=timeout)
        soup = BeautifulSoup(html_source.text, "html.parser")

        # Only the body selector differs between 'nauka' and the other sections.
        if link.split('/')[-2] == 'nauka':
            body_selector = 'div.text-content p'
        else:
            body_selector = 'article p'

        article_body = [b.text for b in soup.select(body_selector)]
        article_title = [t.text for t in soup.select('div h1')]

        # A page without a heading gets an empty title instead of aborting
        # the whole crawl with an IndexError.
        title = article_title[0] if article_title else ''
        article_body_list.append((title, article_body))

    return article_body_list

### crawl

In [3]:
## crawl contents
# First collect the ids and URLs of the listed articles (drives a real Chrome
# window and sleeps between scrolls, so this is slow), then download every
# article page and pull out its title and paragraphs.

article_ids, article_links = get_article_info()
article_body_list = get_article_contents(article_links=article_links)

### make data table

In [4]:
## article title table
# One row per crawled article: id, title, read flag and crawl date.

article_id_list = list(article_ids)
title_list = [entry[0] for entry in article_body_list]
is_read = 0  # every freshly crawled article starts as unread

time_now = datetime.datetime.now()
# Zero-padded ISO date (2023-01-05, not 2023-1-5) so the stored strings sort
# chronologically and are understood by SQLite's date functions.
update_date = time_now.strftime('%Y-%m-%d')

yandex_articletitle = pd.DataFrame({
    'article_id': article_id_list,
    'title': title_list,
    'is_read': is_read,
    'update_date': update_date
})

## article body table
# One row per paragraph, tagged with the id of the article it belongs to.
contents_list = []
article_ids_list = []

for article_id, article_body in zip(article_id_list, article_body_list):
    # article_body is a (title, paragraphs) tuple; only the paragraphs go here.
    for body in article_body[1]:
        contents_list.append(body)
        article_ids_list.append(article_id)


yandex_articlebody = pd.DataFrame({
    'contents': contents_list,
    'article_id': article_ids_list
})

# Running paragraph number across the whole crawl, starting at 0.
yandex_articlebody['contents_id'] = range(len(yandex_articlebody))
yandex_articlebody['update_date'] = update_date

In [5]:
# Preview the first five title rows as a sanity check before saving.
yandex_articletitle.head(n=5)

Unnamed: 0,article_id,title,is_read,update_date
0,obschestvo-16667327,Рождество по григорианскому календарю: история...,0,2023-12-25
1,ekonomika-19267671,"Международная выставка-форум ""Россия""",0,2023-12-25
2,opinions-19491087,"""Украина нам не союзник"": Киеву указали на его...",0,2023-12-25
3,ekonomika-19487589,Прогноз по году: российская экономика под санк...,0,2023-12-25
4,obschestvo-16562831,"""С весельем и радостью каждый год восемь дней""...",0,2023-12-25


In [6]:
# Preview the first five paragraph rows as a sanity check before saving.
yandex_articlebody.head(n=5)

Unnamed: 0,contents,article_id,contents_id,update_date
0,С 24 на 25 декабря наступает один из самых зна...,obschestvo-16667327,0,2023-12-25
1,ТАСС рассказывает почему на самом деле не впол...,obschestvo-16667327,1,2023-12-25
2,В этот день верующие празднуют рождение младен...,obschestvo-16667327,2,2023-12-25
3,В этот день верующие празднуют рождение младен...,obschestvo-16667327,3,2023-12-25
4,Мнения историков о точной дате Рождества Христ...,obschestvo-16667327,4,2023-12-25


### save to db

In [7]:
## db save
# Build both row lists before opening the database so that a problem with the
# frames cannot leave a connection open.

## articletitle
# Tuple order: (article_id, title, is_read, update_date).
# NOTE(review): the INSERT names no columns, so this order must match the
# column order of the yandex_articletitle table -- confirm against the schema.
title_tuples = list(zip(
    yandex_articletitle.article_id,
    yandex_articletitle.title,
    yandex_articletitle.is_read,
    yandex_articletitle.update_date,
))

## articlebody
# Tuple order: (contents_id, contents, update_date, article_id).
# NOTE(review): contents_id restarts at 0 on every run; if it is the table's
# primary key a second run will presumably fail with an IntegrityError -- verify.
body_tuples = list(zip(
    yandex_articlebody.contents_id,
    yandex_articlebody.contents,
    yandex_articlebody.update_date,
    yandex_articlebody.article_id,
))

conn = sqlite3.connect('../rueng_projects/rueng.sqlite3')
try:
    # The connection context manager commits when both inserts succeed and
    # rolls back if either raises, so the two tables never end up half-written.
    with conn:
        cursor = conn.cursor()
        cursor.executemany("INSERT INTO yandex_articletitle VALUES (?, ?, ?, ?)", title_tuples)
        cursor.executemany("INSERT INTO yandex_articlebody VALUES (?, ?, ?, ?)", body_tuples)
finally:
    # `with conn` does not close the connection; do it explicitly, even on error.
    conn.close()