In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [5]:
# Парсер на основе связки pandas, requests, bs4

def hacker_news(pages=10):

    news = pd.DataFrame(columns=['news', 'points'])

    # Функция расчета времени
    def get_load_time(article_url):
        # будем ждать 3 секунды, иначе выводить exception и присваивать константное значение
        try:

            # делаем запрос по url статьи article_url
            response = requests.get(
                article_url, stream=True, timeout=3.000
            )
            # получаем время загрузки страницы
            load_time = response.elapsed.total_seconds()
        except Exception as e:
            print(e)
            load_time = ">3"
        return load_time

    # Функция получения id, news_headline, rating, link
    def get_info(page=1):
        url = f'https://news.ycombinator.com/?p={page}'

        responce = requests.get(url)

        # Получаем заголовок
        page = pd.read_html(responce.text)[2]
        page.drop([1], axis=1, inplace=True)
        page.rename(columns={0: 'index', 2: 'news'}, inplace=True)

        page = page[(~page['news'].isna()) & (page['news'] != 'More')]

        # Получаем рейтинг
        points = pd.DataFrame(
            page['news'][page['index'].isna()].reset_index(drop=True))
        points.rename(columns={'news': 'points'}, inplace=True)

        # Проверяем есть ли у новости рейтинг, если его нет то присваиваем ноль
        points['contains_points'] = points['points'].str.contains("points")
        points['points'] = points['points'].str.split(' ')

        def set_points(row):
            if row['contains_points']:
                return row[0][0]
            else:
                return 0

        points['points'] = points.apply(set_points, axis=1)
        
        
        # Получаем колличество комментариев
        comments = pd.DataFrame(
            page['news'][page['index'].isna()].reset_index(drop=True))
        comments.rename(columns={'news': 'comments'}, inplace=True)
        
        comments['n_comments'] = comments['comments'].str.split('|')
        comments['contains_comments'] = comments['comments'].str.contains("comment*")
        comments['n_comments'] = comments['n_comments'].apply(lambda x: x[-1])


        def set_comments(row):
            if row['contains_comments']:
                return row[1]
            else:
                return '0 comments'

        # Чистим на отсутвие 
        comments['n_comments'] = comments.apply(set_comments, axis=1)

        # Прикручиваем рейтинг к новости
        news = pd.DataFrame(
            page['news'][~page['index'].isna()].reset_index(drop=True))

        result = pd.concat([news, points['points'], comments['n_comments']], axis=1)
        
        # Начинаем собирать ссылки и id
        soup = BeautifulSoup(responce.text, 'html.parser')

        link_list = []

        for i in range(len(soup.find_all(class_='title'))):
            try:
                link = soup.find_all(class_='title')[i].find('a')['href']
                text = soup.find_all(class_='title')[i].text

                if text != 'More':
                    link_list.append([text, link])
                else:
                    pass
            except:
                pass
        
        # Получаем id новостей
        trs = soup.find_all('tr', class_='athing')
        id_list = []

        for tr in trs:
            tr_id = tr['id']
            id_list.append(tr_id)

        id = pd.DataFrame(id_list, columns=['id'])
        
        # Получаем ссылки новостей
        links = pd.DataFrame(link_list)
        links = pd.concat([id, links], axis=1)

        links.rename(columns={0: 'news', 1: 'link'}, inplace=True)

        # Прикручиваем ссылки
        result = pd.merge(left=result, right=links)

        # Расчитываем время
        result['load_time'] = result['link'].apply(get_load_time)

        return result

    for i in range(pages):
        news = pd.concat([news, get_info(i+1)])

    news.reset_index(drop=True, inplace=True)

    return news


test = hacker_news(10)

test

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
HTTPSConnectionPool(host='www.bbc.com', port=443): Max retries exceeded with url: /news/world-australia-67340901 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate (_ssl.c:1002)')))
HTTPSConnectionPool(host='www.bbc.com', port=443): Max retries exceeded with url: /news/uk-england-wiltshire-67336495 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate (_ssl.c:1002)')))
HTTPSConnectionPool(host='www.telegraph.co.uk', port=443): Max retries exceeded with url: /politics/2023/11/07/kings-speech-driverless-cars-users-not-prosecuted/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate (_ssl.c:1002)')))
Invalid URL 'item?id=38184195': No scheme supplied. Perhaps you meant https:

Unnamed: 0,news,points,n_comments,id,link,load_time
0,Northlight technology in Alan Wake 2 (remedyga...,349,167 comments,38180846,https://www.remedygames.com/article/how-northl...,0.65255
1,Oh my poor business logic (rednafi.com),92,41 comments,38159363,https://rednafi.com/misc/oh_my_poor_business_l...,0.39949
2,"Go, Containers, and the Linux Scheduler (river...",245,90 comments,38181346,https://www.riverphillips.dev/blog/go-cfs/,0.314109
3,Perfect Dark: Recompiled (hackaday.com),34,10 comments,38159905,https://hackaday.com/2023/11/05/perfect-dark-r...,0.381835
4,Interactive examples for learning jq (ishan.page),38,5 comments,38186153,https://ishan.page/blog/2023-11-06-jq-by-example/,1.454779
...,...,...,...,...,...,...
295,RenderDoc is a free MIT licensed stand-alone g...,15,1 comment,38154296,https://renderdoc.org/,1.389319
296,Representations and srategies for games with i...,90,5 comments,38144772,https://www2.cs.sfu.ca/~bbart/personal/masters...,1.520471
297,Mass producing the most expensive rice cooker ...,125,225 comments,38142586,https://www.youtube.com/watch?v=xLCwr8qG1p4,0.294055
298,Cortextual (cortextual.net),68,12 comments,38149839,https://cortextual.net/,0.43771


In [10]:
with pd.ExcelWriter("./data/hacker.xlsx") as writer:
    test.to_excel(writer, sheet_name='hacker_news', index=False)

---