# **Scraping a website: Hacker News**

In [1]:
import re
import webbrowser
from dataclasses import dataclass
from time import time, sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup



## **2. Schematise the HTML structure by drawing a tree diagram**

This command opens the diagram file that I uploaded to my Google Drive.
The same file is also included in the .zip archive in case the link cannot be opened.

In [2]:
# Open the HTML tree diagram (hosted on Google Drive) in the web browser.
diagram_url = 'https://drive.google.com/file/d/1GUOLmITXO78E2WoPkhtNNYQj-7r2FLwn/view?usp=sharing'
webbrowser.open(diagram_url)

True

## **3. Get the HTML page in Python using the requests library**

In [3]:
# Download the front page. The timeout stops a stalled connection from hanging
# the notebook, and raise_for_status() turns an HTTP error response into an
# exception instead of handing an error page to the parser.
html_website = requests.get('https://news.ycombinator.com/', timeout=30)
html_website.raise_for_status()
soup = BeautifulSoup(html_website.text, 'html.parser')

## **4. Start parsing the page with Beautiful Soup**

### **b) Print all the titles on the homepage (30 names)**

In [4]:
# Collect the text of every title link found on the page.
titlelink = [anchor.text for anchor in soup.find_all('a', {'class': 'titlelink'})]
titlelink

['The computers used to do 3D animation for Final Fantasy VII in 1996',
 'Ruby 3.2 preview 1 with support for WASM compilation',
 'What software engineers can learn from the rapid collapse of Fast',
 "Show HN: Redditle.com – For those of us who add 'Reddit' to every Google search",
 'The ever-increasing walled-gardeness of Twitter',
 'TypeScript as fast as Rust: TypeScript++',
 'Newly Measured Particle Seems Heavy Enough to Break Known Physics',
 'The Muse (YC W12) Is Hiring a Senior Platform Engineer (Learn More)',
 'The 0.5 MB of nothing in all Apple Music files (2020)',
 'I stopped advertising and nothing happened',
 'ACM Opens First 50 Years Backfile',
 'Denonia: The First Malware Specifically Targeting AWS Lambda',
 'Tree-sitter grammar for org-mode',
 'A detailed look at the S-300P anti-aircraft missile system',
 'Microplastics found in live human lung tissue',
 'From context collapse to content collapse',
 'Canada to ban foreigners from buying homes',
 'Mantle – Serverless Maps 

In [5]:
# Sanity check: the homepage should yield 30 titles, one per title link.
len(titlelink)

30

## **5. Structure your information by creating a dataclass named Post containing all the relevant information**

In [6]:
# Creation of the Post class.
# Every attribute defaults to NaN except the rank and the post name.

@dataclass
class Post:
    """One Hacker News post as scraped from a listing page.

    ``rank`` and ``name`` are required. All other attributes default to
    ``np.nan`` so that a field the page does not provide appears as a
    missing value once the posts are loaded into a pandas DataFrame.
    Field order matches the constructor's positional order, and
    ``vars(post)`` returns the fields in that same order.
    """

    rank: str
    name: str
    # The remaining fields hold a string when present, np.nan otherwise.
    site: object = np.nan
    site_url: object = np.nan
    point: object = np.nan
    author: object = np.nan
    date_published: object = np.nan
    number_comments: object = np.nan

## **6. Parse the following information: name, points, author, date published, number of comments**

In [7]:
# The information for each post is spread over a block of three <tr> rows,
# which is why the loop below advances with a step of 3:
#   line1 - first row of the block: rank, title, site name and URL
#   line2 - second row of the block: points, author, publication date, comments
# The third row of each block carries nothing that is parsed here.
list_tr = soup.find('table', {'class': 'itemlist'}).find_all('tr')

# size is len(list_tr) - 2 because the last two <tr> rows are not posts.
size = len(list_tr) - 2

# Matches only a real comment counter such as '189 comments' or '1 comment'.
# Requiring the leading digits keeps any other text that merely contains
# 'comment' (a user name, for instance) from being mistaken for the counter;
# \s* accepts whatever whitespace separates the number from the word.
comment_count = re.compile(r'(\d+)\s*comments?')

all_posts = []

for index in range(0, size, 3):
    line1 = list_tr[index]
    # find_next_sibling('tr') always yields the following row, whereas a bare
    # .next_sibling returns a text node if whitespace separates the rows.
    line2 = line1.find_next_sibling('tr')

    # Rank and title are mandatory; look the title link up once and reuse it
    # for both the name and the URL.
    title_link = line1.find('a', {'class': 'titlelink'})
    post = Post(
        rank=line1.find('span', {'class': 'rank'}).text.replace('.', ''),
        name=title_link.text)
    post.site_url = title_link.get('href')

    # Every remaining attribute keeps its NaN default unless the page
    # actually contains the corresponding element.
    element = line1.find('span', {'class': 'sitestr'})
    if element:
        post.site = element.text

    element = line2.find('span', {'class': 'score'})
    if element:
        # Strip the unit in both its plural and singular form ('1 point').
        post.point = element.text.replace(' points', '').replace(' point', '')

    element = line2.find('a', {'class': 'hnuser'})
    if element:
        post.author = element.text

    element = line2.find('span', {'class': 'age'})
    if element:
        post.date_published = element.get('title')

    element = line2.find(string=comment_count)
    if element:
        # Keep only the digits: no trailing whitespace, no leftover 'comment'.
        post.number_comments = comment_count.search(element).group(1)

    # Save the current post
    all_posts.append(vars(post))

## **7. Write the data in a CSV**

In [8]:
# One row per scraped post, indexed by its rank, then saved as a CSV file.
posts_table = pd.DataFrame(all_posts)
data = posts_table.set_index('rank')
data.to_csv('posts.csv')
data.head()

rank,name,site,site_url,point,author,date_published,number_comments
1,The computers used to do 3D animation for Fina...,lunduke.substack.com,https://lunduke.substack.com/p/the-computers-u...,393,marcobambini,2022-04-07T16:55:20,189
2,Ruby 3.2 preview 1 with support for WASM compi...,ruby-lang.org,https://www.ruby-lang.org/en/news/2022/04/03/r...,210,pvsukale3,2022-04-07T17:38:00,46
3,What software engineers can learn from the rap...,pragmaticengineer.com,https://newsletter.pragmaticengineer.com/p/the...,220,gregdoesit,2022-04-07T17:23:16,144
4,Show HN: Redditle.com – For those of us who ad...,redditle.com,https://redditle.com,261,greentfrapp,2022-04-07T16:33:15,138
5,The ever-increasing walled-gardeness of Twitter,annoying.technology,https://annoying.technology/posts/e6901c0ea272...,87,matrixagent,2022-04-07T19:10:48,56


## **8. Write code to parse the first 5 pages of Hacker News**

In [9]:
def _parse_page_posts(soup):
    """Return one ``vars(Post)`` dict per post of a parsed listing page.

    The table with class 'itemlist' is read in blocks of three <tr> rows:
    the first row gives the rank, title, site name and URL, the second row
    gives the points, author, publication date and comment count. The last
    two <tr> rows of the table are not posts and are left out.
    """
    # Matches only a real comment counter ('189 comments', '1 comment'), never
    # another string that merely contains the word 'comment'.
    comment_count = re.compile(r'(\d+)\s*comments?')

    list_tr = soup.find('table', {'class': 'itemlist'}).find_all('tr')
    size = len(list_tr) - 2

    posts = []
    for index in range(0, size, 3):
        line1 = list_tr[index]
        # find_next_sibling('tr') skips any text node between the two rows,
        # which a bare .next_sibling would return instead of the row.
        line2 = line1.find_next_sibling('tr')

        # Rank and title are mandatory; the title link also gives the URL.
        title_link = line1.find('a', {'class': 'titlelink'})
        post = Post(
            rank=line1.find('span', {'class': 'rank'}).text.replace('.', ''),
            name=title_link.text)
        post.site_url = title_link.get('href')

        # The remaining attributes keep their NaN default when absent.
        element = line1.find('span', {'class': 'sitestr'})
        if element:
            post.site = element.text

        element = line2.find('span', {'class': 'score'})
        if element:
            # Strip the unit in both plural and singular form ('1 point').
            post.point = element.text.replace(' points', '').replace(' point', '')

        element = line2.find('a', {'class': 'hnuser'})
        if element:
            post.author = element.text

        element = line2.find('span', {'class': 'age'})
        if element:
            post.date_published = element.get('title')

        element = line2.find(string=comment_count)
        if element:
            # Keep only the digits of the counter.
            post.number_comments = comment_count.search(element).group(1)

        posts.append(vars(post))
    return posts


def parseFivePages(filename='posts_5_pages.csv', n_pages=5):
    """Scrape the first pages of Hacker News and write the posts to a CSV.

    Parameters
    ----------
    filename : str
        Path of the CSV file to write; an existing file is overwritten.
    n_pages : int
        Number of listing pages to fetch, starting at page 1 (default 5).

    Raises
    ------
    ValueError
        If ``n_pages`` is smaller than 1.
    requests.RequestException
        If a page cannot be downloaded (timeout, connection error or an
        HTTP error status).
    """
    if n_pages < 1:
        raise ValueError('n_pages must be at least 1')

    all_posts = []

    # Extraction of pages 1 to n_pages.
    for page in range(1, n_pages + 1):
        html_website = requests.get(f'https://news.ycombinator.com/?p={page}', timeout=30)
        # Stop on an HTTP error instead of parsing an error page.
        html_website.raise_for_status()
        soup = BeautifulSoup(html_website.text, 'html.parser')
        all_posts.extend(_parse_page_posts(soup))

    # Creation of the dataframe and the CSV file for all fetched pages.
    data = pd.DataFrame(all_posts)
    data = data.set_index('rank')
    data.to_csv(filename)

In [10]:
# Scrape the pages once and write them to the default 'posts_5_pages.csv'.
parseFivePages()

## **9. Think about a way to run your code every hour**

In [None]:
# Interval between two scrapes, in seconds:
# an hour has 60 minutes of 60 seconds each.
one_hours = 60 * 60

# Schedule each run against the clock. Sleeping a fixed hour after every
# scrape would push each following run back by the time the scrape itself took.
next_run = time() + one_hours

# Runs until the kernel is interrupted.
while True:
    sleep(max(0, next_run - time()))
    try:
        # Each run overwrites 'every_hours.csv' with the latest snapshot.
        parseFivePages('every_hours.csv')
    except requests.RequestException as error:
        # A transient network failure must not end the hourly schedule.
        print(f'Scrape failed, will retry at the next scheduled run: {error}')
    next_run += one_hours