In [1]:
import requests
import re
from bs4 import BeautifulSoup

In [27]:
articles = []

url = 'https://news.ycombinator.com/news'

r = requests.get(url)
html_soup = BeautifulSoup(r.text, 'html.parser')

In [29]:
for item in html_soup.find_all('tr', class_="athing"):
    #print(item)
    
    item_a = item.find('a', class_="storylink")
    item_link = item_a.get('href') if item_a else None
    
    item_text = item_a.get_text(strip=True) if item_a else None
    next_row = item.find_next_sibling('tr')
    item_score = next_row.find('span', class_="score")
    item_score = item_score.get_text(strip=True) if item_score else '0 points'
    
    item_comments = next_row.find('a', string=re.compile('\d+(\s)comment(s?)')) 
    #repalce non-breaking space
    item_comments = item_comments.get_text(strip=True).replace('\xa0', ' ') if item_comments else '0 comments'
    
    articles.append( {
        'link' : item_link,
        'title' : item_text,
        'score' : item_score,
        'comments' : item_comments
    })
    

    
articles[:2]

[{'link': 'https://www.quora.com/Experienced-programmers-and-computer-scientists-what-are-some-really-old-or-even-nearly-forgotten-books-you-think-every-new-programmer-should-read/answer/Alan-Kay-11?share=1',
  'title': "Alan Kay's answer to 'what are some forgotten books programmers should read'",
  'score': '195 points',
  'comments': '44 comments'},
 {'link': 'https://www.nytimes.com/2019/08/08/style/linkedin-social-media.html',
  'title': 'Why Aren’t We Talking About LinkedIn?',
  'score': '41 points',
  'comments': '39 comments'}]

## scrape from json

In [None]:
articles = []

url = 'https://hacker-news.firebaseio.com/v0'

top_stories = requests.get(url + '/topstories.json').json()

for story_id in top_stories:
    story_url = url + '/item/{}.json'.format(story_id)
    print('Fetching:', story_url)
    r = requests.get(story_url)
    story_dict = r.json()
    articles.append(story_dict)
    
for i,article in enumerate(articles):
    if i is 3:
        break
    print(article)

## Scrape data from multiple pages

In [2]:
import requests
import dataset
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

In [17]:

authors_seen = set()
data = []
base_url = 'http://quotes.toscrape.com/'

def clean_url(url):
    # Clean '/author/Steve-Martin' to 'Steve-Martin'
    # Use urljoin to make an absolute URL
    url = urljoin(base_url, url)
    # Use urlparse to get out the path part
    path = urlparse(url).path
    # Now split the path by '/' and get the second part
    # E.g. '/author/Steve-Martin' -> ['', 'author', 'Steve-Martin']
    return path.split('/')[2]

In [18]:


def scrape_quotes(html_soup):
    for quote in html_soup.select('div.quote'):
        quote_text = quote.find(class_='text').get_text(strip=True)
        quote_author = quote.find(class_='author').get_text(strip=True)
        #quote_author_url = quote.find(class_='author').find_next_sibling('a').get('href')

        quote_tag_urls = [clean_url(a.get('href')) for a in quote.find_all('a', class_="tag")]


        data.append( {
            'quote_author' : quote_author,
            'quote_text' : quote_text if quote_text else None,
            'quote_tags' : quote_tag_urls
        } )
        


In [20]:

url = base_url
while True:                                          
    print('Now scraping page:', url)
    r = requests.get(url)
    html_soup = BeautifulSoup(r.text, 'html.parser')
    # Scrape the quotes
    scrape_quotes(html_soup)
    # Is there a next page?
    next_a = html_soup.select('li.next > a')
    if not next_a or not next_a[0].get('href'):
        break
    url = urljoin(url, next_a[0].get('href'))

Now scraping page: http://quotes.toscrape.com/
Now scraping page: http://quotes.toscrape.com/page/2/
Now scraping page: http://quotes.toscrape.com/page/3/
Now scraping page: http://quotes.toscrape.com/page/4/
Now scraping page: http://quotes.toscrape.com/page/5/
Now scraping page: http://quotes.toscrape.com/page/6/
Now scraping page: http://quotes.toscrape.com/page/7/
Now scraping page: http://quotes.toscrape.com/page/8/
Now scraping page: http://quotes.toscrape.com/page/9/
Now scraping page: http://quotes.toscrape.com/page/10/


In [23]:
len(data)

170

## Scrape github stars

In [26]:
import requests
from bs4 import BeautifulSoup
import re

session = requests.Session()

url = 'https://github.com/{}'
username = 'google'

r = session.get(url.format(username), params={'page': 1, 'tab': 'repositories'})
html_soup = BeautifulSoup(r.text, 'html.parser')

In [36]:
repos = html_soup.find(class_='repo-list').find_all('li')
for repo in repos:
    
    name = repo.find('h3').find('a').get_text(strip=True)
    # <span>
    language = repo.find(attrs={'itemprop': 'programmingLanguage'})
    language = language.get_text(strip=True) if language else 'unknown'
    stars = repo.find('a', attrs={'href': re.compile('\/stargazers')})
    stars = int(stars.get_text(strip=True).replace(',', '')) if stars else 0


## Scraping IMDB Ratings

In [39]:
import requests
from bs4 import BeautifulSoup

url = 'http://www.imdb.com/title/tt0944947/episodes'

episodes = []
ratings = []

In [41]:
for season in range(1, 3):
    r = requests.get(url, params={'season': season})
    soup = BeautifulSoup(r.text, 'html.parser')
    listing = soup.find('div', class_="eplist")
    
    for epnr, div in enumerate(listing.find_all('div', recursive=False)):
        episode = "{}.{}".format(season, epnr + 1)
        rating_el = div.find(class_='ipl-rating-star__rating')
        rating = float(rating_el.get_text(strip=True))
        print('Episode:', episode, '-- rating:', rating)
        episodes.append(episode)
        ratings.append(rating)

Episode: 1.1 -- rating: 9.1
Episode: 1.2 -- rating: 8.8
Episode: 1.3 -- rating: 8.7
Episode: 1.4 -- rating: 8.8
Episode: 1.5 -- rating: 9.1
Episode: 1.6 -- rating: 9.2
Episode: 1.7 -- rating: 9.3
Episode: 1.8 -- rating: 9.0
Episode: 1.9 -- rating: 9.6
Episode: 1.10 -- rating: 9.5
Episode: 2.1 -- rating: 8.8
Episode: 2.2 -- rating: 8.6
Episode: 2.3 -- rating: 8.9
Episode: 2.4 -- rating: 8.8
Episode: 2.5 -- rating: 8.8
Episode: 2.6 -- rating: 9.1
Episode: 2.7 -- rating: 8.9
Episode: 2.8 -- rating: 8.8
Episode: 2.9 -- rating: 9.7
Episode: 2.10 -- rating: 9.4
