# Web Scraping with BeautifulSoup

In [None]:
import requests
from bs4 import BeautifulSoup
import re

## Get with `requests`

In [83]:
# Get a webpage and create a Response object
url = 'https://github.com/umitkacar'
r = requests.get(url)

In [84]:
r

<Response [200]>

In [85]:
r.status_code

200

In [86]:
type(r)

requests.models.Response

In [87]:
print(r.content)

b'\n\n<!DOCTYPE html>\n<html lang="en">\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars0.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://avatars1.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://avatars2.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://avatars3.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n\n\n\n  <link crossorigin="anonymous" media="all" integrity="sha512-FG+rXqMOivrAjdEQE7tO4BwM1poGmg70hJFTlNSxjX87grtrZ6UnPR8NkzwUHlQEGviu9XuRYeO8zH9YwvZhdg==" rel="stylesheet" href="https://github.githubassets.com/assets/frameworks-146fab5ea30e8afac08dd11013bb4ee0.css" />\n  <link crossorigin="anonymous" media="all" integrity="sha512-kKM4skZOwyHkJcb5b9LmdZ40Y9HVj6cr4jlgNXoE/L9vuITIljgGiCqAEa+ywheSL

In [88]:
type(r.content), type(r.text)

(bytes, str)

## Parse with `BeautifulSoup`

In [89]:
soup = BeautifulSoup(r.text,'html.parser')
type(soup)

bs4.BeautifulSoup

In [90]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <link href="https://github.githubassets.com" rel="dns-prefetch"/>
  <link href="https://avatars0.githubusercontent.com" rel="dns-prefetch"/>
  <link href="https://avatars1.githubusercontent.com" rel="dns-prefetch"/>
  <link href="https://avatars2.githubusercontent.com" rel="dns-prefetch"/>
  <link href="https://avatars3.githubusercontent.com" rel="dns-prefetch"/>
  <link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
  <link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
  <link crossorigin="anonymous" href="https://github.githubassets.com/assets/frameworks-146fab5ea30e8afac08dd11013bb4ee0.css" integrity="sha512-FG+rXqMOivrAjdEQE7tO4BwM1poGmg70hJFTlNSxjX87grtrZ6UnPR8NkzwUHlQEGviu9XuRYeO8zH9YwvZhdg==" media="all" rel="stylesheet">
   <link crossorigin="anonymous" href="https://github.githubassets.com/assets/site-90a338b2464ec321e425c6f96fd2e675.css" integrity="sha512-kKM4skZOw

In [97]:
list(soup.children)

['\n',
 'html',
 '\n',
 <html lang="en">
 <head>
 <meta charset="utf-8"/>
 <link href="https://github.githubassets.com" rel="dns-prefetch"/>
 <link href="https://avatars0.githubusercontent.com" rel="dns-prefetch"/>
 <link href="https://avatars1.githubusercontent.com" rel="dns-prefetch"/>
 <link href="https://avatars2.githubusercontent.com" rel="dns-prefetch"/>
 <link href="https://avatars3.githubusercontent.com" rel="dns-prefetch"/>
 <link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
 <link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
 <link crossorigin="anonymous" href="https://github.githubassets.com/assets/frameworks-146fab5ea30e8afac08dd11013bb4ee0.css" integrity="sha512-FG+rXqMOivrAjdEQE7tO4BwM1poGmg70hJFTlNSxjX87grtrZ6UnPR8NkzwUHlQEGviu9XuRYeO8zH9YwvZhdg==" media="all" rel="stylesheet">
 <link crossorigin="anonymous" href="https://github.githubassets.com/assets/site-90a338b2464ec321e425c6f96fd2e675.css" integrity="sha512-kKM4skZOwyHk

In [None]:
[type(item) for item in list(soup.children)]

In [99]:
list(soup.children)[3]

<html lang="en">
<head>
<meta charset="utf-8"/>
<link href="https://github.githubassets.com" rel="dns-prefetch"/>
<link href="https://avatars0.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars1.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars2.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars3.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
<link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/frameworks-146fab5ea30e8afac08dd11013bb4ee0.css" integrity="sha512-FG+rXqMOivrAjdEQE7tO4BwM1poGmg70hJFTlNSxjX87grtrZ6UnPR8NkzwUHlQEGviu9XuRYeO8zH9YwvZhdg==" media="all" rel="stylesheet">
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/site-90a338b2464ec321e425c6f96fd2e675.css" integrity="sha512-kKM4skZOwyHkJcb5b9LmdZ40Y9HVj6cr4jlgNXoE/L9vuIT

In [105]:
list(soup.children)[3].text

'\n\n\n\n\n\n\n\n\n\n\n\n\n\numitkacar (Senior R&D Engineer) · GitHub\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\n\n\n\n              Sign\xa0up\n            \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                    Why GitHub?\n                    \n\n\n\n\nFeatures →\n\nCode review\nProject management\nIntegrations\nActions\nPackages\nSecurity\nTeam management\nHosting\n\n\nCustomer stories →\nSecurity →\n\n\n\n\n\nTeam\n\n\nEnterprise\n\n\n\n\n                    Explore\n                    \n\n\n\n\n\nExplore GitHub →\n\nLearn & contribute\n\nTopics\nCollections\nTrending\nLearning Lab\nOpen source guides\n\nConnect with others\n\nEvents\nCommunity forum\nGitHub Education\n\n\n\n\n\nMarketplace\n\n\n\n\n                    Pricing\n                    \n\n\n\n\nPlans →\n\nCompare plans\nContact Sales\n\n\nNonprofit →\nEducation →\n\n\n\n\n\n\n\n\n\n\n'

In [106]:
soup.title

<title>umitkacar (Senior R&amp;D Engineer) · GitHub</title>

In [107]:
soup.find('a')

<a class="px-2 py-4 bg-blue text-white show-on-focus js-skip-to-content" href="#start-of-content">Skip to content</a>

In [108]:
len(soup.find_all('a'))

95

In [109]:
soup.find_all('a')

[<a class="px-2 py-4 bg-blue text-white show-on-focus js-skip-to-content" href="#start-of-content">Skip to content</a>,
 <a aria-label="Homepage" class="mr-4" data-ga-click="(Logged out) Header, go to homepage, icon:logo-wordmark" href="https://github.com/">
 <svg aria-hidden="true" class="octicon octicon-mark-github text-white" height="32" version="1.1" viewbox="0 0 16 16" width="32"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z" fill-rule="evenodd"></path></svg>

In [None]:
soup.find_all('a', class_='text-bold flex-auto min-width-0')

In [None]:
soup.find_all('span', class_='repo')

In [110]:
repos = soup.find_all('a', class_='text-bold flex-auto min-width-0')
repos

[<a class="text-bold flex-auto min-width-0" href="/umitkacar/ImageLabeler_EarDetection">
 <span class="repo" title="ImageLabeler_EarDetection">ImageLabeler_EarDetection</span>
 </a>,
 <a class="text-bold flex-auto min-width-0" href="/umitkacar/CornerNet">
 <span class="repo" title="CornerNet">CornerNet</span>
 </a>,
 <a class="text-bold flex-auto min-width-0" href="/umitkacar/PADify">
 <span class="repo" title="PADify">PADify</span>
 </a>,
 <a class="text-bold flex-auto min-width-0" href="/umitkacar/streamlit">
 <span class="repo" title="streamlit">streamlit</span>
 </a>,
 <a class="text-bold flex-auto min-width-0" href="/umitkacar/Pytorch-Learning">
 <span class="repo" title="Pytorch-Learning">Pytorch-Learning</span>
 </a>,
 <a class="text-bold flex-auto min-width-0" href="/umitkacar/SigNet">
 <span class="repo" title="SigNet">SigNet</span>
 </a>]

In [111]:
type(repos[1])

bs4.element.Tag

In [115]:
repos = soup.find_all('a', class_='text-bold flex-auto min-width-0', href=True)

In [118]:
['github.com' + repo['href'] for repo in repos]

['github.com/umitkacar/ImageLabeler_EarDetection',
 'github.com/umitkacar/CornerNet',
 'github.com/umitkacar/PADify',
 'github.com/umitkacar/streamlit',
 'github.com/umitkacar/Pytorch-Learning',
 'github.com/umitkacar/SigNet']

In [None]:
[repo.text.strip() for repo in repos]

In [None]:
[repo.find('a') for repo in repos]