# Data Acquisition, Web Scraping Notebook

In [1]:
import requests
import bs4

In [3]:
#make the http request and turn the response into a beautiful soup object
#timeout: requests waits indefinitely by default, which would hang the notebook on a stalled connection
response = requests.get('https://web-scraping-demo.zgulde.net/news', timeout=10)
#fail fast on 4xx/5xx instead of silently parsing an error page
response.raise_for_status()
html = response.text
#name the parser explicitly so the parse tree does not depend on which parsers happen to be installed
soup = bs4.BeautifulSoup(html, 'html.parser')

In [4]:
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>News Example Page</title>
<link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" rel="stylesheet"/>
</head>
<body class="mx-auto max-w-screen-lg pb-32">
<h1 class="my-5 text-4xl text-center">News!</h1>
<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">
<p>
<i class="bi bi-exclamation-circle text-xl"></i>
        All data on this page is strictly for demonstration purposes and fake.
    </p>
</div>
<div class="grid gap-y-12">
<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
<img src="/static/placeholder.png"/>
<div class="col-span-3 space-y-3 py-3">
<h2 class="text-2xl text-green-900">blood conference

In [5]:
#One way
article_container = soup.select('.grid.gap-y-12')[0]

In [6]:
articles = article_container.children

In [7]:
list(articles)

['\n',
 <div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
 <img src="/static/placeholder.png"/>
 <div class="col-span-3 space-y-3 py-3">
 <h2 class="text-2xl text-green-900">blood conference case</h2>
 <div class="grid grid-cols-2 italic">
 <p> 2007-12-22 </p>
 <p class="text-right">By Katie Brock </p>
 </div>
 <p>Trip buy it. Soldier produce say current fast action everybody suddenly. Civil south model arrive scene. Member south deal budget fear.
 Cold admit memory. Brother right budget where herself discussion rise. Detail election since.</p>
 </div>
 </div>,
 '\n',
 <div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
 <img src="/static/placeholder.png"/>
 <div class="col-span-3 space-y-3 py-3">
 <h2 class="text-2xl text-green-900">well population development</h2>
 <div class="grid grid-cols-2 italic">
 <p> 2008-02-01 </p>
 <p class="text-right">By Randy Gilbert </p

In [9]:
#Second way
articles = article_container.select('.grid.grid-cols-4.gap-x-4.border')

In [10]:
#return the first element
article = articles[0]

In [11]:
#get a printed rep of the element
print(article.prettify())

<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
 <img src="/static/placeholder.png"/>
 <div class="col-span-3 space-y-3 py-3">
  <h2 class="text-2xl text-green-900">
   blood conference case
  </h2>
  <div class="grid grid-cols-2 italic">
   <p>
    2007-12-22
   </p>
   <p class="text-right">
    By Katie Brock
   </p>
  </div>
  <p>
   Trip buy it. Soldier produce say current fast action everybody suddenly. Civil south model arrive scene. Member south deal budget fear.
Cold admit memory. Brother right budget where herself discussion rise. Detail election since.
  </p>
 </div>
</div>



In [12]:
title = article.find('h2').text

In [14]:
date_and_byline_div = article.select('.grid.grid-cols-2.italic')[0]
date_and_byline_div

<div class="grid grid-cols-2 italic">
<p> 2007-12-22 </p>
<p class="text-right">By Katie Brock </p>
</div>

In [16]:
date_p, by_p = date_and_byline_div.find_all('p')
date_p.text

' 2007-12-22 '

In [17]:
by_p.text

'By Katie Brock '

In [19]:
summary = article.find_all('p')[-1].text
summary

'Trip buy it. Soldier produce say current fast action everybody suddenly. Civil south model arrive scene. Member south deal budget fear.\nCold admit memory. Brother right budget where herself discussion rise. Detail election since.'

In [20]:
def process_article(article):
    """Turn one article element into a flat dict of its text fields.

    Parameters
    ----------
    article : element exposing ``find``, ``find_all`` and ``select``
        (here: one of the article ``<div>`` tags selected from the news page).

    Returns
    -------
    dict
        Keys ``title``, ``date``, ``by`` and ``summary``. Every value is the
        element's ``.text`` exactly as found; surrounding whitespace is
        not removed.

    Raises
    ------
    IndexError
        If no ``.grid.grid-cols-2.italic`` element (or no ``<p>``) is present.
    ValueError
        If the date/byline row does not hold exactly two ``<p>`` tags.
    """
    # Headline is the article's first <h2>.
    headline_text = article.find('h2').text

    # The italic two-column row holds exactly two paragraphs: date, then byline.
    meta_row = article.select('.grid.grid-cols-2.italic')[0]
    published_tag, author_tag = meta_row.find_all('p')

    # The body text is the last <p> inside the article.
    body_text = article.find_all('p')[-1].text

    record = {
        "title": headline_text,
        "date": published_tag.text,
        "by": author_tag.text,
        "summary": body_text,
    }
    return record

In [21]:
process_article(articles[3])

{'title': 'suddenly try give',
 'date': ' 2005-12-13 ',
 'by': 'By Joshua Cain ',
 'summary': 'Go during rise ever face state. Participant situation somebody. Capital suddenly should democratic.\nCertainly interview next time. Establish name successful ok wide key. Think personal support main sell pretty government fly.'}

In [22]:
[process_article(article) for article in articles]

[{'title': 'blood conference case',
  'date': ' 2007-12-22 ',
  'by': 'By Katie Brock ',
  'summary': 'Trip buy it. Soldier produce say current fast action everybody suddenly. Civil south model arrive scene. Member south deal budget fear.\nCold admit memory. Brother right budget where herself discussion rise. Detail election since.'},
 {'title': 'well population development',
  'date': ' 2008-02-01 ',
  'by': 'By Randy Gilbert ',
  'summary': 'Three push last radio look man character. Issue get course research this. Assume benefit same gun water need long.\nAdult despite relationship region. Pull have per close concern. Main as one let training perhaps.'},
 {'title': 'trade go increase',
  'date': ' 2011-04-18 ',
  'by': 'By Christina Gould ',
  'summary': 'Catch leader agreement offer your. Often provide never face exactly.\nMr high deep. Race article including list. May quickly discover interview.'},
 {'title': 'suddenly try give',
  'date': ' 2005-12-13 ',
  'by': 'By Joshua Cain ',

In [23]:
import pandas as pd

pd.DataFrame([process_article(article) for article in articles])

Unnamed: 0,title,date,by,summary
0,blood conference case,2007-12-22,By Katie Brock,Trip buy it. Soldier produce say current fast ...
1,well population development,2008-02-01,By Randy Gilbert,Three push last radio look man character. Issu...
2,trade go increase,2011-04-18,By Christina Gould,Catch leader agreement offer your. Often provi...
3,suddenly try give,2005-12-13,By Joshua Cain,Go during rise ever face state. Participant si...
4,deep action talk,1982-12-08,By Jose Blackwell,Little organization opportunity they produce. ...
5,cut look minute,2008-02-28,By William Johnson,Any court site despite kitchen institution pro...
6,such film my,2007-03-02,By Janet Ortiz,Mr tax analysis also myself.\nShe say she reli...
7,too future join,2007-06-28,By Mrs. Christine Hancock MD,Tax administration lot doctor include about re...
8,join window beat,1981-11-30,By Brenda Huffman,Actually development attorney. Age outside Ame...
9,production long loss,1985-10-14,By Courtney Thompson,Quality beautiful her ten once tell use everyo...


### Mini Exercise

In [24]:
#Step 1: Make the http request and turn the response into a beautiful soup object
#timeout: requests waits indefinitely by default, which would hang the notebook on a stalled connection
response = requests.get('https://web-scraping-demo.zgulde.net/people', timeout=10)
#fail fast on 4xx/5xx instead of silently parsing an error page
response.raise_for_status()
html = response.text
#name the parser explicitly so the parse tree does not depend on which parsers happen to be installed
soup = bs4.BeautifulSoup(html, 'html.parser')
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Example People Page</title>
<link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" rel="stylesheet"/>
</head>
<body class="mx-auto max-w-screen-lg pb-32">
<h1 class="my-5 text-4xl text-center">People</h1>
<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">
<p>
<i class="bi bi-exclamation-circle text-xl"></i>
        All data on this page is strictly for demonstration purposes and fake.
    </p>
</div>
<div class="grid grid-cols-2 gap-x-12 gap-y-16" id="people">
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Kathleen Silva<

In [25]:
#Step 2: Select the container with all people
people_container = soup.select('.grid.grid-cols-2.gap-x-12.gap-y-16')[0]
people_container

<div class="grid grid-cols-2 gap-x-12 gap-y-16" id="people">
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Kathleen Silva</h2>
<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
            "Cloned bandwidth-monitored flexibility"
        </p>
<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">smithkatherine@williams-cook.com</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">988.834.4291</p>
</div>
<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                53227 Ricky Mills <br/>
                Robertside, IA 40909
            </p>
</div>
</div>
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h

In [29]:
#Identify Individual People
people = people_container.select('.grid.grid-cols-2.gap-x-3')

In [31]:
#Check the length of People is expected
len(people)

10

In [34]:
#return the first person in people
person = people[0]
print(person.prettify())

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
 <h2 class="text-2xl text-purple-800 name col-span-full border-b">
  Kathleen Silva
 </h2>
 <p class="quote col-span-full px-5 py-5 text-center text-gray-500">
  "Cloned bandwidth-monitored flexibility"
 </p>
 <div class="grid grid-cols-9">
  <i class="bi bi-envelope-fill text-purple-800">
  </i>
  <p class="email col-span-8">
   smithkatherine@williams-cook.com
  </p>
  <i class="bi bi-telephone-fill text-purple-800">
  </i>
  <p class="phone col-span-8">
   988.834.4291
  </p>
 </div>
 <div class="address grid grid-cols-9">
  <i class="bi bi-geo-fill text-purple-800">
  </i>
  <p class="col-span-8">
   53227 Ricky Mills
   <br/>
   Robertside, IA 40909
  </p>
 </div>
</div>



In [35]:
#Step 3: Get the individual elements

#Name of person
name = person.find('h2').text
name

'Kathleen Silva'

In [139]:
#quote for person
quote = person.find('p').text.strip()
quote

'"Cloned bandwidth-monitored flexibility"'

In [45]:
#get e-mail and phone
email_phone = person.select('.grid.grid-cols-9')[0]
email_phone

<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">smithkatherine@williams-cook.com</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">988.834.4291</p>
</div>

In [46]:
#Break out email and phone
email_p, phone_p = email_phone.find_all('p')

#text for email
email_p.text

'smithkatherine@williams-cook.com'

In [47]:
#text for phone
phone_p.text

'988.834.4291'

In [86]:
#Get address element
address = person.select('.address.grid.grid-cols-9')[0]
address

<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                53227 Ricky Mills <br/>
                Robertside, IA 40909
            </p>
</div>

In [83]:
#Alternative way to reach the address block.
#select() only understands a CSS selector; a find_all-style class_ keyword has no effect on it
#(the result was simply the last <div>), so the class filter belongs in the selector itself.
address2 = person.select('div.address')[-1]
address2.text

'\n\n\n                53227 Ricky Mills \n                Robertside, IA 40909\n            \n'

In [88]:
#break out address
address.text

'\n\n\n                53227 Ricky Mills \n                Robertside, IA 40909\n            \n'

In [145]:
def process_person(person):
    """Turn one person card element into a flat dict of its text fields.

    Parameters
    ----------
    person : element exposing ``find``, ``find_all`` and ``select``
        (here: one of the ``.person`` ``<div>`` tags from the people page).

    Returns
    -------
    dict
        Keys ``name``, ``quote``, ``email``, ``phone`` and ``address``.
        ``quote`` and ``address`` have surrounding whitespace stripped;
        the other values are the element's ``.text`` unchanged.

    Raises
    ------
    IndexError
        If either ``select`` call finds no matching element.
    ValueError
        If the contact block does not hold exactly two ``<p>`` tags.
    """
    full_name = person.find('h2').text

    # The quote is the first <p> in the card.
    tagline = person.find('p').text.strip()

    # Both the contact block and the address block match '.grid.grid-cols-9';
    # the contact block comes first, and its two paragraphs are email, then phone.
    contact_block = person.select('.grid.grid-cols-9')[0]
    email_tag, phone_tag = contact_block.find_all('p')

    address_block = person.select('.address.grid.grid-cols-9')[0]

    record = {
        "name": full_name,
        "quote": tagline,
        "email": email_tag.text,
        "phone": phone_tag.text,
        "address": address_block.text.strip(),
    }
    return record

In [146]:
process_person(people[2])

{'name': 'Brittany Foster',
 'quote': '"Quality-focused dedicated system engine"',
 'email': 'millsnicholas@long.com',
 'phone': '924-562-7886',
 'address': '220 Virginia Common Suite 588 \n                Ethanside, IA 18549'}

In [147]:
df = pd.DataFrame([process_person(person) for person in people])
df

Unnamed: 0,name,quote,email,phone,address
0,Kathleen Silva,"""Cloned bandwidth-monitored flexibility""",smithkatherine@williams-cook.com,988.834.4291,53227 Ricky Mills \n Robertside...
1,Richard Whitney,"""Face-to-face background approach""",cphillips@barry.com,176-727-5936,902 Galvan Streets Suite 694 \n ...
2,Brittany Foster,"""Quality-focused dedicated system engine""",millsnicholas@long.com,924-562-7886,220 Virginia Common Suite 588 \n ...
3,Shawn Hamilton,"""Extended multimedia protocol""",dukerachael@hotmail.com,001-507-956-4595x28585,01918 Kathryn Flat Apt. 976 \n ...
4,Amanda Garcia,"""Quality-focused client-driven secured line""",wpeters@carpenter.biz,692.389.3064x0213,96736 Steven Crossroad \n Lake ...
5,John Tapia,"""Organic responsive policy""",michael31@zimmerman.com,003.179.6964x52515,080 Miller Cliffs \n Kristaches...
6,Ernest Taylor,"""Multi-tiered fault-tolerant extranet""",angela11@sanchez.com,(120)803-2185x0214,9051 Pace Radial \n South Jason...
7,Michael Farley,"""Decentralized stable throughput""",brownmatthew@hotmail.com,003-655-9102,03915 Christian Dam \n Lake Ant...
8,Ian Hawkins,"""Customer-focused dynamic groupware""",janet77@jimenez.biz,755.221.3640x5396,6286 Garza Overpass \n Lake Ant...
9,Tina Miller,"""Total regional website""",rogerstevens@yahoo.com,349.954.2132x390,5996 Graves Groves Apt. 382 \n ...


In [96]:
import re

In [143]:
#Clean up quote: drop any newline characters.
#Use a real newline with regex=False: the default of `regex` differs between pandas versions, and
#with regex=False the raw string r'\n' is a literal backslash + 'n' that never matches.
#(process_person already strips the quote at both ends, so this only affects embedded newlines.)
df['quote'] = df['quote'].str.replace('\n', '', regex=False)

In [144]:
df

Unnamed: 0,name,quote,email,phone,address
0,Kathleen Silva,"""Cloned bandwidth-monitored flexibility""",smithkatherine@williams-cook.com,988.834.4291,\n\n\n 53227 Ricky Mills \n ...
1,Richard Whitney,"""Face-to-face background approach""",cphillips@barry.com,176-727-5936,\n\n\n 902 Galvan Streets Suite...
2,Brittany Foster,"""Quality-focused dedicated system engine""",millsnicholas@long.com,924-562-7886,\n\n\n 220 Virginia Common Suit...
3,Shawn Hamilton,"""Extended multimedia protocol""",dukerachael@hotmail.com,001-507-956-4595x28585,\n\n\n 01918 Kathryn Flat Apt. ...
4,Amanda Garcia,"""Quality-focused client-driven secured line""",wpeters@carpenter.biz,692.389.3064x0213,\n\n\n 96736 Steven Crossroad \...
5,John Tapia,"""Organic responsive policy""",michael31@zimmerman.com,003.179.6964x52515,\n\n\n 080 Miller Cliffs \n ...
6,Ernest Taylor,"""Multi-tiered fault-tolerant extranet""",angela11@sanchez.com,(120)803-2185x0214,\n\n\n 9051 Pace Radial \n ...
7,Michael Farley,"""Decentralized stable throughput""",brownmatthew@hotmail.com,003-655-9102,\n\n\n 03915 Christian Dam \n ...
8,Ian Hawkins,"""Customer-focused dynamic groupware""",janet77@jimenez.biz,755.221.3640x5396,\n\n\n 6286 Garza Overpass \n ...
9,Tina Miller,"""Total regional website""",rogerstevens@yahoo.com,349.954.2132x390,\n\n\n 5996 Graves Groves Apt. ...


In [107]:
#Clean up address: drop runs of three consecutive newlines.
#Use real newlines with regex=False: the default of `regex` differs between pandas versions, and
#with regex=False the raw string r'\n\n\n' is literal backslashes that never match.
#(process_person already strips the address at both ends, so the leading run is normally gone.)
df['address'] = df['address'].str.replace('\n\n\n', '', regex=False)

In [108]:
df

Unnamed: 0,name,quote,email,phone,address
0,Kathleen Silva,"""Cloned bandwidth-monitored flexib...",smithkatherine@williams-cook.com,988.834.4291,53227 Ricky Mills \n ...
1,Richard Whitney,"""Face-to-face background approach""...",cphillips@barry.com,176-727-5936,902 Galvan Streets Suite 694 \...
2,Brittany Foster,"""Quality-focused dedicated system ...",millsnicholas@long.com,924-562-7886,220 Virginia Common Suite 588 ...
3,Shawn Hamilton,"""Extended multimedia protocol"" ...",dukerachael@hotmail.com,001-507-956-4595x28585,01918 Kathryn Flat Apt. 976 \n...
4,Amanda Garcia,"""Quality-focused client-driven sec...",wpeters@carpenter.biz,692.389.3064x0213,96736 Steven Crossroad \n ...
5,John Tapia,"""Organic responsive policy""",michael31@zimmerman.com,003.179.6964x52515,080 Miller Cliffs \n ...
6,Ernest Taylor,"""Multi-tiered fault-tolerant extra...",angela11@sanchez.com,(120)803-2185x0214,9051 Pace Radial \n ...
7,Michael Farley,"""Decentralized stable throughput"" ...",brownmatthew@hotmail.com,003-655-9102,03915 Christian Dam \n ...
8,Ian Hawkins,"""Customer-focused dynamic groupwar...",janet77@jimenez.biz,755.221.3640x5396,6286 Garza Overpass \n ...
9,Tina Miller,"""Total regional website""",rogerstevens@yahoo.com,349.954.2132x390,5996 Graves Groves Apt. 382 \n...


In [138]:
#Split the two-line address into street (column 0) and city/state/zip (column 1).
#The pattern is anchored and tolerates optional whitespace around each line, so it gives the same
#result whether or not the address text has already been stripped; requiring a trailing whitespace
#character after the second line would cut off the ZIP code on stripped input.
df['address'].str.extract(r'^\s*(.+?)\s*\n\s*(.+?)\s*$')

Unnamed: 0,0,1
0,53227 Ricky Mills,"Robertside, IA 40909"
1,902 Galvan Streets Suite 694,"East Joshuamouth, SC 80241"
2,220 Virginia Common Suite 588,"Ethanside, IA 18549"
3,01918 Kathryn Flat Apt. 976,"Andrewport, ME 79276"
4,96736 Steven Crossroad,"Lake Brittanyborough, WI 58958"
5,080 Miller Cliffs,"Kristachester, IL 94664"
6,9051 Pace Radial,"South Jasonside, NJ 57030"
7,03915 Christian Dam,"Lake Antonioville, NC 34411"
8,6286 Garza Overpass,"Lake Anthonyside, ID 51956"
9,5996 Graves Groves Apt. 382,"New Debrabury, MO 88214"
