# Web Scraping Mini-Exercise

From web-scraping-demo.zgulde.net gather all the info associated with each person on the people page.

In [2]:
import numpy as np
import pandas as pd
from requests import get
from bs4 import BeautifulSoup

In [3]:
url = 'https://web-scraping-demo.zgulde.net/people'

In [5]:
response = get(url)

In [6]:
response.text

'<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="UTF-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <title>Example People Page</title>\n    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />\n    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" />\n</head>\n<body class="mx-auto max-w-screen-lg pb-32">\n    \n<h1 class="my-5 text-4xl text-center">People</h1>\n\n<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">\n    <p>\n        <i class="bi bi-exclamation-circle text-xl"></i>\n        All data on this page is strictly for demonstration purposes and fake.\n    </p>\n</div>\n\n<div id="people" class="grid grid-cols-2 gap-x-12 gap-y-16">\n    \n    <div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">\n    

In [8]:
soup = BeautifulSoup(response.content, 'html.parser')

In [9]:
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Example People Page</title>
<link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" rel="stylesheet"/>
</head>
<body class="mx-auto max-w-screen-lg pb-32">
<h1 class="my-5 text-4xl text-center">People</h1>
<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">
<p>
<i class="bi bi-exclamation-circle text-xl"></i>
        All data on this page is strictly for demonstration purposes and fake.
    </p>
</div>
<div class="grid grid-cols-2 gap-x-12 gap-y-16" id="people">
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Michelle Dixon

In [13]:
people = soup.find_all('div', class_ = 'person')

In [15]:
len(people)

10

In [16]:
person = people[0]

In [17]:
person

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Michelle Dixon</h2>
<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
            "Pre-emptive dedicated pricing structure"
        </p>
<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">kellyjones@yahoo.com</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">132.680.2766</p>
</div>
<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                5620 Crystal Summit <br/>
                North Zacharyfort, VA 18645
            </p>
</div>
</div>

In [19]:
person_name = person.find('h2').text
person_name

'Michelle Dixon'

In [21]:
person_quote = person.find('p').text
person_quote

'\n            "Pre-emptive dedicated pricing structure"\n        '

In [22]:
person_email = person.find(class_ = 'email').text
person_email

'kellyjones@yahoo.com'

In [24]:
person_phone = person.find(class_ = 'phone').text
person_phone

'132.680.2766'

In [26]:
person_address = person.find_all('p')[-1].text
person_address

'\n                5620 Crystal Summit \n                North Zacharyfort, VA 18645\n            '

In [35]:
#Now create a function to do this for all people on the page
# And add their info to a dictionary 
# Then turn the dict into a df
def get_people_data():
    url = 'https://web-scraping-demo.zgulde.net/people'
    response = get(url)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    people = soup.find_all('div', class_ = 'person')
    
    #Create empty list
    people_info = []
    
    for person in people:
        person_info = {
            'name': person.find('h2').text,
            'quote': person.find('p').text.strip(),
            'email': person.find(class_ = 'email').text,
            'phone': person.find(class_ = 'phone').text,
            'address': person.find_all('p')[-1].text.strip()
        }
        
        people_info.append(person_info)
        
    #Convert to dataframe
    people_info = pd.DataFrame(people_info)
    
    return people_info

In [36]:
people_info = get_people_data()

In [37]:
people_info

Unnamed: 0,name,quote,email,phone,address
0,Shelley Pierce,"""Face-to-face interactive core""",bgreen@hotmail.com,+1-744-255-8135x35943,47236 Pamela Expressway Apt. 843 \n ...
1,Christopher Page,"""Vision-oriented motivating utilization""",angiegutierrez@yahoo.com,9829991948,07152 Chelsea Forks Apt. 768 \n ...
2,Deborah Perez,"""Business-focused reciprocal array""",ashlee09@gmail.com,1932027806,3434 Benson Pass \n Howardville...
3,Frank Mills,"""Compatible logistical contingency""",davidmitchell@griffith.org,+1-079-469-4982x8363,300 White Canyon Apt. 096 \n No...
4,Bradley Lopez,"""Customizable even-keeled attitude""",haley51@moody.com,(705)659-8671,47007 Anthony Stravenue Apt. 154 \n ...
5,Mark Steele,"""Assimilated zero tolerance knowledge user""",fergusonwesley@hotmail.com,209.015.5577,741 April Groves \n Kristenberg...
6,Cassandra Smith,"""Multi-channeled dynamic superstructure""",stephaniethomas@hayes-johnson.biz,001-670-942-7926,737 Murray Centers \n Zacharyto...
7,Thomas Wise,"""Up-sized asymmetric initiative""",gregorygallegos@taylor-rogers.com,(884)012-1922x4003,0336 House Junction Apt. 818 \n ...
8,Dr. Shelby Hoover PhD,"""Secured well-modulated solution""",ajohnston@barry.com,001-897-501-3958x192,031 Sanchez Lake \n East Sharon...
9,Laurie Ball,"""Front-line modular framework""",rmiller@robinson-mitchell.net,578-703-1162,53927 Jason Heights Suite 605 \n ...
