# Fetching data using web scraping

## Import Libraries

In [24]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Import Webpage

In [25]:
# Fetch the raw HTML of the page that the rest of the notebook scrapes.
PAGE_URL = 'http://localhost:8000/a.html'

# requests applies no timeout by default, so without one this cell would hang
# forever if the local server accepted the connection but never answered.
response = requests.get(PAGE_URL, timeout=10)
# Fail loudly on a 4xx/5xx reply so an error page is never parsed as data.
response.raise_for_status()

webpage = response.text
print(webpage)

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>
<body>
    <div>
        <h1>Students name</h1>
        <ul>
            <li class="Student name">John Doe</li>
            <li class="Student name">Jane Smith</li>
            <li class="Student name">Emily Johnson</li>
            <li class="Student name">Michael Brown</li>
            <li class="Student name">Sarah Davis</li>
        </ul>

        <h1>Student Details</h1>
        <ul >
            <li class="Student details">John Doe - Age: 20, Major: Computer Science</li>
            <li class="Student details">Jane Smith - Age: 22, Major: Mathematics</li>
            <li class="Student details">Emily Johnson - Age: 21, Major: Physics</li>
            <li class="Student details">Michael Brown - Age: 23, Major: Chemistry</li>
            <li class="Student details">Sarah Davis - Age: 19, Major: Biology</li>

## Parse the page into a BeautifulSoup object (`soup`)

In [26]:
# Parse the downloaded markup with the lxml parser, then print the
# re-indented version of the resulting tree.
soup = BeautifulSoup(webpage, features='lxml')
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Document
  </title>
 </head>
 <body>
  <div>
   <h1>
    Students name
   </h1>
   <ul>
    <li class="Student name">
     John Doe
    </li>
    <li class="Student name">
     Jane Smith
    </li>
    <li class="Student name">
     Emily Johnson
    </li>
    <li class="Student name">
     Michael Brown
    </li>
    <li class="Student name">
     Sarah Davis
    </li>
   </ul>
   <h1>
    Student Details
   </h1>
   <ul>
    <li class="Student details">
     John Doe - Age: 20, Major: Computer Science
    </li>
    <li class="Student details">
     Jane Smith - Age: 22, Major: Mathematics
    </li>
    <li class="Student details">
     Emily Johnson - Age: 21, Major: Physics
    </li>
    <li class="Student details">
     Michael Brown - Age: 23, Major: Chemistry
    </li>
    <li class="Student details">
     Sarah Davis - Age: 19,

## Extract the headings

In [48]:
# Print the text of every <h1> element, one per line, with surrounding
# whitespace removed.
h1_tags = soup.find_all('h1')
for h1 in h1_tags:
    heading_text = h1.get_text()
    print(heading_text.strip())

Students name
Student Details
Student Grades


## Extract the student names

In [29]:
# Print a label, then the stripped text of every <li> whose class attribute
# is "Student name".
print("Students name")
name_items = soup.find_all('li', class_='Student name')
for item in name_items:
    print(item.get_text().strip())

Students name
John Doe
Jane Smith
Emily Johnson
Michael Brown
Sarah Davis


## Extract the student details

In [32]:
# Print a label, then the stripped text of every <li> whose class attribute
# is "Student details".
print("Students Details")
detail_items = soup.find_all('li', class_='Student details')
for item in detail_items:
    print(item.get_text().strip())

Students Details
John Doe - Age: 20, Major: Computer Science
Jane Smith - Age: 22, Major: Mathematics
Emily Johnson - Age: 21, Major: Physics
Michael Brown - Age: 23, Major: Chemistry
Sarah Davis - Age: 19, Major: Biology


## Extract the student grades

In [33]:
# Print a label, then the stripped text of every <li> whose class attribute
# is "Student grades".
print("Students Grades")
grade_items = soup.find_all('li', class_='Student grades')
for item in grade_items:
    print(item.get_text().strip())

Students Grades
John Doe - Grade: A
Jane Smith - Grade: B+
Emily Johnson - Grade: A-
Michael Brown - Grade: B
Sarah Davis - Grade: A


## Store as a table (pandas DataFrame)

In [39]:
# Collect the text of each group of <li> elements. The text is stripped, as in
# the display cells earlier in the notebook, so indentation or newlines inside
# the tags never end up in the table.
names = [li.get_text().strip() for li in soup.find_all("li", class_='Student name')]
details = [li.get_text().strip() for li in soup.find_all("li", class_='Student details')]
grades = [li.get_text().strip() for li in soup.find_all("li", class_='Student grades')]

# The three lists are paired up by position, so they must have equal length.
# Check explicitly so the error reports how many items each list contained.
if not (len(names) == len(details) == len(grades)):
    raise ValueError(
        f"mismatched <li> counts: {len(names)} names, "
        f"{len(details)} details, {len(grades)} grades"
    )

# One row per student; the column labels are kept exactly as before.
df = pd.DataFrame({
    "Name": names,
    "details": details,
    "grades": grades
})
df

Unnamed: 0,Name,details,grades
0,John Doe,"John Doe - Age: 20, Major: Computer Science",John Doe - Grade: A
1,Jane Smith,"Jane Smith - Age: 22, Major: Mathematics",Jane Smith - Grade: B+
2,Emily Johnson,"Emily Johnson - Age: 21, Major: Physics",Emily Johnson - Grade: A-
3,Michael Brown,"Michael Brown - Age: 23, Major: Chemistry",Michael Brown - Grade: B
4,Sarah Davis,"Sarah Davis - Age: 19, Major: Biology",Sarah Davis - Grade: A
