## Scraping a website using Beautiful Soup

In [None]:
from bs4 import BeautifulSoup

In [None]:
import requests

In [None]:
url = "https://www.sitpune.edu.in"

In [None]:
response = requests.get(url)

In [None]:
print(response)

<Response [200]>


In [None]:
html_content = response.content

In [None]:
soup = BeautifulSoup(html_content, "html.parser")

## Extracting the title

In [None]:
soup.find("title")

<title>
         B Tech Engineering College in Pune - SIT Pune | Symbiosis      </title>

In [None]:
soup.find("title").string

'\r\n         B Tech Engineering College in Pune - SIT Pune | Symbiosis      '

## Extracting the h1 tags

In [None]:
h1_tags = soup.find_all("h1")

In [None]:
h1_tags

[<h1 class="c-font-uppercase c-center c-font-bold" style="font-size: 30px !important;padding-top: 15px !important;
     margin: 0 0 15px 0 !important;">SIT PUNE IN SPOTLIGHT</h1>]

In [None]:
for h1 in h1_tags:
    print(h1.text)

SIT PUNE IN SPOTLIGHT


## Extracting h2 tags

In [None]:
h2_tags = soup.find_all("h2")

In [None]:
h2_tags

[<h2 class="titleh2" style="margin-bottom: 3px;">Symbiosis - The world is one family.</h2>,
 <h2 class="titleh2" style="margin-top: 3px;">We welcome as one, learn as one  and thrive as one.</h2>,
 <h2 class="c-left c-font-dark c-font-uppercase c-font-bold" style="color:#0a0a0a;">Why SIT ?</h2>]

In [None]:
for h2 in h2_tags:
    print(h2.text)

Symbiosis - The world is one family.
We welcome as one, learn as one  and thrive as one.
Why SIT ?


## Extracting h3 tags

In [None]:
h3_tags = soup.find_all("h3")

In [None]:
for h3 in h3_tags:
    print(h3.text)

Dr. Ketan Kotecha’s Legacy Of Empowering Students At SIT Pune
Shape Your Engineering Career at SIT Pune – SITEEE 2025 Applications Closing
Exclusive interview of our Director & Dean Dr. Ketan Kotecha on fostering technovation at Symbiosis Institute of Technology Pune
NEWS & HAPPENING'S
Programmes Offered
 B.Tech 
 M.Tech 
 Ph.D 
Testimonials
Parents
Students
Alumni
SYMBIOSIS INSTITUTE OF TECHNOLOGY (SIT)
ABOUT INSTITUTE
INFORMATION FOR
IMPORTANT LINKS


## Extracting the < a > tags


In [None]:
a_tags = soup.find_all("a")

In [None]:
len(a_tags)

275

In [None]:
a_tags

[<a href="https://siu.edu.in/" style="border: 3px solid #ffffff;border-radius: 0px 7px 7px 0px;padding: 2px 10px;float: right;margin-bottom: 2.1rem;background: #c52f33;    color: #ffffff;" target="_blank">
 		            Visit SIU Website
 		             </a>,
 <a class="c-logo home-logo" href="https://www.sitpune.edu.in/">
 <img alt="SIT Pune MTech College in pune" class="c-desktop-logo" height="auto" src="https://www.sitpune.edu.in/index/assets/images/sit pune.png" width="500px"/>
 <img alt="SIT Pune MTech College in pune" class="c-mobile-logo" src="https://www.sitpune.edu.in/index/assets/images/sit pune.png" style="width:250px;"/>
 </a>,
 <a class="button btn_top hidden-xs blinking-text" href="/assets/pdf/Induction 2025 Program-Schedule_compressed.pdf" role="button" style="background-image: linear-gradient(to bottom, #769DCC, #002040);color: #ffffff;
     margin-right: 0;
     padding: 10px 24px;
     border-radius: 50px;
     " target="_blank">Induction Program 2025 </a>,
 <a class

## Extracting the links from the < a > tags

In [None]:
list_of_links = []
for a in a_tags:
    link = a.get("href")
    if link != None:
        list_of_links.append(link)

In [None]:
len(list_of_links)

272

In [None]:
list_of_links

['https://siu.edu.in/',
 'https://www.sitpune.edu.in/',
 '/assets/pdf/Induction 2025 Program-Schedule_compressed.pdf',
 'contactus',
 'https://www.sitpune.edu.in/assets/pdf/Mandatory Disclosure 2024-25_compressed.pdf',
 'https://www.sitpune.edu.in/dual-degree-programs',
 'javascript:void(0)',
 'about-us-the-institute',
 'siu-vision-mission',
 'chancellors-message',
 'institute-vision-mission',
 'dean-director-message',
 'javascript:void(0)',
 'deputy-director-academics',
 '',
 'institute-committees',
 'https://www.siu.edu.in/about-us/authorities-committees/internal-committees',
 'timesarticle',
 'gallery',
 'https://siu.edu.in/siu-departments.php',
 'https://siu.edu.in/symbiosis-committee.php',
 'https://www.siu.edu.in/about-us/why-us/student-wellbeing',
 'javascript:void(0)',
 'javascript:void(0)',
 'btech-AI',
 'btech-civil-engineering',
 'btech-computer-science-engineering',
 'btech-electronic-telecommunications',
 'btech-Information-Technology-Program-Structures',
 'btech-mechanica

## Making a dataframe of the links

In [None]:
import pandas as pd

In [None]:
df_links = pd.DataFrame(data=list_of_links, columns=["Link"])

In [None]:
df_links.shape

(272, 1)

In [None]:
df_links

Unnamed: 0,Link
0,https://siu.edu.in/
1,https://www.sitpune.edu.in/
2,/assets/pdf/Induction 2025 Program-Schedule_co...
3,contactus
4,https://www.sitpune.edu.in/assets/pdf/Mandator...
...,...
267,https://www.aicte-india.org/feedback/index.php
268,https://www.sitpune.edu.in/assets/pdf/AICTE-Ap...
269,https://www.sitpune.edu.in/assets/pdf/OER lett...
270,/


## Saving it as a .csv file

In [None]:
df_links.to_csv(r'sit_links.csv')

In [None]:
all_text = soup.text

## Extracting all the text from the website

In [None]:
all_text

'\n\n\n\n\r\n         B Tech Engineering College in Pune - SIT Pune | Symbiosis      \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\r\n\t\t            Visit SIU Website\r\n\t\t             \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nInduction Program 2025 \n\n\nContact Us \n\n\nMandatory Disclosure \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\r\n Subsequent merit for SITEEE , JEE (Main) and Any State Govt. Engg. Entrance Exam will depend on the number of seats available in the respective branch against cancellations. The tentative date is 15th July 2025\r\n\n\r\n  Admission Registration 2025\r\n\n\n\n\n\n\n\n\n\r\n                     DuaL Degree Programs at SIT\r\n                       \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAbout\n\n\nInstitute at a Glance\n\nUniversity Vision and Mission\nSymbiosis Leadership\nInstitute Vision and Mission\nThe Dean and Dire

In [None]:
soup.text

'\n\n\n\n\r\n         B Tech Engineering College in Pune - SIT Pune | Symbiosis      \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\r\n\t\t            Visit SIU Website\r\n\t\t             \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nInduction Program 2025 \n\n\nContact Us \n\n\nMandatory Disclosure \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\r\n Subsequent merit for SITEEE , JEE (Main) and Any State Govt. Engg. Entrance Exam will depend on the number of seats available in the respective branch against cancellations. The tentative date is 15th July 2025\r\n\n\r\n  Admission Registration 2025\r\n\n\n\n\n\n\n\n\n\r\n                     DuaL Degree Programs at SIT\r\n                       \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAbout\n\n\nInstitute at a Glance\n\nUniversity Vision and Mission\nSymbiosis Leadership\nInstitute Vision and Mission\nThe Dean and Dire

## Extracting the < img > tags

In [None]:
img_src = soup.find_all("img")

In [None]:
img_src

[<img alt="SIT Pune MTech College in pune" class="c-desktop-logo" height="auto" src="https://www.sitpune.edu.in/index/assets/images/sit pune.png" width="500px"/>,
 <img alt="SIT Pune MTech College in pune" class="c-mobile-logo" src="https://www.sitpune.edu.in/index/assets/images/sit pune.png" style="width:250px;"/>,
 <img alt="SIT_ASTON UNIVERSITY" height="auto" src="https://www.sitpune.edu.in/assets/images/JEE Score Website Banner.jpg" width="100%"/>,
 <img alt="Inauguration Ceremony of Symbiosis International University Dubai" height="auto" src="https://www.sitpune.edu.in/assets/images/banner/SIT Admissions Open - Website Banner (1).jpg" width="100%"/>,
 <img alt="SIT_ASTON UNIVERSITY" height="auto" src="https://www.sitpune.edu.in/assets/images/banner-2024/webinar 8thh.jpg" width="100%"/>,
 <img alt="SIT_ASTON UNIVERSITY" height="auto" src="https://www.sitpune.edu.in/assets/images/Ranking Banner_2024.jpg" width="100%"/>,
 <img alt="SIT Pune Semester Abroad Program
                   

In [None]:
len(img_src)

53

## Extracting all the li tags and the text nested within them

In [None]:
li_tags = soup.find_all('li')

In [None]:
li_tags

[<li>
 <a class="button btn_top hidden-xs blinking-text" href="/assets/pdf/Induction 2025 Program-Schedule_compressed.pdf" role="button" style="background-image: linear-gradient(to bottom, #769DCC, #002040);color: #ffffff;
     margin-right: 0;
     padding: 10px 24px;
     border-radius: 50px;
     " target="_blank">Induction Program 2025 </a>
 </li>,
 <li>
 <a class="button btn_top hidden-xs blinking-text" href="contactus" role="button" style="background-image: linear-gradient(to bottom, #769DCC, #002040);color: #ffffff;
     margin-right: 0;
     padding: 10px 24px;
     border-radius: 50px;
     " target="_blank">Contact Us </a>
 </li>,
 <li>
 <a class="button btn-flash2 btn_top hidden-xs blinking-text" href="https://www.sitpune.edu.in/assets/pdf/Mandatory Disclosure 2024-25_compressed.pdf" role="button" style="background-image: linear-gradient(to bottom, #769DCC, #002040);color: red; margin-right:0; padding:10px 24px; border-radius:50px;" target="_blank">Mandatory Disclosure </a>


In [None]:
len(li_tags)

186

In [None]:
for li in li_tags:
    print(li.string)

None
None
None
None
None
University Vision and Mission
Symbiosis Leadership
Institute Vision and Mission
The Dean and Director's Message
None
 Deputy Director Academics 
None
Institute Committees
None
Our Media Presence
Gallery
SIU Administrative Dept
SIU Committees
Ombudsman
None
None
Artificial Intelligence and Machine Learning
Civil Engineering
Computer Science and Engineering
Electronics and Telecommunication Engineering
None
Mechanical Engineering
Robotics and Automation
None
Artificial Intelligence and Machine Learning
Automotive Technology
None
Engineering Design
Robotics and Automation
Geoinformatics
Ph.D.
None
None
None
None
None
Ph.D. Admissions
Fee Related
Educational Loan
None
Research Expertise
Research Scholars
Research Publications
None
None
Research Centres
None
 Research Policy
Patents
None
None
Scholarship Details
None
2024-25
2023-24
2022-23
2021-22
2020-21
Student Handbook
None
Chief Mentor
Mentoring Policy
Mentor List
Students Club/Societies
None
Hostels
Hostel All

## Extracting all the images from the website

In [None]:
img_links = []

In [None]:
link = soup.select('img[src^="https://www.sitpune.edu.in/assets/images"]')

In [None]:
for img in link:
    img_links.append(img['src'])

In [None]:
directory = '/content/Scrap_images'

In [None]:
for index, img_link in enumerate(img_links):
    img_data = requests.get(img_link).content
    with open(f'{directory}/'+str(index+1)+'.jpg','wb+') as f:
        f.write(img_data)
