## University Courses Scraper

In [1]:
# import libraries

import requests  # make a request to a url
from bs4 import BeautifulSoup  # parse the requests as html
import pandas as pd  # data manipulation
from time import sleep
import re

In [3]:
# Load the table of course page URLs to scrape. The scraping loop further
# down reads the URLs from its "course_links" column.
csv_file = "./data/bsc_course_links.csv"  # relative path; update if the file lives elsewhere
df = pd.read_csv(csv_file)

In [5]:
# Preview the first rows of the loaded links table as a sanity check.
df.head()

Unnamed: 0,course_links
0,https://www.educations.com/institutions/univer...
1,https://www.educations.com/institutions/queens...
2,https://www.educations.com/institutions/ueurop...
3,https://www.educations.com/institutions/swanse...
4,https://www.educations.com/institutions/euclea...


In [7]:
# (rows, columns) of the links table, i.e. how many course URLs were loaded.
df.shape

(10350, 1)

In [9]:
# Accumulator for scraped records: the scraping loop appends one dict per
# course URL, and the list is then converted into a DataFrame.
# NOTE(review): re-running the loop cell without re-running this cell first
# appends to the existing contents, so earlier rows are kept (and repeated
# if the same slice is scraped again).
scraped_data = []

In [11]:
# Function to scrape course details
def _tag_text(tag, default=""):
    """Return the whitespace-stripped text of a parsed tag, or ``default`` if the tag is missing."""
    return tag.text.strip() if tag is not None else default


def scrape_course(url, timeout=30):
    """Download one course page and extract its headline details.

    Parameters
    ----------
    url : str
        Address of the course page to fetch.
    timeout : float, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the whole
        scraping loop indefinitely. Defaults to 30.

    Returns
    -------
    dict
        On success, a dict with the keys ``"Image Source"``,
        ``"Institution Name"``, ``"Course Title"``, ``"University Location"``,
        ``"Course Duration"``, ``"Study Mode"``, ``"Campus Type"`` and
        ``"Course Infomation"``. A field whose element is not found on the
        page is ``""``, except the description, which falls back to
        ``"Unknown"``.
        If the request fails (connection error, timeout, HTTP error status),
        the error is printed and ``{"URL": url}`` is returned instead.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Treat 4xx/5xx responses as failures
        soup = BeautifulSoup(response.text, "html.parser")

        # NOTE: a multi-class ``class_`` string only matches an element whose
        # class attribute is exactly that string, so if the site changes its
        # markup these lookups return None and the field is left empty.

        # Extract image source
        img_tag = soup.find("img", class_="!z-1 maw-w-full absolute h-full w-screen object-cover sm:rounded-none md:rounded-t-lg lg:rounded-t-lg")
        if img_tag is not None and img_tag.has_attr("src"):
            img_src = img_tag["src"].strip()
        else:
            img_src = ""

        # Extract institution name
        institution_name = _tag_text(
            soup.find("a", class_="text-grey-800 block cursor-pointer text-xs font-semibold underline mb-2 md:text-base")
        )

        # Extract course title
        course_title = _tag_text(soup.find("h1", class_="title-xl2 text-base font-bold mb-4"))

        # Extract university location
        university_location = _tag_text(
            soup.find("p", class_="text-xs font-normal text-gray-700 mb-6 md:text-base")
        )

        # Extract course duration, study mode and campus type. These are read
        # by position from the "text-base" paragraphs: first = duration,
        # third = study mode, last = campus type. Each index is guarded so a
        # page with fewer paragraphs yields empty fields instead of raising
        # IndexError (which the except clause below would not catch).
        infos = soup.find_all("p", class_="text-base")
        course_duration = _tag_text(infos[0]) if len(infos) > 0 else ""
        study_mode = _tag_text(infos[2]) if len(infos) > 2 else ""
        # Require a second paragraph so a lone duration is not also reported
        # as the campus type.
        campus_type = _tag_text(infos[-1]) if len(infos) > 1 else ""

        # Extract description
        course_infomation = _tag_text(soup.find("div", class_="block text"), default="Unknown")

        return {"Image Source": img_src,
                "Institution Name": institution_name,
                "Course Title": course_title,
                "University Location": university_location,
                "Course Duration": course_duration,
                "Study Mode": study_mode,
                "Campus Type": campus_type,
                "Course Infomation": course_infomation}

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return {"URL": url}

In [17]:
# Preview the first ten rows: the same slice the scraping loop iterates over.
df[0:10]

Unnamed: 0,course_links
0,https://www.educations.com/institutions/univer...
1,https://www.educations.com/institutions/queens...
2,https://www.educations.com/institutions/ueurop...
3,https://www.educations.com/institutions/swanse...
4,https://www.educations.com/institutions/euclea...
5,https://www.educations.com/institutions/euclea...
6,https://www.educations.com/institutions/regent...
7,https://www.educations.com/institutions/univer...
8,https://www.educations.com/institutions/univ-s...
9,https://www.educations.com/institutions/libf-o...


In [21]:
# Scrape the first ten course pages, adding one record per URL to scraped_data
for course_url in df["course_links"].iloc[0:10]:
    print(f"Scraping: {course_url}")
    scraped_data.append(scrape_course(course_url))

    # Pause between requests so the site is not hit in rapid succession
    sleep(2)

# Build a DataFrame from everything collected so far
scraped_df = pd.DataFrame(scraped_data)



Scraping: https://www.educations.com/institutions/university-of-groningen/bsc-in-spatial-planning-and-design
Scraping: https://www.educations.com/institutions/queens-university-belfast-faculty-of-arts-humanities-and-social-sciences/bsc-in-international-business-with-french
Scraping: https://www.educations.com/institutions/ueurope/bsc-business-management-studies
Scraping: https://www.educations.com/institutions/swansea-university/bsc-psychology
Scraping: https://www.educations.com/institutions/euclea/bsc-fashion-technology
Scraping: https://www.educations.com/institutions/euclea/bsc-computing
Scraping: https://www.educations.com/institutions/regent-college-london/bsc-hons-business-management
Scraping: https://www.educations.com/institutions/university-of-bolton-academic-centre-%E2%80%93-ras-al-khaimah/bsc-hons-business-management
Scraping: https://www.educations.com/institutions/univ-szeged/agricultural-engineering-bsc
Scraping: https://www.educations.com/institutions/libf-online/bsc-ho

In [23]:
# Display the scraped records to eyeball the extracted fields.
scraped_df

Unnamed: 0,Image Source,Institution Name,Course Title,University Location,Course Duration,Study Mode,Campus Type,Course Infomation
0,https://keystoneacademic-res.cloudinary.com/im...,University of Groningen,BSc in Spatial Planning and Design,"Groningen, Netherlands",3 Years,Full time,On-Campus,If you have a special interest in the living e...
1,https://keystoneacademic-res.cloudinary.com/im...,"Queen's University Belfast - Faculty of Arts, ...",BSc in International Business with French,"Belfast, United Kingdom",4 Years,Full time,On-Campus,This degree seeks to develop graduates with a ...
2,https://keystoneacademic-res.cloudinary.com/im...,University of Europe for Applied Sciences,BSc Business & Management Studies,,6 Semesters,Full time,On-Campus,"As a manager of tomorrow, entrepreneurial thin..."
3,https://keystoneacademic-res.cloudinary.com/im...,Swansea University,BSc Psychology,"Swansea, United Kingdom",3 Years,Full time,On-Campus,Studying a BSc in Psychology will give you exp...
4,https://keystoneacademic-res.cloudinary.com/im...,Euclea Business School,BSc Fashion Technology,"Sharjah, United Arab Emirates",3 Years,Full time,On-Campus,"If you are passionate about fashion, then our ..."
5,https://keystoneacademic-res.cloudinary.com/im...,Euclea Business School,BSc Computing,"Sharjah, United Arab Emirates",3 Years,Full time,On-Campus,The Bachelor of Science (BSc) in Computing pro...
6,https://keystoneacademic-res.cloudinary.com/im...,Regent College London,BSc (Hons) Business Management,"London, United Kingdom",3 Years,Full time,On-Campus,The BSc (Hons) Business Management offers you ...
7,https://keystoneacademic-res.cloudinary.com/im...,"University of Bolton, Academic Centre – Ras Al...",BSc (Hons) Business Management,"Ras Al-Khaimah, United Arab Emirates",3 Years,Full time,On-Campus,BSc (Hons) Business Management qualifications ...
8,https://keystoneacademic-res.cloudinary.com/im...,University of Szeged,Agricultural Engineering (BSc),,7 Semesters,Full time,On-Campus,About the programThe BSc program in Agricultur...
9,https://keystoneacademic-res.cloudinary.com/im...,LIBF Online,BSc (Hons) Cyber Security,Online,36 up to 72 Months,Full time,Distance Learning,Master the art of securing digital landscapes ...


In [25]:
# Count missing (NaN/None) values per column.
# NOTE: fields the scraper could not find are stored as empty strings, which
# isnull() does not count, so a zero here does not mean every field was filled.
scraped_df.isnull().sum()

Image Source           0
Institution Name       0
Course Title           0
University Location    0
Course Duration        0
Study Mode             0
Campus Type            0
Course Infomation      0
dtype: int64

In [29]:
# Save to CSV
output_path = "./scraped_bsc_courses.csv"
scraped_df.to_csv(output_path, index=False)  # index=False: do not write the row index as a column
# Report the same path that was written so the message cannot drift from the file name.
print(f"Scraping completed. Data saved to {output_path}")

Scraping completed. Data saved to scraped_courses.csv
