# World Universities Data Scraper

In [1]:
# import libraries

import requests  # make a request to a url
from bs4 import BeautifulSoup  # parse the requests as html
import pandas as pd  # data manipulation
from time import sleep
import re

In [70]:
course_single_link = []

In [71]:
def scrape_course_url(page_url: str) -> list:
    """scrape course url 
    
    function scrape all course details link on each page
    """
    
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, "html.parser")
    
    

    for i in range (0, 18):
        course_li = soup.find("li", id=f"program-{i}")
        if course_li:
            href_values = course_li.find("a").get("href")
        else:
            continue
            
        course_single_link.append(href_values)
        
    
    return course_single_link

In [72]:
BASE_URL = 'https://www.educations.com/bsc'

# Loop through the specified number of pages (adjust the range as needed)
for page_num in range(1, 576):  # Example: Looping through pages 1 to 2
    page_url = f"{BASE_URL}?page-{page_num}"  # Construct the URL for each page

    try:
        # Call the visit_site function to scrape data from the current page
        print(f"Collecting data from {page_url} ...")
        links = scrape_course_url(page_url)
        
        
        print(f"Done Collecting Data from {page_url}")
    except Exception as e:
        # Handle potential exceptions and continue with the next page
        print(f"Error collecting data from {page_url}: {e}")
    
print("Data collection complete!")


Collecting data from https://www.educations.com/bsc?page-1 ...
Done Collecting Data from https://www.educations.com/bsc?page-1
Collecting data from https://www.educations.com/bsc?page-2 ...
Done Collecting Data from https://www.educations.com/bsc?page-2
Collecting data from https://www.educations.com/bsc?page-3 ...
Done Collecting Data from https://www.educations.com/bsc?page-3
Collecting data from https://www.educations.com/bsc?page-4 ...
Done Collecting Data from https://www.educations.com/bsc?page-4
Collecting data from https://www.educations.com/bsc?page-5 ...
Done Collecting Data from https://www.educations.com/bsc?page-5
Collecting data from https://www.educations.com/bsc?page-6 ...
Done Collecting Data from https://www.educations.com/bsc?page-6
Collecting data from https://www.educations.com/bsc?page-7 ...
Done Collecting Data from https://www.educations.com/bsc?page-7
Collecting data from https://www.educations.com/bsc?page-8 ...
Done Collecting Data from https://www.educations

In [74]:
links

['https://www.educations.com/institutions/university-of-groningen/bsc-in-spatial-planning-and-design',
 'https://www.educations.com/institutions/queens-university-belfast-faculty-of-arts-humanities-and-social-sciences/bsc-in-international-business-with-french',
 'https://www.educations.com/institutions/ueurope/bsc-business-management-studies',
 'https://www.educations.com/institutions/swansea-university/bsc-psychology',
 'https://www.educations.com/institutions/euclea/bsc-fashion-technology',
 'https://www.educations.com/institutions/euclea/bsc-computing',
 'https://www.educations.com/institutions/regent-college-london/bsc-hons-business-management',
 'https://www.educations.com/institutions/university-of-bolton-academic-centre-%E2%80%93-ras-al-khaimah/bsc-hons-business-management',
 'https://www.educations.com/institutions/univ-szeged/agricultural-engineering-bsc',
 'https://www.educations.com/institutions/libf-online/bsc-hons-cyber-security',
 'https://www.educations.com/institutions/

In [75]:
df = pd.DataFrame(links, columns=["course_links"])  # Naming the column
df.head()

Unnamed: 0,course_links
0,https://www.educations.com/institutions/univer...
1,https://www.educations.com/institutions/queens...
2,https://www.educations.com/institutions/ueurop...
3,https://www.educations.com/institutions/swanse...
4,https://www.educations.com/institutions/euclea...


In [78]:
df.isnull().sum()

course_links    0
dtype: int64

In [79]:
df.duplicated().sum()

10332

In [80]:
df.shape

(10350, 1)

In [77]:
csv_file_path = './data/bsc_course_links.csv'
df.to_csv(csv_file_path, index=False)