In [9]:
import json
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

Get Each University's Link

In [10]:
# Scrape the host-university index page and collect the detail-page links of every region.
regions_name = ['America & Canada', 'Asia', 'Australia', 'Europe', 'UK & Ireland']
main_page = requests.get("https://iisma.kemdikbud.go.id/info/host-universities-list/", timeout=30)
main_page.raise_for_status()  # fail loudly instead of parsing an HTTP error page
soup = BeautifulSoup(main_page.text, features='html.parser')
regions = soup.find_all('div', class_='elementor-image-carousel swiper-wrapper')
links_by_region = {}
# Carousels are matched to region names by position; zip() stops at the shorter
# sequence, so an unexpected extra carousel cannot raise IndexError.
for region_name, region in zip(regions_name, regions):
    anchor = region.find('a')
    if anchor is None or not anchor.get('href'):
        # Carousel without a usable link: keep the region, with no universities.
        links_by_region[region_name] = []
        continue
    # The href of the first anchor is split on commas into the individual page links.
    links_by_region[region_name] = [
        part.strip() for part in anchor['href'].split(',') if part.strip()
    ]

In [11]:
# Show the scraped mapping: region name -> list of university detail-page URLs.
links_by_region

{'America & Canada': ['https://iisma.kemdikbud.go.id/info/31-boston-university-metropolitan-college/',
  'https://iisma.kemdikbud.go.id/info/56-arizona-state-university/',
  'https://iisma.kemdikbud.go.id/info/05-the-university-of-pennsylvania-college-of-liberal-and-professional-studies/',
  'https://iisma.kemdikbud.go.id/info/06-yale-university/',
  'https://iisma.kemdikbud.go.id/info/29-penn-state-university',
  'https://iisma.kemdikbud.go.id/info/30-university-of-california-davis/',
  'https://iisma.kemdikbud.go.id/info/35-uc-chile/',
  'https://iisma.kemdikbud.go.id/info/45-michigan-state-university/',
  'https://iisma.kemdikbud.go.id/info/64-university-of-colorado-boulder/',
  'https://iisma.kemdikbud.go.id/info/13-university-of-british-columbia/',
  'https://iisma.kemdikbud.go.id/info/42-university-of-waterloo/',
  'https://iisma.kemdikbud.go.id/info/46-western-university/',
  'https://iisma.kemdikbud.go.id/info/03-university-of-chicago/',
  'https://iisma.kemdikbud.go.id/info/21

Get Universities' Primary Information

In [12]:
column_name = ["Name", "Image", "Location", "Description", "Official Link", "IISMA Link"]

# Collect one row per university and build the DataFrame once at the end;
# concatenating inside the loop would copy the whole frame on every iteration.
university_rows = []

for region in links_by_region:
    for link in links_by_region[region]:
        details_page = requests.get(link, timeout=30)
        details_page.raise_for_status()  # do not parse an HTTP error page as a university
        soup = BeautifulSoup(details_page.text, 'html.parser')

        # Left side: the widget containers are addressed by their position on the page.
        left_info = soup.find_all("div", class_="elementor-widget-container")
        image = left_info[0].img['src']
        name = left_info[1].h4.text
        location = left_info[2].p.text
        official_link = left_info[4].text.strip()

        # Right side: the first tab panel holds the description text.
        right_info = soup.find_all("div", class_="elementor-tab-content elementor-clearfix")
        description = right_info[0].text

        # Values are listed in the same order as column_name.
        university_rows.append([name, image, location, description, official_link, link])

df = pd.DataFrame(university_rows, columns=column_name)


In [13]:
# Repeat every region name once per link so the labels line up row-for-row with df,
# which was built by walking the same mapping in the same order.
region_list = [
    region_name
    for region_name, region_links in links_by_region.items()
    for _ in region_links
]
df["Region"] = region_list

In [14]:
# Show the primary-information frame, now including the Region column.
df

Unnamed: 0,Name,Image,Location,Description,Official Link,IISMA Link,Region
0,Boston University Metropolitan College,https://i0.wp.com/iisma.kemdikbud.go.id/info/w...,"Boston, Massachusetts, USA",the MET International Undergraduate Program pr...,bu.edu/metinternational,https://iisma.kemdikbud.go.id/info/31-boston-u...,America & Canada
1,Arizona State University,https://i0.wp.com/iisma.kemdikbud.go.id/info/w...,"Tempe, Arizona, USA","Arizona State University (ASU), located in Tem...",www.asu.edu,https://iisma.kemdikbud.go.id/info/56-arizona-...,America & Canada
2,The University of Pennsylvania,https://i0.wp.com/iisma.kemdikbud.go.id/info/w...,United States of America,Penn is a private urban research university wi...,www.lps.upenn.edu/non-degree-programs/igsp,https://iisma.kemdikbud.go.id/info/05-the-univ...,America & Canada
3,Yale University,https://i0.wp.com/iisma.kemdikbud.go.id/info/w...,United States of America,"Since its founding in 1701, Yale has been dedi...",https://summer.yale.edu/,https://iisma.kemdikbud.go.id/info/06-yale-uni...,America & Canada
4,Penn State University,https://i0.wp.com/iisma.kemdikbud.go.id/info/w...,United States of America,"Penn State University, founded in 1855, is the...",global.psu.edu,https://iisma.kemdikbud.go.id/info/29-penn-sta...,America & Canada
...,...,...,...,...,...,...,...
108,University of Bath,https://i0.wp.com/iisma.kemdikbud.go.id/info/w...,United Kingdom,The University of Bath received its Royal Char...,www.ucl.ac.uk,https://iisma.kemdikbud.go.id/info/s93-univers...,UK & Ireland
109,University of Limerick,https://i0.wp.com/iisma.kemdikbud.go.id/info/w...,Ireland,"Established in 1972, the University of Limeric...",https://www.ul.ie/,https://iisma.kemdikbud.go.id/info/s96-univers...,UK & Ireland
110,University of Southampton,https://i0.wp.com/iisma.kemdikbud.go.id/info/w...,"Southampton, United Kingdom",University of Southampton (UoS) is a research ...,southampton.ac.uk,https://iisma.kemdikbud.go.id/info/s91-univers...,UK & Ireland
111,Queen's University Belfast,https://i0.wp.com/iisma.kemdikbud.go.id/info/w...,United Kingdom,Queen’s University has been making a differenc...,www.qub.ac.uk,https://iisma.kemdikbud.go.id/info/s94-queens-...,UK & Ireland


Set Up the Browser

In [15]:
# Start Chrome without a visible window. The '--headless' argument is sufficient on its
# own; the Options.headless property setter is deprecated in Selenium 4 and only
# repeated the same switch.
browser_options = webdriver.ChromeOptions()
browser_options.add_argument('--headless')
driver = webdriver.Chrome(options=browser_options)

Get Universities' Requirements

In [16]:
def get_number(text, toFloat: bool):
    """Return the first number found in ``text`` as a string.

    Args:
        text: String to search for an integer or decimal number.
        toFloat: When true, the match is passed through ``float`` before being
            turned back into a string (so ``"3"`` becomes ``"3.0"``).

    Returns:
        The matched number as a string, or ``"-"`` when ``text`` contains no number.
    """
    found = re.search(r'\b\d+(?:\.\d+)?\b', text)
    if found is None:
        return "-"

    digits = found.group()
    return str(float(digits)) if toFloat else str(digits)

In [17]:
# Scrape the requirements tab of every university page.
# Exactly one dict is appended per link, in link order, so the rows can later be
# matched by position with frames built from the same links.

# The requirements tab is looked up under these Elementor IDs, in this order.
REQUIREMENT_TAB_IDS = ("4502", "1812")
# (keywords to look for, output key, convert to float?) -- checked in order, first hit wins.
LANGUAGE_TESTS = (
    (("IELTS",), "IELTS", True),
    (("DET", "Duolingo"), "DET", False),
    (("TOEFL",), "TOEFL", False),
)

requirements_data = []

for region in links_by_region:
    for link in links_by_region[region]:
        data = {}
        driver.get(link)

        requirements = []
        for tab_id in REQUIREMENT_TAB_IDS:
            try:
                driver.find_element(By.ID, f"elementor-tab-title-{tab_id}").click()
                requirements = driver.find_element(By.ID, f"elementor-tab-content-{tab_id}").text.split("\n")
                break
            except NoSuchElementException:
                continue
        else:
            # Neither tab exists: keep an empty row (preserving one row per link)
            # instead of letting one page abort the whole crawl.
            print(f"No requirements tab found: {link}")

        for req in requirements:
            if "GPA" in req:
                data["GPA"] = get_number(req, True)
                continue
            if "English Language" in req:
                # Lines containing "English Language" are not parsed for scores.
                continue
            for keywords, column, as_float in LANGUAGE_TESTS:
                if any(keyword in req for keyword in keywords):
                    score = get_number(req, as_float)
                    # A test that is mentioned without any number is recorded as not accepted.
                    data[column] = "Not accepted" if score == "-" else score
                    break
        requirements_data.append(data)

In [18]:
# Label every requirements row with its university name and show that column first.
requirements_frame = pd.DataFrame(requirements_data)
requirements_frame["University"] = df["Name"]
other_columns = [column for column in requirements_frame.columns if column != "University"]
df_requirements = requirements_frame[["University"] + other_columns]
df_requirements

Unnamed: 0,University,GPA,IELTS,TOEFL,DET
0,Boston University Metropolitan College,3.0,6.0,84,115
1,Arizona State University,3.0,6.0,79,115
2,The University of Pennsylvania,3.3,7.0,100,120
3,Yale University,3.0,7.0,100,120
4,Penn State University,3.0,6.5,80,120
...,...,...,...,...,...
108,University of Bath,3.0,6.5,90,120
109,University of Limerick,3.0,6.0,78,100
110,University of Southampton,3.0,6.0,92,125
111,Queen's University Belfast,3.0,6.5,90,Not accepted


Get Universities' Intakes & New Universities

In [19]:
def get_range(text):
    """Return the trimmed text between the first and second ``:`` of ``text``.

    Args:
        text: A line such as ``"label: low - high"``.

    Returns:
        The stripped segment that follows the first colon (up to the next colon,
        if any), or ``None`` when ``text`` contains no colon at all.
    """
    _, colon, remainder = text.partition(":")
    if not colon:
        return None
    return remainder.split(":", 1)[0].strip()

In [22]:
intakes_data = []
new_uni_data = []

# for x in range(1):
for region in links_by_region:
    # for y in range(1):
    for link in links_by_region[region]:
        new_uni = {}
        data = {}
        driver.get(link)
        key1 = ""

        uni_name = driver.find_element(By.TAG_NAME,"h4").text
        try:
            tab_click = driver.find_element(By.ID, "elementor-tab-title-4504")
            tab_click.click()
            intakes_el = driver.find_element(By.ID,'elementor-tab-content-4504')
        except NoSuchElementException:
            try:
                tab_click = driver.find_element(By.ID, "elementor-tab-title-1814")
                tab_click.click()
                intakes_el = driver.find_element(By.ID,'elementor-tab-title-1814')
            except NoSuchElementException:
                new_uni["University"] = uni_name
                new_uni["Link"] = link
                new_uni_data.append(new_uni)
                continue

        intakes_list = intakes_el.find_elements(By.TAG_NAME, "p")
        for intakes in intakes_list:
            data = {}
            data["University"] = uni_name
            splitted_intakes = intakes.text.split("\n") 
            for intake in splitted_intakes:
                if len(intake) > 0:
                    if ":" not in intake: 
                        if "Applicants" in intake:
                            key = "Applicants"
                            try:
                                year = re.findall(r"\d{4}", intake)[0]
                            except:
                                year = re.findall(r"\d{4}", tab_click.text)[0] # Kalau gak ada tahunnya, ikut judul tab
                        elif "Awardees" in intake:
                            key = "Awardees"
                        
                        if "Cofunding" in intake or "Co-funding" in intake or "Co-Funding" in intake:
                            key1 = "Co-funding" 
                        elif "Regular" in intake:
                            key1 = "Regular"
                    else:
                        data["Year"] = year
                        if key1 == "":
                            if "Cofunding" in intake or "Co-funding" in intake or "Co-Funding" in intake:
                                key1 = "Co-funding" 
                            elif "Regular" in intake:
                                key1 = "Regular" 
                            else:
                                if "students" in intake or "Students" in intake:
                                    key1 = "Regular"
                        data["Intake"] = key1
                        data["Type"] = key
                        
                        if "student" in intake or "Applicants" in intake or "Awardees" in intake:
                            data["Students"] = get_number(intake, False)
                        else:
                            key2 = ""
                            if "GPA" in intake:
                                key2 = "GPA"
                            elif "IELTS" in intake:
                                key2 = "IELTS"
                            elif "DET" in intake or "Duolingo" in intake:
                                key2 = "DET"
                            elif "TOEFL" in intake:
                                key2 = "TOEFL"

                            ranges = get_range(intake)
                            value = "–"
                            if len(ranges) > 1:
                                if "-" in ranges:
                                    value = ranges.split("-")
                                elif "–" in ranges:
                                    value = ranges.split("–")

                            if len(key2) > 1:
                                try:
                                    data[f"{key2} Min"] = value[0].strip()
                                    data[f"{key2} Max"] = value[1].strip()
                                except IndexError:
                                    data[f"{key2} Min"] = value[0].strip()
                                    data[f"{key2} Max"] = value[0].strip()

                            if key2 == "DET":
                                intakes_data.append(data)
                                data = {}
                                data["University"]=uni_name
                                key1=""

            # print(link)
            # print(splitted_intakes)


In [None]:
# Merge the partial records of each (University, Year, Type, Intake) group into one row.
# Groups keep their first-seen order (sort=False); cells equal to 0 after summing are
# shown as an en dash.
intake_group_keys = ["University", "Year", "Type", "Intake"]
intakes_frame = pd.DataFrame(intakes_data)
intakes_summed = intakes_frame.groupby(intake_group_keys, sort=False).sum()
df_intakes = intakes_summed.replace(0, "–").reset_index()
display(df_intakes)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Students,GPA Min,GPA Max,TOEFL Min,TOEFL Max,IELTS Min,IELTS Max,DET Min,DET Max
University,Year,Type,Intake,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Boston University Metropolitan College,2022,Applicants,Regular,395,3.12,4.00,88,106,6.5,8.5,0,155
Boston University Metropolitan College,2022,Awardees,Regular,20,3.39,3.96,–,–,–,–,140,155
Boston University Metropolitan College,2023,Applicants,Regular,231,2.89,3.99,67,67,5.5,8.5,80,155
Boston University Metropolitan College,2023,Applicants,Co-funding,9,3.31,3.81,–,–,6.5,6.5,100,140
Boston University Metropolitan College,2023,Awardees,Regular,20,3.40,3.90,–,–,8.5,8.5,140,155
...,...,...,...,...,...,...,...,...,...,...,...,...
University College London,2023,Awardees,Regular,10,3.46,3.95,–,–,8,8.5,–,–
Queen Mary University of London,2022,Applicants,Regular,26,3.04,4.00,103,110,6,8,–,–
Queen Mary University of London,2022,Awardees,Regular,4,3.67,3.82,–,–,–,–,–,–
Queen Mary University of London,2023,Applicants,Regular,13,3.25,3.86,–,–,7,8,–,–


In [24]:
# Tabulate the entries gathered in new_uni_data (one "University"/"Link" record each).
df_new_uni = pd.DataFrame(data=new_uni_data)
display(df_new_uni)

Unnamed: 0,University,Link
0,Cornell University,https://iisma.kemdikbud.go.id/info/79-cornell-...
1,University of Michigan,https://iisma.kemdikbud.go.id/info/s80-univers...
2,New York University,https://iisma.kemdikbud.go.id/info/s83-new-yor...
3,Georgetown University,https://iisma.kemdikbud.go.id/info/s82-georget...
4,University of Toronto,https://iisma.kemdikbud.go.id/info/s100-univer...
5,Sophia University,https://iisma.kemdikbud.go.id/info/s98-sophia-...
6,Taipei Medical University,https://iisma.kemdikbud.go.id/info/s117-taipei...
7,National University of Singapore,https://iisma.kemdikbud.go.id/info/s112-nation...
8,National Cheng Kung University (NCKU),https://iisma.kemdikbud.go.id/info/s115-nation...
9,National Taiwan Normal University,https://iisma.kemdikbud.go.id/info/s88-nationa...


Get Universities' Courses

In [26]:
# Collect, for every university page, the text of each toggle item as its course list.
courses_data = []

for region_links in links_by_region.values():
    for link in region_links:
        driver.get(link)

        # The first <h4> on the page is used as the university name.
        uni_name = driver.find_element(By.TAG_NAME, "h4").text
        toggle_items = driver.find_elements(By.CLASS_NAME, "elementor-toggle-item")

        courses_data.append({
            "University": uni_name,
            "Courses": [item.text for item in toggle_items],
        })
        

KeyboardInterrupt: 

In [None]:
# One row per university; the "Courses" cell holds the list of toggle-item texts.
df_courses = pd.DataFrame(data=courses_data)
display(df_courses)

Unnamed: 0,University,Courses
0,Boston University Metropolitan College,"[Entrepreneurial Management, Project Managemen..."
1,Arizona State University,[Principles of Marketing for Global Organizati...
2,The University of Pennsylvania,"[Essentials of Molecular Biology and Genetics,..."
3,Yale University,"[World Cinema, Education and Empire, What is L..."
4,Penn State University,"[Introductory Sociology, Race, Ethnicity and C..."
...,...,...
108,University of Bath,"[Introduction to English linguistics, Shakespe..."
109,University of Limerick,"[PRINCIPLES OF HUMAN RESOURCE MANAGEMENT, PSYC..."
110,University of Southampton,"[Foundations of Business Analytics, Introducti..."
111,Queen's University Belfast,"[Behaviour in Organisations, Global Business E..."


Dump to JSON

In [None]:
# Output file names. The order matters: entries are paired by position with the
# serialised frames when the files are written.
file_path_list = [
    "iisma_intakes_data.json",
    "iisma_university_data.json",
    "iisma_requirements_data.json",
    "iisma_new_uni_data.json",
    "iisma_courses_data.json",
]

In [None]:
# Serialise every frame as a list of row records, in the same order as file_path_list.
frames_in_file_order = [df_intakes, df, df_requirements, df_new_uni, df_courses]
json_list = [
    json.dumps(frame.to_dict(orient='records'), indent=2)
    for frame in frames_in_file_order
]
# NOTE(review): json.dumps writes missing values (NaN) as the bare token NaN, which
# strict JSON parsers reject -- confirm whether consumers of these files tolerate it.

# Names and payloads are paired by position, so both lists must have the same length.
assert len(file_path_list) == len(json_list), "every JSON payload needs exactly one file name"

json_dir = Path("json")
json_dir.mkdir(parents=True, exist_ok=True)  # open() fails if the folder does not exist yet
for file_name, payload in zip(file_path_list, json_list):
    with open(json_dir / file_name, "w") as json_file:
        json_file.write(payload)

Export to Excel

In [None]:
# Write every frame to its own workbook under excel/.
excel_dir = Path("excel")
excel_dir.mkdir(parents=True, exist_ok=True)  # to_excel does not create a missing folder

excel_exports = {
    "Intakes.xlsx": df_intakes,
    "University.xlsx": df,
    "Requirements.xlsx": df_requirements,
    "New University.xlsx": df_new_uni,
    "Courses.xlsx": df_courses,
}
for workbook_name, frame in excel_exports.items():
    frame.to_excel(excel_dir / workbook_name)

In [None]:
# Shut down the Selenium WebDriver session created in the browser setup cell.
driver.quit()