## **Scraping for org name**

### Libraries

In [2]:
import requests
import pandas as pd
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options


### Scraping

In [3]:
# Address of the ranking page that is passed to send_req_get_dt below.
url = "https://www.topuniversities.com/world-university-rankings"


def send_req_get_dt(url, table_id, pages=-1):
    """Scrape rank / organisation-name pairs from a paginated ranking table.

    Opens ``url`` in headless Chrome, parses the element whose id is
    ``table_id`` on every page, and follows the "next" pagination link until
    it can no longer be located or clicked, or until ``pages`` pages were read.

    Args:
        url: Address of the page hosting the ranking table.
        table_id: DOM id of the element that wraps the table rows.
        pages: Maximum number of pages to read; ``-1`` (the default) keeps
            going until pagination ends.

    Returns:
        A ``pandas.DataFrame`` with the columns ``rank`` (with any ``=``
        removed) and ``org_name``, one row per scraped entry.
    """
    # Local import: the exception type is only needed inside this function.
    from selenium.common.exceptions import WebDriverException

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(options=chrome_options)
    rnk_org = {
        "rank": [],
        "org_name": [],
    }

    try:
        # The implicit wait applies to the whole session, so set it once
        # instead of on every loop iteration.
        driver.implicitly_wait(5)
        driver.get(url)
        curr_page = 1
        while True:
            table_div = driver.find_element(By.ID, table_id)
            table_html_el = table_div.get_attribute("innerHTML")
            soup = BeautifulSoup(table_html_el, "html.parser")

            ranks = [
                rank.text.strip().replace("=", "")
                for rank in soup.find_all("span", {"class": "rank-no"})
            ]
            org_names = [
                org.text.strip()
                for org in soup.find_all("a", {"class": "uni-link"})
            ]

            # Both columns must stay the same length, otherwise building the
            # DataFrame fails at the very end and the whole scrape is lost.
            if len(ranks) != len(org_names):
                print(
                    f"Page {curr_page}: found {len(ranks)} ranks but "
                    f"{len(org_names)} names; keeping only matched pairs"
                )
            for rank, org_name in zip(ranks, org_names):
                rnk_org["rank"].append(rank)
                rnk_org["org_name"].append(org_name)

            if pages != -1 and curr_page >= pages:
                break

            try:
                driver.find_element(By.CSS_SELECTOR, "a.page-link.next").click()
            except WebDriverException:
                # Missing or unclickable "next" link: treat as the last page.
                print("No more pages available")
                break

            # Give the page time to swap in the next batch of rows.
            time.sleep(2)
            curr_page += 1

        return pd.DataFrame(rnk_org)

    finally:
        # Always shut the browser down, even when scraping fails midway.
        driver.quit()
# Scrape the element with id "ranking-data-load"; pages keeps its default of -1 (no page limit).
df_rnk_org = send_req_get_dt(url, "ranking-data-load")

No more pages available


In [4]:
import re

def remove_parentheses(text):
    """Return ``text`` without any ``(...)`` group.

    Whitespace directly in front of a removed group is deleted with it.
    An opening parenthesis that is never closed is left untouched.
    """
    parenthesised_group = re.compile(r'\s*\([^)]*\)')
    return parenthesised_group.sub('', text)

# Using string operations
def remove_parentheses_alt(text):
    """Remove the first ``(...)`` group from ``text`` using plain string search.

    The closing parenthesis is searched for *after* the opening one, so a
    stray ``)`` earlier in the string can no longer cause text to be
    duplicated. When no complete group exists the input is returned unchanged;
    otherwise the result has surrounding whitespace stripped.
    """
    start = text.find('(')
    if start == -1:
        return text
    # Only a ')' that follows the '(' can close the group.
    end = text.find(')', start + 1)
    if end == -1:
        return text
    return (text[:start] + text[end + 1:]).strip()
# Clean the scraped names and write them to orgs.csv.
# Normalisation runs BEFORE de-duplication so that names differing only in
# case, quoting, whitespace or a parenthesised suffix collapse into one row.
# .copy() makes the filtered frame independent, so the column assignment
# below does not trigger SettingWithCopyWarning.
df_rnk_org = df_rnk_org[df_rnk_org["org_name"].notna()].copy()  # Remove NaN
df_rnk_org["org_name"] = (
    df_rnk_org["org_name"]
    .apply(remove_parentheses)
    .apply(remove_parentheses_alt)
    .str.strip()
    .str.strip("\"'")  # Removes both single and double quotes
    .str.strip()  # Whitespace that was hidden inside the quotes
    .str.lower()
)
# Filter empties only after quote stripping, so quote-only names go too.
df_rnk_org = df_rnk_org[df_rnk_org["org_name"] != ""]  # Remove empty strings
df_rnk_org = df_rnk_org.drop_duplicates(subset=["org_name"]).reset_index(drop=True)
df_rnk_org.to_csv("orgs.csv", index=False)


In [5]:
# Spot-check five randomly chosen rows of df_rnk_org.
df_rnk_org.sample(5)

Unnamed: 0,rank,org_name
1041,1001-1200,indiana university–purdue university indianapolis
743,741-750,university of central florida
396,396,university of southern queensland
834,801-850,university of baghdad
536,535,university of eastern finland
