In [17]:
from pathlib import Path

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.microsoft import EdgeChromiumDriverManager

In [18]:
# The two ranking views share the same page and differ only in the trailing
# "cols/..." fragment: one shows institution statistics, the other the scores.
url_stats = 'https://www.timeshighereducation.com/world-university-rankings/2022/world-ranking#!/page/0/length/-1/sort_by/rank/sort_order/asc/cols/stats'
url_score = 'https://www.timeshighereducation.com/world-university-rankings/2022/world-ranking#!/page/0/length/-1/sort_by/rank/sort_order/asc/cols/scores'

# install() returns the driver path that the Service is configured with;
# the same Service object is passed to every browser session opened below.
edge_driver_path = EdgeChromiumDriverManager().install()
service = Service(executable_path=edge_driver_path)

In [19]:
# Scrape the "stats" view of the ranking table.
# Maximum time (seconds) to wait for the JavaScript-rendered table to appear.
PAGE_LOAD_TIMEOUT_S = 30

web_driver_stats = webdriver.ChromiumEdge(service=service)
try:
    web_driver_stats.get(url_stats)

    # implicitly_wait() only affects element lookups, not page_source, so it
    # cannot guarantee the table exists yet. Block explicitly until at least
    # one of the cells scraped below is present in the DOM.
    # NOTE(review): this assumes the table rows are inserted in one pass once
    # the first cell appears - confirm against the live page.
    WebDriverWait(web_driver_stats, PAGE_LOAD_TIMEOUT_S).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'td.stats_number_students'))
    )

    # Grab the rendered HTML of the page
    stats_html = web_driver_stats.page_source
finally:
    # quit() ends the whole WebDriver session (close() only closes the current
    # window); running it in `finally` releases the browser even on errors.
    web_driver_stats.quit()

# Parse the HTML
stats_soup = bs(stats_html, 'html.parser')

# Collect the table cells of interest (one entry per institution row)
uni_stats_rank = stats_soup.find_all('td', {'class': 'rank sorting_1 sorting_2'})
uni_name = stats_soup.find_all(name=["a", "div"], attrs={"class": "ranking-institution-title"})
uni_location = stats_soup.find_all("div", {"class": "location"})
uni_stats_number_students = stats_soup.find_all("td", {"class": "stats stats_number_students"})
uni_stats_students_per_staff = stats_soup.find_all("td", {"class": "stats stats_student_staff_ratio"})
uni_stats_international_students = stats_soup.find_all("td", {"class": "stats stats_pc_intl_students"})
uni_stats_female_male_ratio = stats_soup.find_all("td", {"class": "stats stats_female_male_ratio"})

In [20]:
# Scrape the "scores" view of the ranking table.
# Maximum time (seconds) to wait for the JavaScript-rendered table to appear.
PAGE_LOAD_TIMEOUT_S = 30

web_driver_score = webdriver.ChromiumEdge(service=service)
try:
    web_driver_score.get(url_score)

    # implicitly_wait() only affects element lookups, not page_source, so it
    # cannot guarantee the table exists yet. Block explicitly until at least
    # one of the cells scraped below is present in the DOM.
    # NOTE(review): this assumes the table rows are inserted in one pass once
    # the first cell appears - confirm against the live page.
    WebDriverWait(web_driver_score, PAGE_LOAD_TIMEOUT_S).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'td.overall-score'))
    )

    # Grab the rendered HTML of the page
    score_html = web_driver_score.page_source
finally:
    # quit() ends the whole WebDriver session (close() only closes the current
    # window); running it in `finally` releases the browser even on errors.
    web_driver_score.quit()

# Parse the HTML
score_soup = bs(score_html, 'html.parser')

# Collect the table cells of interest.
# Both views are sorted by rank, so rows line up with the stats view and only
# the score columns (third column onwards) need to be read here.
uni_overall_score = score_soup.find_all("td", {"class": "scores overall-score"})
uni_teaching_score = score_soup.find_all("td", {"class": "scores teaching-score"})
uni_research_score = score_soup.find_all("td", {"class": "scores research-score"})
uni_citations_score = score_soup.find_all("td", {"class": "scores citations-score"})
uni_industry_income_score = score_soup.find_all("td", {"class": "scores industry_income-score"})
uni_international_outlook_score = score_soup.find_all("td", {"class": "scores international_outlook-score"})

In [21]:
# Output column name -> scraped cells, in the column order of the final table.
# NOTE: 'famale_male_ratio' is misspelled but kept as-is, because it is the
# column name written to the exported files.
scraped_cells = {
    'rank': uni_stats_rank,
    'name': uni_name,
    'country': uni_location,
    'number_of_students': uni_stats_number_students,
    'student_per_staff': uni_stats_students_per_staff,
    'international_students': uni_stats_international_students,
    'famale_male_ratio': uni_stats_female_male_ratio,
    'overall_score': uni_overall_score,
    'teaching_score': uni_teaching_score,
    'research_score': uni_research_score,
    'citations_score': uni_citations_score,
    'industry_income_score': uni_industry_income_score,
    'international_outlook_score': uni_international_outlook_score,
}

# The institution names define the number of rows. A column with fewer cells
# means a page was scraped before it finished rendering; fail with a readable
# message instead of an IndexError from the middle of a loop.
n_rows = len(uni_name)
too_short = {
    column: len(cells)
    for column, cells in scraped_cells.items()
    if len(cells) < n_rows
}
if too_short:
    raise ValueError(
        f'Expected {n_rows} scraped cells per column, but some columns have fewer: {too_short}'
    )

# Only the first n_rows cells of every column are used, one per institution.
df = pd.DataFrame({
    column: [cell.text for cell in cells[:n_rows]]
    for column, cells in scraped_cells.items()
})

# Keep only the leading number (the female share) of the ratio text.
# Slicing a fixed two characters would cut "100" to "10" and leave a stray
# separator after a one-digit share. Text with no leading digits (such as
# "n/a") is kept, stripped, for the cleaning step to handle.
ratio_text = df['famale_male_ratio']
df['famale_male_ratio'] = (
    ratio_text.str.extract(r'^\s*(\d+)', expand=False)
    .fillna(ratio_text.str.strip())
)
df

Unnamed: 0,rank,name,country,number_of_students,student_per_staff,international_students,famale_male_ratio,overall_score,teaching_score,research_score,citations_score,industry_income_score,international_outlook_score
0,1,University of Oxford,United Kingdom,20835,10.7,42%,47,95.7,91.0,99.6,98.0,74.4,96.3
1,=2,California Institute of Technology,United States,2233,6.3,34%,36,95.0,93.6,96.9,97.8,90.4,83.8
2,=2,Harvard University,United States,21574,9.5,24%,50,95.0,94.5,98.9,99.2,48.9,79.8
3,4,Stanford University,United States,16319,7.3,23%,46,94.9,92.3,96.8,99.9,91.0,79.7
4,=5,University of Cambridge,United Kingdom,19680,11.1,39%,47,94.6,90.9,99.5,96.2,56.7,95.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2107,Reporter,Yaşar University,Turkey,6847,13.0,2%,53,,,,,,
2108,Reporter,Yenepoya University,India,3104,6.1,0%,67,,,,,,
2109,Reporter,Yogyakarta State University,Indonesia,24988,20.3,1%,72,,,,,,
2110,Reporter,York St John University,United Kingdom,6030,18.0,8%,66,,,,,,


In [22]:
# Clean the scraped table. Everything is done in one chain that returns a new
# frame, so nothing is assigned into a filtered slice of the original
# (which would risk SettingWithCopyWarning).
df = (
    # drop rows whose rank is 'Reporter'
    df.loc[df['rank'] != 'Reporter']
    .assign(
        # strip the '%' sign from the international students share
        international_students=lambda d: d['international_students'].str.replace('%', '', regex=False),
        # strip the tie marker '=', then keep only the lower bound of a rank
        # band ("A–B" -> "A") and drop a trailing '+'
        rank=lambda d: (
            d['rank']
            .str.replace('=', '', regex=False)
            .str.replace(r'\–\d*|\+', '', regex=True)
        ),
        # for a score band "A–B" keep only what follows the en dash
        overall_score=lambda d: d['overall_score'].str.replace(r'.*\–', '', regex=True),
        # remove thousands separators from the student count
        number_of_students=lambda d: d['number_of_students'].str.replace(',', '', regex=False),
    )
    # Turn "n/a" placeholders into NaN in every column. The pattern is
    # anchored so that only cells consisting solely of the placeholder are
    # blanked, not any text that merely contains "n/". The 'a' is optional
    # because a truncated "n/" can also occur.
    .replace(r'^\s*n/a?\s*$', np.nan, regex=True)
)
df

Unnamed: 0,rank,name,country,number_of_students,student_per_staff,international_students,famale_male_ratio,overall_score,teaching_score,research_score,citations_score,industry_income_score,international_outlook_score
0,1,University of Oxford,United Kingdom,20835,10.7,42,47,95.7,91.0,99.6,98.0,74.4,96.3
1,2,California Institute of Technology,United States,2233,6.3,34,36,95.0,93.6,96.9,97.8,90.4,83.8
2,2,Harvard University,United States,21574,9.5,24,50,95.0,94.5,98.9,99.2,48.9,79.8
3,4,Stanford University,United States,16319,7.3,23,46,94.9,92.3,96.8,99.9,91.0,79.7
4,5,University of Cambridge,United Kingdom,19680,11.1,39,47,94.6,90.9,99.5,96.2,56.7,95.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1657,1201,Yokohama National University,Japan,9805,12.6,8,29,22.3,22.6,17.6,13.7,52.9,30.3
1658,1201,Yuan Ze University,Taiwan,8017,19.9,7,42,22.3,20.1,13.7,19.9,47.2,33.1
1659,1201,Yuri Gagarin State Technical University of Sar...,Russian Federation,11127,18.8,5,28,22.3,15.2,9.5,6.1,35.0,19.0
1660,1201,Yuriy Fedkovych Chernivtsi National University,Ukraine,11981,10.6,1,56,22.3,17.7,8.6,4.4,34.8,23.0


In [23]:
# Export the cleaned table as CSV, JSON and Excel.
# pandas does not create missing parent directories, so make sure the output
# folder exists first (otherwise a fresh checkout fails with an OSError).
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)

# csv
df.to_csv(output_dir / 'world_university_ranking_2022.csv', index=False)
# json
df.to_json(output_dir / 'world_university_ranking_2022.json', orient='records')
# excel
df.to_excel(output_dir / 'world_university_ranking_2022.xlsx', index=False)

In [24]:
# Show the ranked institutions whose country is Vietnam
df.loc[df['country'].eq('Vietnam')]

Unnamed: 0,rank,name,country,number_of_students,student_per_staff,international_students,famale_male_ratio,overall_score,teaching_score,research_score,citations_score,industry_income_score,international_outlook_score
421,401,Duy Tan University,Vietnam,21519,23.6,0,64,44.0,13.1,10.8,100.0,35.9,49.4
484,401,Ton Duc Thang University,Vietnam,26833,22.3,1,57,44.0,14.5,13.4,99.3,38.4,62.6
1186,1001,"Vietnam National University, Hanoi",Vietnam,41803,14.3,1,64,27.1,19.4,10.5,41.5,36.6,45.6
1360,1201,Hanoi University of Science and Technology,Vietnam,32222,27.9,1,24,22.3,14.5,9.8,20.4,43.3,42.4
1636,1201,Vietnam National University (Ho Chi Minh City),Vietnam,75358,22.7,1,33,22.3,15.7,10.9,23.9,40.8,39.5
