### Import the required libraries

In [26]:
import requests
import csv
import json
import pandas as pd
from bs4 import BeautifulSoup

### Requesting the page and extracting the information

In [27]:

# Download the Wikipedia page that holds the enrollment table.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_largest_universities_and_university_networks_by_enrollment',
    timeout=30,
)
# Stop here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
soup = BeautifulSoup(response.content, 'html.parser')

In [28]:
# Grab the first table carrying the 'wikitable' CSS class.
table = soup.find('table', class_='wikitable')
# find() returns None when nothing matches; fail with a clear message here
# rather than with an AttributeError in a later cell.
if table is None:
    raise ValueError("No table with class 'wikitable' was found on the page")

In [29]:
# Every <tr> inside the table: index 0 is used as the header row by later cells.
trs = table.find_all(name='tr')

In [30]:
# Collect the stripped text of each <th> in the first row, then echo the
# labels one per line.
columns = [th.text.strip() for th in trs[0].find_all('th')]
for label in columns:
    print(label)


Rank
Institution
Location
Continent
Founded
Affiliation
Distance/In-Person[a]
Enrollment
Ref


In [31]:
# Header labels from the first row's <th> cells, with two of them renamed.
headers = [th.text.strip() for th in trs[0].find_all('th')]
headers[6] = "Delivery_Method"  # replaces the label at position 6
headers[-1] = "Link"            # replaces the last label

In [32]:
# Display the header list to confirm the final column names.
headers

['Rank',
 'Institution',
 'Location',
 'Continent',
 'Founded',
 'Affiliation',
 'Delivery_Method',
 'Enrollment',
 'Link']

In [33]:
# Everything after the first <tr> is treated as a data row.
rows = trs[1:]
# Preview the stripped cell text of the first data row.
x = [cell.text.strip() for cell in rows[0].find_all('td')]
x

['1',
 'Indira Gandhi National Open University',
 'New Delhi, India',
 'Asia',
 '1985',
 'Public',
 'Distance/In-Person',
 '7,140,000+',
 '[1][2][3]']

In [34]:
# Peek at the href of the second <a> inside the second <td> of the first data row.
rows[0].find_all('td')[1].find_all('a')[1]['href']

'/wiki/Indira_Gandhi_National_Open_University'

In [35]:
def extract_data(row):
    """Turn one table row into a list of cell strings ending in an article URL.

    Parameters
    ----------
    row
        A parsed ``<tr>`` element (anything exposing ``find_all``); only its
        ``<td>`` cells are read.

    Returns
    -------
    list of str
        The stripped text of every ``<td>`` in order, with the last entry
        replaced by an absolute ``https://en.wikipedia.org/...`` URL built
        from the second ``<a>`` of the second cell. The last entry is ``''``
        when that anchor (or its ``href``) is missing. A row without any
        ``<td>`` yields an empty list.
    """
    cells = row.find_all('td')
    row_item = [cell.text.strip() for cell in cells]

    # Count the <td> cells rather than len(row): the length of a tag is its
    # number of child nodes, which is not the number of cells.
    if len(cells) < 6:
        print("#############################################")
        print(row)

    # Irregular rows may lack a second cell or a second anchor; treat that as
    # "no link" instead of raising IndexError and aborting the whole scrape.
    anchors = cells[1].find_all('a') if len(cells) > 1 else []
    href = (anchors[1].get('href') or '') if len(anchors) > 1 else ''
    link = href.lstrip('/')

    if row_item:
        # The last column is overwritten with the link.
        row_item[-1] = "https://en.wikipedia.org/" + link if link else ''
    return row_item

In [36]:
# Spot-check the extractor on a single row (index 7).
extract_data(rows[7])

['8',
 'Bangladesh Open University',
 'Gazipur, Bangladesh',
 'Asia',
 '1992',
 'Public',
 'Distance',
 '650,000',
 'https://en.wikipedia.org/wiki/Bangladesh_Open_University']

In [37]:
# Run the extractor over every data row; the result is a list of row lists.
data = [extract_data(row) for row in rows]

In [38]:
# Number of rows that were extracted.
len(data)

90

### Saving the data to CSV and JSON

In [39]:
# Write the header row followed by all data rows.
# newline='' lets the csv module control line endings (otherwise blank lines
# can appear between rows on Windows); utf-8 keeps non-ASCII names writable
# regardless of the platform's default encoding.
with open('universities.csv', 'w', newline='', encoding='utf-8') as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(headers)
    csvwriter.writerows(data)

In [40]:
# Build a DataFrame from the scraped rows, using the header names as columns.
df = pd.DataFrame(data, columns=headers)
# lines=True with orient='records' writes one JSON object per line.
df.to_json(
    'universities.json',
    orient='records',
    lines=True,
)

In [41]:
# Preview the first 10 rows of the DataFrame.
df.head(10)

Unnamed: 0,Rank,Institution,Location,Continent,Founded,Affiliation,Delivery_Method,Enrollment,Link
0,1,Indira Gandhi National Open University,"New Delhi, India",Asia,1985,Public,Distance/In-Person,"7,140,000+",https://en.wikipedia.org/wiki/Indira_Gandhi_Na...
1,2,"National University, Bangladesh","Gazipur, Bangladesh",Asia,1992,Public,In-Person,3425832,https://en.wikipedia.org/wiki/National_Univers...
2,3,Anadolu University,"Eskişehir, Turkey",Asia,1958,Public,Distance/In-Person,1974343,https://en.wikipedia.org/wiki/Anadolu_University
3,4,California Community Colleges,"California, United States",North America,1967,Public,In-Person,1800000,https://en.wikipedia.org/wiki/California_Commu...
4,5,Islamic Azad University,Iran,Asia,1982,Private,In-Person,1778000,https://en.wikipedia.org/wiki/Islamic_Azad_Uni...
5,6,Allama Iqbal Open University,"Islamabad, Pakistan",Asia,1974,Public,Distance/In-Person,1027000,https://en.wikipedia.org/wiki/Allama_Iqbal_Ope...
6,7,"Laureate Education, Inc.",International,Global,1999,Private,Distance/In-Person,875000,https://en.wikipedia.org/wiki/Laureate_Educati...
7,8,Bangladesh Open University,"Gazipur, Bangladesh",Asia,1992,Public,Distance,650000,https://en.wikipedia.org/wiki/Bangladesh_Open_...
8,9,Universitas Terbuka,"Jakarta, Indonesia",Asia,1984,Public,Distance,646467,https://en.wikipedia.org/wiki/Universitas_Terbuka
9,10,National Technological Institute of Mexico,Mexico,North America,1948 and 2014,Public,In-Person,"620,000 (2019)",https://en.wikipedia.org/wiki/National_Technol...
