In [1]:
import os
import warnings
from tqdm import tqdm
import random
import requests
from pprint import pprint
import json
import ast
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

# Register tqdm's progress_* helpers (e.g. ``progress_apply``) on pandas objects.
tqdm.pandas()
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from pandas/requests.
warnings.filterwarnings("ignore")

In [2]:
# Directory the scraped CSV files are written to.
output_path = "./courses/"
# Root URL of the portal; endpoint paths are appended to it by the grab_* functions.
base_url = "https://ams.ashoka.edu.in"
# Semester label -> ScheduleSysGenId sent to the server.
# Insertion order (newest first) is the order in which semesters are scraped.
semesters = {
    "Monsoon 2024": "SCH00000031",
    "Summer 2024": "SCH00000030",
    "Spring 2024": "SCH00000029",
    "Monsoon 2023": "SCH00000028",
    "Summer 2023": "SCH00000027",
    "Spring 2023": "SCH00000026",
    "Monsoon 2022": "SCH00000025",
    "Summer 2022": "SCH00000024",
    "Spring 2022": "SCH00000023",
    "Monsoon 2021": "SCH00000022",
    "Summer 2021": "SCH00000021",
    "Spring 2021": "SCH00000020",
    "Monsoon 2020": "SCH00000019",
    "Summer 2020": "SCH00000018",
    "Spring 2020": "SCH00000017",
    "Monsoon 2019": "SCH00000016",
    "Summer 2019": "SCH00000015",
    "Spring 2019": "SCH00000014",
    "Monsoon 2018": "SCH00000013",
    "Summer 2018": "SCH00000012",
    "Spring 2018": "SCH00000011",
    "Monsoon 2017": "SCH00000010",
    "Summer 2017": "SCH00000009",
    "Spring 2017": "SCH00000008",
    "Monsoon 2016": "SCH00000007",
    "Summer 2016": "SCH00000006",
    "Spring 2016": "SCH00000005",
    "Monsoon 2015": "SCH00000004",
    "Summer 2015": "SCH00000003",
    "Spring 2015": "SCH00000002",
    "Monsoon 2014": "SCH00000001",
}

# Value of the ASP.NET_SessionId cookie sent with every request.
# Set the AMS_SESSION_ID environment variable to supply a fresh session without
# editing (and committing) the notebook; the literal is only a fallback.
session_id = os.environ.get("AMS_SESSION_ID", "55i3wqxvjtlibnkq13rrudzv")

In [3]:
# Pool of browser User-Agent strings; each request picks one entry's
# "User-agent" value for its own "User-Agent" header.
possible_headers = [
    {"User-agent": user_agent}
    for user_agent in (
        # Safari 13 on macOS
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/13.1.1 Safari/605.1.15",
        # Firefox 77 on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
        # Chrome 83 on macOS
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        # Firefox 77 on macOS
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        # Chrome 83 on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    )
]

In [4]:
def grab_sem_courses(session, sem, timeout=30):
    """Download the raw course-list response for one semester.

    Args:
        session: Object with a requests-style
            ``post(url, data=..., headers=..., timeout=...)`` method.
        sem: Semester label; must be a key of the module-level ``semesters``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook.

    Returns:
        The response body text on HTTP 200, the integer status code on any
        other status, or ``None`` when the request itself failed. Callers must
        check for a ``str`` before parsing.

    Raises:
        KeyError: If ``sem`` is not a key of ``semesters``.
    """
    url = base_url + "/Contents/Reports/CourseWiseRegisteredStudentReport_Student.aspx/BindCourse"

    # HTTP headers to make the server accept the request
    headers = {
        # random.choice follows the pool size instead of assuming exactly five entries.
        "User-Agent": random.choice(possible_headers)["User-agent"],
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "en-GB,en-US;q=0.7,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Content-Type": "application/json",
        "Origin": base_url,
        "DNT": "1",
        "Connection": "keep-alive",
        "Referer": base_url + "/Contents/Reports/CourseWiseRegisteredStudentReport_Student.aspx",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "Sec-GPC": "1",
        "Cookie": "ASP.NET_SessionId=" + session_id + "; __AJAXAntiXsrfToken=n3We4vJupHXY87Eo_o8Jh5dNfU7FRpQ8gh4grRdJILiWG2qIAeZGoJp-dASQ5z9rGWZ-iyk3-CeUAOCH2QmuJgFUKGmUJkM8Zq5sKITXV0E1",
    }
    # JSON envelope whose "xml" field carries the semester's ScheduleSysGenId.
    post_data = '{"xml":"<tbl><tr><ScheduleSysGenId>' + semesters[sem] + '</ScheduleSysGenId></tr></tbl>"}'

    try:
        response = session.post(url, data=post_data, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Network-level failure (DNS, connection reset, timeout, ...).
        print(f"Failed to download course list from: {url}, error: {e}")
        return None

    if response.status_code == 200:
        return response.text

    # Return the status code so the caller can tell the request was rejected.
    print(f"Failed to download course list from: {url}, status code: {response.status_code}")
    return response.status_code

In [5]:
def parse_sem_courses(courses):
    """Turn a raw course-list response into a list of course dicts.

    The response is a JSON object whose ``"d"`` field holds a second,
    string-encoded JSON array. Both layers are decoded with ``json`` so that
    ``null``/``true``/``false`` and escaped line breaks are handled correctly.

    Each course's ``"CourseCode"`` value (``"[CODE] Name"``) is split into
    ``"course_code"`` (bracketed part, including the closing ``]``) and
    ``"course_name"`` (the stripped remainder); ``"CourseCode"`` is removed.
    The split happens at the first ``]`` only, so a name that itself contains
    ``]`` is kept whole. If there is no ``]``, the whole value becomes the name
    and the code is an empty string.

    Args:
        courses: Response text as returned by ``grab_sem_courses`` on success.

    Returns:
        The list of course dicts, modified in place as described above.

    Raises:
        ValueError: If ``courses`` is not a string (e.g. a status code or
            ``None`` from a failed download) or is not valid JSON.
    """
    if not isinstance(courses, str):
        raise ValueError(f"Expected course-list response text, got {courses!r}")

    course_list = json.loads(json.loads(courses)["d"])
    for course in course_list:
        code, bracket, name = course.pop("CourseCode").partition("]")
        if bracket:
            course["course_name"] = name.strip()
            course["course_code"] = code + "]"
        else:
            # No bracketed code present: keep the text as the name.
            course["course_name"] = code.strip()
            course["course_code"] = ""

    return course_list

In [6]:
def scrape_all_courses():
    """Download and parse the course list of every entry in ``semesters``.

    One HTTP session is shared by all requests so the connection can be
    reused. A semester whose download fails (``grab_sem_courses`` returns a
    status code or ``None`` instead of text) is reported and left out of the
    result rather than crashing inside the parser.

    Returns:
        dict mapping semester label -> list of course dicts as produced by
        ``parse_sem_courses``, in the iteration order of ``semesters``.
    """
    all_courses = {}

    with requests.Session() as session:
        for semester in semesters:
            raw = grab_sem_courses(session, semester)
            if not isinstance(raw, str):
                print(f"Skipping {semester}: course list could not be downloaded")
                continue
            all_courses[semester] = parse_sem_courses(raw)

    return all_courses

In [7]:
def grab_info(session, sem_id, course_id, counter, timeout=30):
    """Download the raw student-list response for one course section.

    Args:
        session: Object with a requests-style
            ``post(url, data=..., headers=..., timeout=...)`` method.
        sem_id: Value sent as ``ScheduleSysGenId``.
        course_id: Value sent as ``CourseSysGenId``.
        counter: Value sent as ``LSNo`` (callers count it up from 1).
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the scrape.

    Returns:
        The response body text on HTTP 200, the integer status code on any
        other status, or ``None`` when the request itself failed. Callers must
        check for a ``str`` before parsing or searching the result.
    """
    url = base_url + "/Contents/Reports/CourseWiseRegisteredStudentReport_Student.aspx/GetListData"

    # HTTP headers to make the server accept the request
    headers = {
        # random.choice follows the pool size instead of assuming exactly five entries.
        "User-Agent": random.choice(possible_headers)["User-agent"],
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "en-GB,en-US;q=0.7,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Content-Type": "application/json",
        "Origin": base_url,
        "DNT": "1",
        "Connection": "keep-alive",
        "Referer": base_url + "/Contents/Reports/CourseWiseRegisteredStudentReport_Student.aspx",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "Sec-GPC": "1",
        "Cookie": "ASP.NET_SessionId=" + session_id + "; __AJAXAntiXsrfToken=n3We4vJupHXY87Eo_o8Jh5dNfU7FRpQ8gh4grRdJILiWG2qIAeZGoJp-dASQ5z9rGWZ-iyk3-CeUAOCH2QmuJgFUKGmUJkM8Zq5sKITXV0E1",
    }
    # JSON envelope whose "xml" field carries the query; DSNo and
    # RegistrationStatus are sent empty.
    post_data = (
        '{"xml":"<tbl><tr>'
        "<CourseSysGenId>" + str(course_id) + "</CourseSysGenId>"
        "<ScheduleSysGenId>" + str(sem_id) + "</ScheduleSysGenId>"
        "<LSNo>" + str(counter) + "</LSNo>"
        "<DSNo></DSNo><RegistrationStatus></RegistrationStatus>"
        '</tr></tbl>"}'
    )

    try:
        response = session.post(url, data=post_data, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Network-level failure (DNS, connection reset, timeout, ...).
        print(f"Failed to download course info from: {url}, error: {e}")
        return None

    if response.status_code == 200:
        return response.text

    # Return the status code so the caller can tell the request was rejected.
    print(f"Failed to download course info from: {url}, status code: {response.status_code}")
    return response.status_code

In [8]:
def parse_course_info(info):
    """Turn a raw student-list response into a list of student dicts.

    The response is a JSON object whose ``"d"`` field holds a second,
    string-encoded JSON object; its ``"Table"`` entry is the list of rows.
    Decoding with ``json`` (instead of patching the text and evaluating it as
    a Python literal) means a name or e-mail that merely contains the letters
    "null" no longer corrupts the payload.

    For compatibility with previously exported data, JSON ``null`` field
    values are still returned as the string ``"null"``.

    Rows whose ``"AshokaEmailId"`` contains ``"student"`` are dropped.
    NOTE(review): presumably these are placeholder/test accounts - confirm.

    Args:
        info: Response text as returned by ``grab_info`` on success.

    Returns:
        List of row dicts that passed the e-mail filter.

    Raises:
        ValueError: If ``info`` is not a string (e.g. a status code or
            ``None`` from a failed download) or is not valid JSON.
        KeyError: If the payload has no ``"d"``/``"Table"`` entry or a row has
            no ``"AshokaEmailId"``.
    """
    if not isinstance(info, str):
        raise ValueError(f"Expected course-info response text, got {info!r}")

    table = json.loads(json.loads(info)["d"])["Table"]

    student_list = []
    for raw_row in table:
        # Keep the historical "null" placeholder for missing values.
        row = {key: ("null" if value is None else value) for key, value in raw_row.items()}
        if "student" not in row["AshokaEmailId"]:
            student_list.append(row)

    return student_list

In [41]:
def scrape_all_course_info(df):
    """Download the student lists of every course row in ``df``.

    For each row, section counter 1 is always fetched and stored (even when it
    has no students). Counters 2, 3, ... are fetched in turn and stored for as
    long as the raw response mentions a "PreRegistered" or "Confirmed" status;
    the first response without one ends the course.

    A failed download (``grab_info`` returning a status code or ``None``) no
    longer raises: a failed counter 1 skips the course with a message, a
    failure on a later counter simply ends that course.

    Args:
        df: Frame-like object supporting ``len()`` and ``iterrows()``, whose
            rows provide ``"schedule_id"`` and ``"course_id"``.

    Returns:
        dict mapping ``(schedule_id, course_id, counter)`` -> list of student
        dicts as produced by ``parse_course_info``.
    """

    def _lists_students(raw):
        # Only text responses can be searched; ints/None mark a failed request.
        return isinstance(raw, str) and ("PreRegistered" in raw or "Confirmed" in raw)

    all_students = {}

    with requests.Session() as session:
        # total= lets tqdm show a real progress bar instead of a bare counter.
        for _, course in tqdm(df.iterrows(), total=len(df)):
            schedule_id = course["schedule_id"]
            course_id = course["course_id"]

            raw = grab_info(session, schedule_id, course_id, 1)
            if not isinstance(raw, str):
                print(f"Skipping course {course_id} ({schedule_id}): student list could not be downloaded")
                continue
            all_students[(schedule_id, course_id, 1)] = parse_course_info(raw)

            counter = 2
            raw = grab_info(session, schedule_id, course_id, counter)
            while _lists_students(raw):
                all_students[(schedule_id, course_id, counter)] = parse_course_info(raw)
                counter += 1
                raw = grab_info(session, schedule_id, course_id, counter)

    return all_students

In [10]:
# Download and parse the course list of every semester in `semesters`
# (one request per semester); result maps semester label -> list of course dicts.
all_courses = scrape_all_courses()

In [11]:
# Flatten {semester: [course dict, ...]} into one row per (semester, course).
# Explicit columns keep the schema stable even if nothing was scraped.
df = pd.DataFrame(
    [
        {
            "semester": semester,
            "name": course["course_name"],
            "code": course["course_code"],
            "schedule_id": course["ScheduleSysGenId"],
            "course_id": course["CourseSysGenId"],
        }
        for semester, semester_courses in all_courses.items()
        for course in semester_courses
    ],
    columns=["semester", "name", "code", "schedule_id", "course_id"],
)
df

Unnamed: 0,semester,name,code,schedule_id,course_id
0,Monsoon 2024,The Earth and Other Planets,[AST-2840/ AST-3840/ PHY-3840],SCH00000031,CRS00004743
1,Monsoon 2024,Cosmic Messengers -2 (Laboratory),[AST-3880/ AST-4880/ PHY-3880/ PHY-4880/ PHY-6...,SCH00000031,CRS00004745
2,Monsoon 2024,Introduction to Biology 3: Molecular Genetics ...,[BIO-2201],SCH00000031,CRS00003118
3,Monsoon 2024,Laboratory Course 2: Molecular Biology and Bio...,[BIO-2212],SCH00000031,CRS00000873
4,Monsoon 2024,Introduction to Biology II: Cell Biology,[BIO-2214],SCH00000031,CRS00003225
...,...,...,...,...,...
6415,Monsoon 2014,Introduction to Ethics and Global Citizenship,[PHI-209],SCH00000001,CRS00000518
6416,Monsoon 2014,Introduction to Political Theory,[POL-101],SCH00000001,CRS00000031
6417,Monsoon 2014,Introduction to Psychology,[PSY-1001],SCH00000001,CRS00000032
6418,Monsoon 2014,Visual Arts,[VA-001],SCH00000001,CRS00000034


In [12]:
# Semester labels look like "<term> <year>"; split them into their two parts.
df["term"] = df["semester"].str.split(" ").str[0]
df["year"] = df["semester"].str.split(" ").str[1].astype("int64")
# A Monsoon term opens the academic year; any other term closes the one that
# started the previous calendar year.
df["acad_year"] = [
    f"{year}-{year + 1}" if term == "Monsoon" else f"{year - 1}-{year}"
    for term, year in zip(df["term"], df["year"])
]
# Department = text before the first "-" of the code, without the opening
# bracket, cut at the first "/" and stripped of surrounding whitespace.
df["department"] = (
    df["code"]
    .str.split("-").str[0]
    .str.replace("[", "", regex=False)
    .str.partition("/")[0]
    .str.strip()
)
# Flag codes containing "-IS-" and codes listing several course numbers.
df["is_ism"] = df["code"].str.contains("-IS-", regex=False)
df["is_crosslist"] = df["code"].str.contains("/", regex=False)
# Drop every row of the "MLS" department.
df = df.loc[df["department"].ne("MLS")]
df

Unnamed: 0,semester,name,code,schedule_id,course_id,term,year,acad_year,department,is_ism,is_crosslist
0,Monsoon 2024,The Earth and Other Planets,[AST-2840/ AST-3840/ PHY-3840],SCH00000031,CRS00004743,Monsoon,2024,2024-2025,AST,False,True
1,Monsoon 2024,Cosmic Messengers -2 (Laboratory),[AST-3880/ AST-4880/ PHY-3880/ PHY-4880/ PHY-6...,SCH00000031,CRS00004745,Monsoon,2024,2024-2025,AST,False,True
2,Monsoon 2024,Introduction to Biology 3: Molecular Genetics ...,[BIO-2201],SCH00000031,CRS00003118,Monsoon,2024,2024-2025,BIO,False,False
3,Monsoon 2024,Laboratory Course 2: Molecular Biology and Bio...,[BIO-2212],SCH00000031,CRS00000873,Monsoon,2024,2024-2025,BIO,False,False
4,Monsoon 2024,Introduction to Biology II: Cell Biology,[BIO-2214],SCH00000031,CRS00003225,Monsoon,2024,2024-2025,BIO,False,False
...,...,...,...,...,...,...,...,...,...,...,...
6415,Monsoon 2014,Introduction to Ethics and Global Citizenship,[PHI-209],SCH00000001,CRS00000518,Monsoon,2014,2014-2015,PHI,False,False
6416,Monsoon 2014,Introduction to Political Theory,[POL-101],SCH00000001,CRS00000031,Monsoon,2014,2014-2015,POL,False,False
6417,Monsoon 2014,Introduction to Psychology,[PSY-1001],SCH00000001,CRS00000032,Monsoon,2014,2014-2015,PSY,False,False
6418,Monsoon 2014,Visual Arts,[VA-001],SCH00000001,CRS00000034,Monsoon,2014,2014-2015,VA,False,False


In [13]:
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(output_path or ".", exist_ok=True)
df.to_csv(os.path.join(output_path, "courses.csv"), index=False)

In [42]:
# Fetch the student lists for every course row in df, one request at a time.
# Slow: the recorded run below took roughly ten minutes for 6326 courses.
all_students = scrape_all_course_info(df)

6326it [10:20, 10.20it/s]


In [46]:
# Flatten {(schedule_id, course_id, counter): [student dict, ...]} into one row
# per enrolment. Explicit columns keep the schema stable (and the sort below
# valid) even if nothing was scraped.
df2 = pd.DataFrame(
    [
        {
            "course_id": course_id,
            "schedule_id": student["ScheduleSysGenId"],
            "ls": student["LSNo"],
            "ds": student["DSNo"],
            "name": student["UserName"],
            "ashoka_id": student["AshokaId"],
            "sysgen_id": student["UserSysGenId"],
            "email": student["AshokaEmailId"],
            "status": student["Status"],
            # Same source field as schedule_id; kept so the exported CSV
            # layout stays unchanged.
            "semester_id": student["ScheduleSysGenId"],
        }
        for (_schedule_id, course_id, _counter), students in all_students.items()
        for student in students
    ],
    columns=[
        "course_id", "schedule_id", "ls", "ds", "name",
        "ashoka_id", "sysgen_id", "email", "status", "semester_id",
    ],
)
# Display only: the sorted copy is not assigned back, so df2 keeps scrape order.
df2.sort_values(by=["semester_id", "course_id"])

Unnamed: 0,course_id,schedule_id,ls,ds,name,ashoka_id,sysgen_id,email,status,semester_id
164927,CRS00000002,SCH00000001,1,1,Aania,UG-14-1609,USR00000541,aania@ashoka.edu.in,Confirmed,SCH00000001
164928,CRS00000002,SCH00000001,1,1,Adnan Kamal,UG-14-1783,USR00000546,adnan.kamal@ashoka.edu.in,Confirmed,SCH00000001
164929,CRS00000002,SCH00000001,1,3,Ahan Bezbaroa,UG-14-0336,USR00000547,ahan.bezbaroa@ashoka.edu.in,Confirmed,SCH00000001
164930,CRS00000002,SCH00000001,1,2,Ajay Sabharwal,UG-14-1711,USR00000548,ajay.sabharwal@ashoka.edu.in,Confirmed,SCH00000001
164931,CRS00000002,SCH00000001,1,3,Akashmegh Sharma,UG-14-0696,USR00000549,akashmegh.sharma@ashoka.edu.in,Confirmed,SCH00000001
...,...,...,...,...,...,...,...,...,...,...
13750,CRS00005591,SCH00000031,1,0,Gouri S Nair,1020211163,USR00014495,gouri.nair_ug24@ashoka.edu.in,PreRegistered,SCH00000031
13751,CRS00005591,SCH00000031,1,0,Soundarya Lahari Murari,1020211737,USR00043173,soundaryalahari.murari_asp25@ashoka.edu.in,PreRegistered,SCH00000031
13752,CRS00005591,SCH00000031,1,0,Sneha Patra,1020211538,USR00040407,sneha.patra_asp25@ashoka.edu.in,PreRegistered,SCH00000031
13753,CRS00005591,SCH00000031,1,0,Kanishk Alok Banthia,1020211139,USR00014584,kanishk.banthia_ug24@ashoka.edu.in,PreRegistered,SCH00000031


In [44]:
# to_csv does not create missing folders, so make sure the target exists first.
# The file holds student names and e-mail addresses: keep it out of version control.
os.makedirs(output_path or ".", exist_ok=True)
df2.to_csv(os.path.join(output_path, "students.csv"), index=False)