In [104]:
import os
import warnings
from tqdm import tqdm
import ast
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from sankeyflow import Sankey

# Hook tqdm into pandas (enables the progress_* variants of apply/map).
tqdm.pandas()
# Blanket filter: every warning category is ignored for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
# Directory whose entries are listed and read one by one by the scraping cell.
folder_path = "./major_minor"
# Destination of the final student table written with DataFrame.to_csv.
output_path = "./datasets/major_minor.csv"

In [107]:
def extract_major(majors):
    """Return the student's primary major from a credentials dict literal.

    Parameters
    ----------
    majors : str
        Text of a Python literal mapping credential labels to values,
        e.g. '{"Major": "PSY", "Minor": "MS"}'.

    Returns
    -------
    The value stored under the first label found, checked in this order:
    "Major", "Advanced Major", "Second Major Equivalent". None when no such
    label is present, or when the text does not describe a dict (the scraping
    cell produces '{""}' for an empty credentials cell, which parses to a set).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ast.literal_eval when the text is not a valid literal.
    """
    creds = ast.literal_eval(majors)

    # A non-dict literal (notably the empty-cell placeholder) carries no labels.
    if not isinstance(creds, dict):
        return None

    # Priority order: the first label present wins.
    for label in ("Major", "Advanced Major", "Second Major Equivalent"):
        if label in creds:
            return creds[label]
    return None

In [108]:
def recurse_minor(creds):
    """Collect the "Minor" and "Concentration" entries of a credentials dict.

    Each label that is present is removed from ``creds`` (the dict is modified
    in place) and its value is appended to the result followed by a comma,
    "Minor" first and "Concentration" second. The returned text always ends
    with the literal word "None" (for example "MS,ENG,None"), which callers
    are expected to strip. Returns None when neither label is present.
    """
    collected = ""
    for label in ("Minor", "Concentration"):
        if label in creds:
            collected += creds.pop(label) + ","

    if not collected:
        return None
    # Trailing sentinel kept for compatibility with the original output format.
    return collected + "None"

def extract_minor(minors):
    """Return the student's minor and/or concentration as comma-separated text.

    Parameters
    ----------
    minors : str
        Text of a Python literal mapping credential labels to values,
        e.g. '{"Major": "PSY", "Minor": "MS"}'.

    Returns
    -------
    str or None
        The "Minor" value, then the "Concentration" value, joined with ","
        (only the labels that are present). None when neither label exists or
        when the text does not describe a dict (the scraping cell produces
        '{""}' for an empty credentials cell, which parses to a set).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ast.literal_eval when the text is not a valid literal.

    Notes
    -----
    The input is parsed as a dict literal, so a label that appears more than
    once in the text keeps only its last value.
    """
    creds = ast.literal_eval(minors)

    # A non-dict literal (notably the empty-cell placeholder) carries no labels.
    if not isinstance(creds, dict):
        return None

    # Build the result directly instead of trimming a stringified "None"
    # sentinel off the end with a fixed-length slice.
    found = [creds[label] for label in ("Minor", "Concentration") if label in creds]
    return ",".join(found) if found else None


In [109]:
# One tuple per student row: five plain-text cells plus the credentials cell
# re-encoded as the text of a dict literal (parsed later by the extract_* helpers).
students = []
for file_name in tqdm(os.listdir(folder_path)):

    # The file is loaded through pandas and the HTML markup is taken from the
    # label of its first column.
    frame = pd.read_csv(f"{folder_path}/{file_name}")
    markup = BeautifulSoup(frame.columns[0], 'html.parser')

    # Only the first <table> of each document is processed.
    for table in markup.find_all('table')[:1]:
        # The first <tr> is skipped.
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            plain_cells = [cell.get_text(strip=True) for cell in cells[:5]]

            # "Label - Value" fragments become '"Label": "Value"' pairs; an
            # empty cell therefore yields the placeholder '{""}'.
            fragments = '", "'.join(cells[5].stripped_strings)
            majors_literal = '{"' + fragments.replace(" - ", '": "') + '"}'

            students.append((*plain_cells, majors_literal))

100%|██████████| 19/19 [00:01<00:00, 11.41it/s]


In [110]:
# One row per scraped student; "majors" still holds the raw dict-literal text.
df = pd.DataFrame(students, columns=["batch", "email", "id", "name", "status", "majors"])

# '{""}' is the text produced for an empty credentials cell; it is not a dict,
# so it is mapped straight to None instead of being parsed.
# Series.map works on the single column that is needed and also copes with an
# empty frame, where row-wise DataFrame.apply(axis=1) does not return a Series.
raw_credentials = df["majors"]
df["major"] = raw_credentials.map(lambda text: extract_major(text) if text != '{""}' else None)
df["minor"] = raw_credentials.map(lambda text: extract_minor(text) if text != '{""}' else None)

# Drop the raw text without in-place mutation so re-running the cell is clean.
df = df.drop(columns=["majors"])

In [111]:
# Programme code: the text before the first space of the batch label.
df["program"] = df["batch"].map(lambda label: label.partition(" ")[0].strip())
# Graduation year: "20" prefixed to the text after the last hyphen, stored as a float.
df["grad"] = df["batch"].map(lambda label: float("20" + label.rpartition("-")[2].strip()))
# Order rows by programme, then graduation year, then email.
df.sort_values(by=["program", "grad", "email"], inplace=True)

In [112]:
# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (it may be absent on a fresh run).
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(output_path, index=False)
df

Unnamed: 0,batch,email,id,name,status,major,minor,program,grad
0,ASP 2017-18,aania_asp18@ashoka.edu.in,UG-14-1609,Aania,Graduated,,,ASP,2018.0
1,ASP 2017-18,aashna.lal_asp18@ashoka.edu.in,UG-14-0568,Aashna Lal,Graduated,,CW,ASP,2018.0
2,ASP 2017-18,abhinav.srikant_asp18@ashoka.edu.in,UG-14-1833,Abhinav Srikant,Graduated,PSY,MS,ASP,2018.0
3,ASP 2017-18,aditya.prakash_asp18@ashoka.edu.in,UG-14-1812,Aditya Prakash,Graduated,,MAT,ASP,2018.0
4,ASP 2017-18,ahan.bezbaroa_asp18@ashoka.edu.in,UG-14-0336,Ahan Bezbaroa,Graduated,PPE,ENG,ASP,2018.0
...,...,...,...,...,...,...,...,...,...
7001,UG 2023-27,yuvakshi.dam_ug2023@ashoka.edu.in,1020231747,Yuvakshi Dam,Enrolled,,,UG,2027.0
7002,UG 2023-27,yuvraj.verma_ug2023@ashoka.edu.in,1020231748,Yuvraj Verma,Enrolled,,,UG,2027.0
7003,UG 2023-27,zahra.baqeri_ug2023@ashoka.edu.in,1020231020,Zahra Baqeri,Enrolled,,,UG,2027.0
7004,UG 2023-27,zahrah.imani_ug2023@ashoka.edu.in,1020231749,Zahrah Salim Imani,Enrolled,,,UG,2027.0
