In [1]:
import pandas as pd
import requests
import html5lib
from bs4 import BeautifulSoup

In [2]:
#accessing df from excel file
dmv_hbcu = pd.read_excel('dmv_hbcu.xlsx')
dmv_hbcu.head(15)

Unnamed: 0,Institution,City,State
0,Hampton University,Hampton,Virginia
1,Norfolk State University,Norfolk,Virginia
2,Virginia State University,Petersburg,Virginia
3,Virginia Union University,Richmond,Virginia
4,Virginia University of Lynchburg,Lynchburg,Virginia
5,University of the District of Columbia,Washington,District of Columbia
6,Howard University,Washington,District of Columbia
7,Bowie State University,Bowie,Maryland
8,Coppin State University,Baltimore,Maryland
9,University of Maryland Eastern Shore,Princess Anne,Maryland


In [3]:
grad = "https://catalog.bowiestate.edu/content.php?catoid=27&navoid=1988"

# Fetch the page
response = requests.get(grad, verify=False)
soup = BeautifulSoup(response.text, 'lxml')

# Select the <ul> element with class 'program-list' and extract its <li> elements
program_list = soup.select('ul.program-list li')

# Extract text from each <li> element (clean up whitespace)
programs_data = [li.get_text(strip=True) for li in program_list if li.get_text(strip=True)]

# Check if any program names were extracted
if not programs_data:
    print("No program names found.")
else:
    # Convert the list of program names into a pandas DataFrame
    bowie_grad_df = pd.DataFrame(programs_data, columns=["Program Name"])

    # Define keywords for classification
    certificate_keywords = ['certificate', 'certification']
    masters_keywords = ['masters', 'ma', 'ms']
    phd_keywords = ['phd', 'doctorate', 'doctoral']
    mba_keywords = ['mba', 'business administration']

    # Function to classify the program type based on keywords
    def classify_program_type(program_name):
        program_name_lower = program_name.lower()
        if any(keyword in program_name_lower for keyword in certificate_keywords):
            return 'Certificate'
        elif any(keyword in program_name_lower for keyword in masters_keywords):
            return 'Masters'
        elif any(keyword in program_name_lower for keyword in phd_keywords):
            return 'PhD'
        elif any(keyword in program_name_lower for keyword in mba_keywords):
            return 'MBA'
        else:
            return 'Unknown'  # Default classification if no match

    # Apply the classification function to each program name
    bowie_grad_df['Program Type'] = bowie_grad_df['Program Name'].apply(classify_program_type)

    # Print the DataFrame to show the program names and their classifications
    print(bowie_grad_df)




                                         Program Name Program Type
0                   •Addictions Counseling Certficate      Unknown
1   •Applied and Computational Mathematics Certifi...  Certificate
2          •Applied and Computational Mathematics, MS      Masters
3        •Applied Biotechnology and Molecular Biology      Unknown
4                       •Business Administration, MBA          MBA
5   •Computer Science Certificate - Database Manag...  Certificate
6   •Computer Science Certificate - Graphics and U...  Certificate
7   •Computer Science Certificate - Networks and D...  Certificate
8   •Computer Science Certificate - Scientific Sof...  Certificate
9   •Computer Science Certificate - Software Engin...  Certificate
10                             •Computer Science, DAS      Unknown
11                              •Computer Science, MS      Masters
12         •Counseling Psychology Adlerian Option, MA      Masters
13         •Counseling Psychology Eclectic Option, MA      Mas

In [4]:
undergrad = "https://catalog.bowiestate.edu/content.php?catoid=28&navoid=2041"

# Fetch the page
response = requests.get(undergrad, verify=False)
soup = BeautifulSoup(response.text, 'lxml')

# Select the <ul> element with class 'program-list' and extract its <li> elements
program_list = soup.select('ul.program-list li')

# Extract text from each <li> element (clean up whitespace)
programs_data = [li.get_text(strip=True) for li in program_list if li.get_text(strip=True)]

# Check if any program names were extracted
if not programs_data:
    print("No program names found.")
else:
    # Convert the list of program names into a pandas DataFrame
    bowie_undergrad_df = pd.DataFrame(programs_data, columns=["Program Name"])

    # Define keywords for classification
    certificate_keywords = ['certificate', 'certification']
    bs_keywords = ['bs']
    ba_keywords = ['ba']
    bs_ba_keywords = ['BA/BS']
    minor_keywords = ['minor']
    # mba_keywords = ['mba', 'business administration']

    # Function to classify the program type based on keywords
    def classify_program_type(program_name):
        program_name_lower = program_name.lower()
        if any(keyword in program_name_lower for keyword in certificate_keywords):
            return 'Certificate'
        elif any(keyword in program_name_lower for keyword in bs_keywords):
            return 'BS'
        elif any(keyword in program_name_lower for keyword in ba_keywords):
            return 'BA'
        elif any(keyword in program_name_lower for keyword in bs_ba_keywords):
            return 'BA/BS'
        elif any(keyword in program_name_lower for keyword in minor_keywords):
            return 'Minor'
        # elif any(keyword in program_name_lower for keyword in mba_keywords):
        #     return 'MBA'
        else:
            return 'NA'  # Default classification if no match

    # Apply the classification function to each program name
    bowie_undergrad_df['Program Type'] = bowie_undergrad_df['Program Name'].apply(classify_program_type)

    # Print the DataFrame to show the program names and their classifications
    print(bowie_undergrad_df)




                                          Program Name Program Type
0                                  •Bioinformatics, BS           BS
1                                         •Biology, BS           BS
2    •Business Administration - Accounting Concentr...           BS
3    •Business Administration - Banking and Finance...           BS
4    •Business Administration - Business Informatio...           BS
..                                                 ...          ...
100  •Upper Division Certificate in Cloud Applicati...  Certificate
101       •Upper Division Certificate in Cybersecurity  Certificate
102      •Upper Division Certificate in Data Analytics  Certificate
103    •Upper Division Certificate in Entrepreneurship  Certificate
104                         •University Honors Program           NA

[105 rows x 2 columns]


In [5]:
# Define a custom function to remove the bullet from the date column
def remove_bullet(column):
    return column.str.replace('•', '')

# Apply the custom function to the date column
bowie_grad_df['Program Name'] = remove_bullet(bowie_grad_df['Program Name'])
bowie_undergrad_df['Program Name'] = remove_bullet(bowie_undergrad_df['Program Name'])

In [6]:
bowie_grad_df.head(20)

Unnamed: 0,Program Name,Program Type
0,Addictions Counseling Certficate,Unknown
1,Applied and Computational Mathematics Certificate,Certificate
2,"Applied and Computational Mathematics, MS",Masters
3,Applied Biotechnology and Molecular Biology,Unknown
4,"Business Administration, MBA",MBA
5,Computer Science Certificate - Database Manage...,Certificate
6,Computer Science Certificate - Graphics and Us...,Certificate
7,Computer Science Certificate - Networks and Di...,Certificate
8,Computer Science Certificate - Scientific Soft...,Certificate
9,Computer Science Certificate - Software Engine...,Certificate


In [7]:
bowie_undergrad_df.head(50)

Unnamed: 0,Program Name,Program Type
0,"Bioinformatics, BS",BS
1,"Biology, BS",BS
2,Business Administration - Accounting Concentra...,BS
3,Business Administration - Banking and Finance ...,BS
4,Business Administration - Business Information...,BS
5,Business Administration - Data Analytics for D...,BS
6,Business Administration - Economics Concentrat...,BS
7,Business Administration - General Business Con...,BS
8,Business Administration - Management Concentra...,BS
9,Business Administration - Management Concentra...,BS
