In [1]:
import numpy as np
import pandas as pd
import spacy
import pyterrier as pt
import re
import os
from collections import Counter
import random

# <span Style='font-family: Georgia, serif; color:orange'> **Create Docset**

<span Style='font-family: Georgia, serif; color:orange'>This section only needs to be run if there is no 'final_docset.csv' in the 'final_curriculum_data' folder or if new curriculum data has been added. Otherwise, proceed to 'Create Index' section.
 </span>

### <span Style='font-family: Georgia, serif; color:orange'> **Read in CSVs** </span>

In [28]:
nj_df = pd.read_csv('final_curriculum_data\\nj_full.csv')
in_df = pd.read_csv('final_curriculum_data\\in_full.csv')
tx_df = pd.read_csv('final_curriculum_data\\texas_schools_with_CIPs.csv', index_col=0)
va_df = pd.read_csv('final_curriculum_data\\virginia_schools_with_CIPs.csv', index_col=0)
va_df.columns = [i.lower() for i in va_df.columns]
va_df = va_df.drop(columns='course_codes').rename(columns={'course_names':'courses'})
bowie_df = pd.read_csv('final_curriculum_data\\Bowie_State_Course_and_CIP.csv', index_col=0).drop(columns='Course_Codes')
jh_df = pd.read_csv('final_curriculum_data\\Hopkins_Course_and_CIP.csv', index_col=0).drop(columns='Course_Codes')
isu_df = pd.read_csv('final_curriculum_data\\ISU_Course_and_CIP.csv', index_col=0).drop(columns='Codes')
uiuc_df = pd.read_csv('final_curriculum_data\\UIUC_Course_and_CIP.csv', index_col=0)
uiuc_df['degree_level'] = ['undergraduate' if ', B' in i else 'graduate' for i in uiuc_df.Program]
memphis_df = pd.read_csv('final_curriculum_data\\Memphis_Course_and_CIP.csv', index_col=0).drop(columns='Course_Codes')
tsu_df = pd.read_csv('final_curriculum_data\\TSU_Course_and_CIP.csv', index_col=0).drop(columns='Course_Codes')
cu_df = pd.read_csv('final_curriculum_data\\Clemson_Course_and_CIP.csv', index_col=0)
cu_df['degree_level'] = ['undergraduate']*len(cu_df)


df_list = [nj_df, in_df, tx_df, va_df, bowie_df, jh_df, isu_df, uiuc_df, memphis_df, tsu_df, cu_df]
# for i in df_list:
#     display(i.head())

### <span Style='font-family: Georgia, serif; color:orange'> **Clean, Combine, and Group by CIP** </span>

In [29]:
def clean_combine_group(df_list):
    total_df = pd.DataFrame()
    for i in df_list:
        new_cols = []
        #standardize column names
        for x in range(len(i.columns)):
            if 'description' in i.columns[x].lower() or 'descs' in i.columns[x].lower():
                new_cols.append('descriptions')
            elif 'names' in i.columns[x].lower() or 'titles' in i.columns[x].lower():
                new_cols.append('courses')
            else:
                new_cols.append(i.columns[x].lower())
        i.columns = new_cols

        #clean program names
        cleaned = []
        for x in i['program']:
            if re.search(', [A-Z]\..*', x):
                cleaned.append(re.match('.*(?=, [A-Z]\..*)', x)[0].lower())
            elif re.search(' \(.*\)', x):
                cleaned.append(re.sub(' \(.*\)', '', x).lower())
            else:
                cleaned.append(x.lower())
        i['program'] = cleaned

        #combine
        total_df = pd.concat([total_df, i])

    cips = [str(i)[1:] if str(i)[0]=='0' else str(i) for i in total_df.cip]
    total_df['cip'] = cips

    final_dict = {'cip':[], 'institutions':[], 'programs':[],'degree_levels':[], 'courses':[], 'descriptions':[]}
    unique_cips = total_df.cip.unique()
    for i in unique_cips:
        if len(str(i)) > 5 and str(i)[:-2] not in final_dict['cip']:
            final_dict['cip'].append(str(i)[:-2])
        elif len(str(i)) <= 5 and str(i) not in final_dict['cip']:
            final_dict['cip'].append(str(i))
    for i in final_dict['cip']:
        cip_df = total_df[total_df['cip'].astype(str)==i]
        final_dict['institutions'].append(set(cip_df.institution.str.lower()))
        final_dict['programs'].append(set(cip_df.program))
        final_dict['degree_levels'].append(set(cip_df.degree_level.str.lower()))
        final_dict['courses'].append(cip_df.courses.str.cat(sep=' '))
        final_dict['descriptions'].append(cip_df.descriptions.str.cat(sep=' '))

    final_df = pd.DataFrame(final_dict)
    final_df = final_df[~((final_df['descriptions'].isna()) | (final_df['descriptions'] == '') | (final_df['cip']=='nan'))]
    return final_df

In [30]:
total_df = clean_combine_group(df_list)
total_df.iloc[165]

cip                                                          45.99
institutions                              {bowie state university}
programs         {technology - criminal justice/law enforcement...
degree_levels                                      {undergraduate}
courses          Deviant Behavior | Juvenile Delinquency | Crim...
descriptions     Prerequisite(s): SOCI 101. This course examine...
Name: 177, dtype: object

### <span Style='font-family: Georgia, serif; color:orange'> **Clean Courses & Descriptions** </span>

In [31]:
def clean_txt(in_df):
    nlp = spacy.load('en_core_web_sm')

    courses_cleaned = []
    descs_cleaned = []

    for i, r in in_df.iterrows():
        descs = r['descriptions']
        courses = r['courses']

        desc_no_special = re.sub('[^a-zA-Z ]', '', descs)
        courses_no_special = re.sub('[^a-zA-Z ]', '', courses)

        clean_descs = " ".join(token.lemma_ for token in nlp(desc_no_special.lower()) if not token.is_stop and token.has_vector)
        clean_courses = " ".join(token.lemma_ for token in nlp(courses_no_special.lower()) if not token.is_stop and token.has_vector)

        descs_cleaned.append(clean_descs)
        courses_cleaned.append(clean_courses)

    new_df = in_df.copy()
    new_df['courses'] = courses_cleaned
    new_df['descriptions'] = descs_cleaned

    return new_df


In [32]:
final = clean_txt(total_df)

In [33]:
final.to_csv('final_curriculum_data\\final_docset.csv')

# <span Style='font-family: Georgia, serif; color:orange'> **Create Index** </span>

In [2]:
if not pt.started():
    pt.init()

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [3]:
final = pd.read_csv('final_curriculum_data\\final_docset.csv', index_col=0)
final['cip'] = final['cip'].astype(str)
final['docno'] = [i for i in range(1, len(final)+1)]

cip_titles = pd.read_csv('final_curriculum_data\\cip_names.csv')[['Title', 'CIP Code']]
cip_titles['CIP Code'] = [i[2:-1] if i[2] != '0' else i[3:-1] for i in cip_titles['CIP Code']]
cip_titles['CIP Code'] = [i[:-1] if i[-1] == '0' else i for i in cip_titles['CIP Code']]
final = final[final['cip'].isin(cip_titles['CIP Code'])]

In [4]:
cwd = os.getcwd()
pt_index_path = cwd + '\\curriculum_docs'

docset = final.drop(columns=['institutions', 'programs', 'degree_levels'])
docset = docset.to_dict(orient='records')

if not os.path.exists(pt_index_path + '\\data_1.properties'):
    indexer = pt.IterDictIndexer(pt_index_path, overwrite=True, meta={'docno': 20, 'cip': 20, 'courses':60000})
    index_ref = indexer.index(docset, fields=['descriptions'])
else:
    index_ref = pt.IndexRef.of(pt_index_path + "\\data_1.properties")
index = pt.IndexFactory.of(index_ref)

# <span Style='font-family: Georgia, serif; color:orange'> **Generate Training Data** </span>

In [5]:
topic_of_interest = ['Biology','Chemistry','Physics','Mathematics','Computer Science','Engineering','Psychology','Sociology','Anthropology','Political Science',
'History','Philosophy','English','Education','Art','Music','Theater','Dance','Journalism','Business','Marketing','Economics','Finance','Accounting',
'Management','International Business','Entrepreneurship','Human Resources','Law','Criminal Justice','Forensic Science','Environmental Science','Geology',
'Geography','Agriculture','Nutrition','Public Health','Nursing','Medicine','Veterinary Science','Dental Science','Physical Therapy','Occupational Therapy',
'Speech Therapy','Social Work','Counseling','Library Science','Archival Studies','Museum Studies','Information Technology','Data Science','Artificial Intelligence',
'Machine Learning','Cybersecurity','Cryptography','Web Development','Mobile Development','Game Development','Multimedia','Graphic Design','Interior Design','Fashion Design',
'Industrial Design','Urban Planning','Architecture','Construction Management','Real Estate','Surveying','Aerospace Engineering','Mechanical Engineering',
'Electrical Engineering','Civil Engineering','Chemical Engineering','Materials Science','Nuclear Engineering','Marine Science','Oceanography','Meteorology',
'Astronomy','Zoology','Botany','Ecology','Conservation','Forestry','Horticulture','Landscape Architecture','Sports Science','Kinesiology','Exercise Science','Sports Medicine',
'Coaching','Physical Education','Recreation','Tourism','Hospitality','Culinary Arts','Wine Studies','Performing Arts','Creative Writing', "Agricultural Science"
"Astrophysics","Behavioral Science","Biochemistry","Biomedical Engineering","Biostatistics","Cognitive Science","Communication Disorders","Comparative Literature","Creative Writing",
"Criminology","Cultural Studies","Data Analytics","Demography","Developmental Psychology","Digital Humanities","Early Childhood Education","East Asian Studies","Econometrics","Educational Psychology",
"Electronics Engineering","Energy Studies","Engineering Physics","Entomology","Environmental Engineering","Ethnic Studies","European Studies","Evolutionary Biology","Film Studies","Food Science",
"French Language and Literature","Gender Studies","Genetics","Geographic Information Systems","German Language and Literature","Global Studies","Health Administration",
"Healthcare Management","Hispanic Studies","Humanities","Industrial Psychology","Information Science","International Studies","Italian Language and Literature","Jewish Studies","Latin American Studies",
"Linguistic Anthropology","Marine Biology","Marketing Research","Materials Engineering","Mathematical Biology","Medical Anthropology","Medical Physics",
"Medical Sociology","Medieval Studies","Microbiology","Middle Eastern Studies","Molecular Biology","Museum Management","Music Education","Neuroscience",
"Nuclear Physics","Nursing Science","Operations Research","Organic Chemistry","Organizational Psychology","Paleontology","Peace and Conflict Studies","Pediatric Nursing",
"Philosophy of Science","Physical Chemistry","Physical Oceanography","Plant Science","Polymer Science","Portuguese Language and Literature","Psychobiology""Public Administration",
"Public Policy","Quantum Physics","Radiation Oncology","Religious Studies","Robotics","Russian Language and Literature","Science and Technology Studies","Science Education",
"Science Journalism","Science Writing","Social Psychology","Social Statistics","Social Theory","Sociolinguistics","Software Engineering","Spanish Language and Literature","Special Education",
"Sport Management","Statistics","Structural Engineering","Supply Chain Management","Systems Biology","Theoretical Physics"]

In [6]:
def generate_queries(term_list):
    queries = []

    for i in range(25):
        query_terms = []
        k = random.randint(1,5)
        for x in range(k):
            while True:
                term = random.choice(term_list).lower()
                if term not in query_terms:
                    query_terms.append(term)
                    break
                else:
                    continue

        queries.append(query_terms)

    return queries

In [7]:
def generate_training_data(queries, index, cips, docset, weight_results=True):
    bm25 = pt.BatchRetrieve(index, wmodel='BM25')
    pl2 = pt.BatchRetrieve(index, wmodel='PL2')

    training_dict = {'query':[], 'cip_code':[], 'cip_name':[]}

    for i in queries:
        total_scores = pd.DataFrame()
        for x in i:
            results_bm25 = (bm25%50).search(x)
            results_pl2 = (pl2%50).search(x)
            results_pl2 = results_pl2[~results_pl2['docno'].isin(results_bm25['docno'])]
            all_results = pd.concat([results_bm25, results_pl2])
            if weight_results == True:
                k = i.index(x) + 1
                all_results['score'] = all_results['score'] / k

            total_scores = pd.concat([total_scores, all_results])

        total_scores = total_scores.groupby('docno').sum(numeric_only=True).sort_values('score', ascending=False).head(70)
        cip_codes = [docset[docset['docno']==int(i)].iloc[0]['cip'] for i in total_scores.index]
        cip_names = [cips[cips['CIP Code']==i].iloc[0]['Title'] for i in cip_codes]

        training_dict['query'].extend([' + '.join(i)]*len(total_scores))
        training_dict['cip_code'].extend(cip_codes)
        training_dict['cip_name'].extend(cip_names)

    training_data = pd.DataFrame(training_dict)

    return training_data

In [8]:
queries = generate_queries(topic_of_interest)
training_data = generate_training_data(queries, index, cip_titles, final)

In [9]:
training_data

Unnamed: 0,query,cip_code,cip_name
0,engineering physics + paleontology + software ...,11.04,Information Science/Studies.
1,engineering physics + paleontology + software ...,50.06,Film/Video and Photographic Arts.
2,engineering physics + paleontology + software ...,29.02,"Intelligence, Command Control and Information ..."
3,engineering physics + paleontology + software ...,1.06,Applied Horticulture and Horticultural Busines...
4,engineering physics + paleontology + software ...,54.01,History.
...,...,...,...
1702,health administration + archival studies,52.13,Management Sciences and Quantitative Methods.
1703,health administration + archival studies,1.01,Agricultural Business and Management.
1704,health administration + archival studies,1.06,Applied Horticulture and Horticultural Busines...
1705,health administration + archival studies,43.01,Criminal Justice and Corrections.
