In [22]:
import os
import pandas as pd
import numpy as np
import openai
import spacy
from transformers import pipeline, RobertaModel, AutoTokenizer
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm
from dotenv import load_dotenv
# Read variables from a local .env file into the process environment
# (OPENAI_KEY is looked up with os.getenv in a later cell).
load_dotenv()
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None

In [155]:
# Hugging Face summarization pipeline using the sshleifer/distilbart-cnn-12-6 checkpoint.
# NOTE(review): `summarizer` is not referenced by any other visible cell (the summaries
# below come from the OpenAI API) — confirm whether this model download is still needed.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

Downloading (…)lve/main/config.json: 100%|██████████| 1.80k/1.80k [00:00<00:00, 871kB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.22G/1.22G [01:49<00:00, 11.2MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 10.7kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.32MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.27MB/s]


In [4]:
# Take the OpenAI credential from the environment; the value is None when OPENAI_KEY is unset.
openai.api_key = os.environ.get("OPENAI_KEY")

In [29]:
# Chat model name passed to the OpenAI summary requests below.
model_id = "gpt-3.5-turbo"

In [8]:
# Load the CIP 2010 code table. CIPCode is read as str because later cells
# work on its string form (`.str.len()`, `.str[:5]`).
cip = pd.read_csv("CIPCode2010.csv", dtype={"CIPCode": str})

In [9]:
# Preview the first ten rows of the raw CIP table.
cip.head(10)

Unnamed: 0,CIPCode,CIPTitle,CIPDefinition
0,1.0,"AGRICULTURE, AGRICULTURE OPERATIONS, AND RELAT...",Instructional programs that focus on agricultu...
1,1.0,"Agriculture, General.",Instructional content is defined in code 01.0000.
2,1.0,"Agriculture, General.",A program that focuses on the general principl...
3,1.01,Agricultural Business and Management.,Instructional content for this group of progra...
4,1.0101,"Agricultural Business and Management, General.",A general program that focuses on modern busi...
5,1.0102,Agribusiness/Agricultural Business Operations.,A program that prepares individuals to manage ...
6,1.0103,Agricultural Economics.,A program that focuses on the application of e...
7,1.0104,Farm/Farm and Ranch Management.,A program that prepares individuals to manage ...
8,1.0105,Agricultural/Farm Supplies Retailing and Whole...,A program that prepares individuals to sell a...
9,1.0106,Agricultural Business Technology.,A program that prepares individuals to perform...


In [10]:
# Base URL of the NCES CIP site; the browse page name is appended to it when scraping.
url = "https://nces.ed.gov/ipeds/cipcode/"

In [15]:
# Start a Chrome session and navigate to the CIP browse page.
driver = webdriver.Chrome()
browse_page = f"{url}browse.aspx?y=55"
driver.get(browse_page)

In [16]:
# Locate the image control titled "Collapse All" on the browse page and click it.
collapse_all_control = driver.find_element(By.XPATH, "//img[@title='Collapse All']")
collapse_all_control.click()

In [17]:
# Collect every "View this CIP" link. For each one keep the text before the
# first ")" as the code, and the link target as its URL.
elements = driver.find_elements(By.XPATH, "//a[@title='View this CIP']")
codes, urls = [], []
for link in elements:
    codes.append(link.text.partition(")")[0])
    urls.append(link.get_attribute("href"))

In [18]:
# quit() ends the whole WebDriver session (all windows plus the driver process).
# close() would only close the current window and leave the driver process running.
driver.quit()

In [19]:
# One row per scraped link: the CIP code and the URL of its detail page.
cip_url_columns = {"CIPCode": codes, "url": urls}
cip_urls = pd.DataFrame(cip_url_columns)

In [20]:
# Attach the scraped URL to every CIP row. The join key is spelled out so the merge
# cannot silently start matching on any other column the two frames come to share.
# The inner join drops codes that are missing from either side.
cip_merged = cip.merge(cip_urls, how="inner", on="CIPCode")

In [21]:
# Keep only rows whose code string is longer than two characters
# (presumably this drops the bare two-digit series rows — confirm against the data).
# `.copy()` makes the result an independent frame, so the column edits in the
# following cells are plain writes rather than assignments to a slice of cip_merged.
cip_merged_4_6 = cip_merged[cip_merged['CIPCode'].str.len() > 2].copy()

In [22]:
# Blank out definitions containing "Instructional content" — in the preview above
# these only point at other codes rather than describing a program.
# regex=False: the needle is a plain substring.
# na=False: rows with a missing definition count as non-matching, instead of putting
# NaN into the boolean mask, which .loc rejects.
is_placeholder = cip_merged_4_6['CIPDefinition'].str.contains(
    "Instructional content", regex=False, na=False
)
cip_merged_4_6.loc[is_placeholder, 'CIPDefinition'] = ""

In [23]:
# Group rows by the first five characters of the code and give every row in a
# group the space-joined text of all definitions in that group.
family_key = cip_merged_4_6['CIPCode'].str[:5]
definitions_by_family = cip_merged_4_6['CIPDefinition'].groupby(family_key)
cip_merged_4_6['CIPDefinition_concat'] = definitions_by_family.transform(' '.join)

In [24]:
# Keep the rows whose code is exactly five characters long, drop the per-row
# definition (the concatenated one stays) and renumber the index from zero.
has_five_char_code = cip_merged_4_6['CIPCode'].str.len() == 5
cip_merged_4 = (
    cip_merged_4_6.loc[has_five_char_code]
    .drop(columns=['CIPDefinition'])
    .reset_index(drop=True)
)

In [None]:
# Ask the chat model for a short summary of each concatenated definition.
# The loop variable is deliberately not called `cip`: that name holds the CIP
# DataFrame loaded earlier, and rebinding it to a string here would break any
# re-run of the cells that use the frame.
summaries = []
for definition_text in tqdm(cip_merged_4['CIPDefinition_concat']):
    # Only the first 4000 characters of the text are sent.
    prompt = f"Summarize this program description in fewer than 30 words: {definition_text[:4000]}"
    response = openai.ChatCompletion.create(
        model=model_id,
        messages=[{"role": "system", "content": prompt}],
    )
    summary = response.choices[0].message['content']
    summaries.append(summary)

In [56]:
# Store the summaries as a new column; `summaries` must hold one entry per row of cip_merged_4.
cip_merged_4['CIPDefinition_summary'] = summaries

## Retrieving similar CIPs

In [117]:
# Curriculum document set; the cells below read its 'descriptions' and 'cip' columns.
cip_description = pd.read_csv("../ir_system_setup_and_dev/final_curriculum_data/final_docset.csv")

In [78]:
# Plain Python list of the description texts, in row order.
cip_list = cip_description['descriptions'].tolist()

In [None]:
# One-time setup: download the spaCy model en_core_web_md, which is loaded in the next cell.
!python -m spacy download en_core_web_md

In [25]:
# Load the en_core_web_md spaCy pipeline used for the similarity comparison below.
nlp = spacy.load('en_core_web_md')

In [26]:
# Run every description through spaCy. nlp.pipe processes the texts as a batched
# stream and yields the Doc objects in input order, which is faster than calling
# nlp() once per text.
docs = list(nlp.pipe(cip_list))

In [99]:
# For each description, record the positions of its three most similar OTHER descriptions.
# The document itself is excluded explicitly instead of assuming it sorts first.
# With duplicate descriptions or tied scores another index can precede it, and a
# [1:4] slice would then list the document as its own neighbour and drop a real one.
cip_index = {}
for idx_i, doc_i in enumerate(docs):
    docs_sim = {
        idx_j: doc_i.similarity(doc_j)
        for idx_j, doc_j in enumerate(docs)
        if idx_j != idx_i
    }
    # Highest similarity first; keep the top three positions.
    cip_index[idx_i] = sorted(docs_sim, key=docs_sim.get, reverse=True)[:3]

In [128]:
# Normalise each raw code: strip trailing dots, parse it as a float and render it
# back as a zero-padded, two-decimal string (width 5).
cips = [
    '{:05.2f}'.format(float(raw_code.rstrip(".")))
    for raw_code in cip_description['cip']
]
# Translate the positional neighbour lists into code -> list of related codes.
cip_sim_dict = {
    cips[position]: [cips[neighbour] for neighbour in neighbours]
    for position, neighbours in cip_index.items()
}

In [133]:
# Display the code -> related-codes mapping for inspection.
cip_sim_dict

{'05.02': ['05.01', '45.02', '45.11'],
 '45.02': ['45.11', '45.12', '03.05'],
 '04.02': ['04.06', '24.01', '30.44'],
 '50.07': ['50.04', '50.05', '30.50'],
 '40.02': ['40.04', '40.06', '26.02'],
 '40.05': ['26.02', '26.01', '25.40'],
 '16.12': ['16.04', '23.01', '16.99'],
 '16.01': ['16.05', '16.99', '16.09'],
 '11.07': ['06.11', '11.01', '14.18'],
 '05.01': ['16.99', '05.02', '45.02'],
 '26.13': ['31.51', '22.31', '42.01'],
 '45.06': ['23.13', '13.13', '26.12'],
 '23.01': ['16.99', '05.01', '16.12'],
 '16.99': ['05.01', '23.01', '16.01'],
 '40.06': ['26.02', '14.08', '14.07'],
 '16.05': ['16.01', '16.09', '16.99'],
 '54.01': ['45.02', '05.01', '45.11'],
 '27.01': ['26.11', '14.07', '50.10'],
 '26.02': ['40.06', '51.07', '13.13'],
 '50.09': ['51.22', '52.02', '10.02'],
 '26.15': ['26.01', '50.10', '40.06'],
 '38.01': ['45.02', '45.11', '51.99'],
 '40.08': ['14.18', '14.07', '09.14'],
 '45.10': ['28.45', '16.24', '27.43'],
 '45.09': ['28.45', '44.05', '45.10'],
 '42.01': ['26.42', '27.4

In [142]:
# Add a comma-separated list of related codes to each row. Series.map does the
# lookup for the whole column at once (no per-row .loc writes) and leaves NaN for
# codes that have no entry. The 'related' column is therefore always present, and
# re-running the cell always reflects the current cip_sim_dict.
related_by_code = {
    code: ','.join(neighbours) for code, neighbours in cip_sim_dict.items()
}
cip_merged_4['related'] = cip_merged_4['CIPCode'].map(related_by_code)

In [151]:
# Write the final table to the assets folder, omitting the DataFrame index.
cip_merged_4.to_csv("../assets/data/cip_url_summary.csv", index=False)