In [3]:
import os
import pandas as pd
import numpy as np
import openai
from transformers import pipeline
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (the OpenAI key is read from the environment further down).
load_dotenv()
# Turn off pandas' chained-assignment (SettingWithCopy) warning notebook-wide.
pd.set_option("mode.chained_assignment", None)

In [155]:
# Build a Hugging Face summarization pipeline from the
# sshleifer/distilbart-cnn-12-6 checkpoint. The first run downloads the
# weights (about 1.2 GB according to the progress log below).
# NOTE(review): no visible cell calls `summarizer` -- the summaries are
# produced via the OpenAI API instead; confirm this is still needed.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)

Downloading (…)lve/main/config.json: 100%|██████████| 1.80k/1.80k [00:00<00:00, 871kB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.22G/1.22G [01:49<00:00, 11.2MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 10.7kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.32MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.27MB/s]


In [4]:
# Take the OpenAI API key from the OPENAI_KEY environment variable
# (resolves to None when the variable is not set).
openai.api_key = os.environ.get("OPENAI_KEY")

In [29]:
# Chat model name passed to openai.ChatCompletion.create in the summarization loop.
model_id = "gpt-3.5-turbo"

In [8]:
# Load the CIP 2010 code table. CIPCode is read as text so the codes are
# not parsed as numbers.
cip = pd.read_csv(
    "CIPCode2010.csv",
    dtype=dict(CIPCode=str),
)

In [9]:
# Preview the first ten rows of the raw CIP table.
cip.head(n=10)

Unnamed: 0,CIPCode,CIPTitle,CIPDefinition
0,1.0,"AGRICULTURE, AGRICULTURE OPERATIONS, AND RELAT...",Instructional programs that focus on agricultu...
1,1.0,"Agriculture, General.",Instructional content is defined in code 01.0000.
2,1.0,"Agriculture, General.",A program that focuses on the general principl...
3,1.01,Agricultural Business and Management.,Instructional content for this group of progra...
4,1.0101,"Agricultural Business and Management, General.",A general program that focuses on modern busi...
5,1.0102,Agribusiness/Agricultural Business Operations.,A program that prepares individuals to manage ...
6,1.0103,Agricultural Economics.,A program that focuses on the application of e...
7,1.0104,Farm/Farm and Ranch Management.,A program that prepares individuals to manage ...
8,1.0105,Agricultural/Farm Supplies Retailing and Whole...,A program that prepares individuals to sell a...
9,1.0106,Agricultural Business Technology.,A program that prepares individuals to perform...


In [10]:
# Base URL of the NCES CIP code site; page names are appended to it below.
url = "https://nces.ed.gov/ipeds/cipcode/"

In [15]:
# Start a Chrome WebDriver session and open the CIP browse page.
driver = webdriver.Chrome()
driver.get(f"{url}browse.aspx?y=55")

In [16]:
# Click the image whose title attribute is 'Collapse All'.
# NOTE(review): presumably this toggles the code tree so that every CIP link
# is present for the scrape in the next cell -- confirm against the live page.
driver.find_element(By.XPATH, "//img[@title='Collapse All']").click()

In [17]:
# Gather every "View this CIP" anchor on the page. The code is the part of
# the link text before the first ")"; the target page is the anchor's href.
elements = driver.find_elements(By.XPATH, "//a[@title='View this CIP']")
codes = [link.text.partition(")")[0] for link in elements]
urls = [link.get_attribute("href") for link in elements]

In [18]:
# Scraping is finished: end the whole WebDriver session. quit() closes every
# window and shuts down the chromedriver process, whereas close() only closes
# the current window and can leave the driver process running.
driver.quit()

In [19]:
# Pair each scraped code with the URL of its detail page.
cip_urls = pd.DataFrame(dict(CIPCode=codes, url=urls))

In [20]:
# Inner-join the CIP table with the scraped URLs. No key is given, so pandas
# joins on the columns the two frames have in common.
cip_merged = pd.merge(cip, cip_urls, how="inner")

In [21]:
# Keep only rows whose code is longer than two characters (drops the 2-digit
# family codes). .copy() makes this an independent frame, so the column
# edits in the following cells act on their own data rather than on a
# filtered slice of cip_merged.
cip_merged_4_6 = cip_merged[cip_merged['CIPCode'].str.len() > 2].copy()

In [22]:
# Definitions containing "Instructional content" only point at another code,
# so replace them with an empty string before the definitions are concatenated.
cip_merged_4_6.loc[
    cip_merged_4_6["CIPDefinition"].str.contains("Instructional content"),
    "CIPDefinition",
] = ""

In [23]:
# Group rows by the first five characters of the code and give every row the
# space-joined definitions of all rows in its group.
cip_merged_4_6['CIPDefinition_concat'] = (
    cip_merged_4_6
    .groupby(cip_merged_4_6['CIPCode'].str[:5])['CIPDefinition']
    .transform(' '.join)
)

In [24]:
# Keep only rows with a 5-character code, drop the per-row definition (the
# concatenated one stays), and renumber the rows from 0.
cip_merged_4 = (
    cip_merged_4_6.loc[cip_merged_4_6['CIPCode'].str.len().eq(5)]
    .drop('CIPDefinition', axis=1)
    .reset_index(drop=True)
)

In [None]:
# Summarize each concatenated definition with the OpenAI chat API, collecting
# one summary string per row of cip_merged_4 (in row order).
#
# The loop variable is deliberately not named `cip`: that name is bound to the
# DataFrame loaded from CIPCode2010.csv, and rebinding it to a string would
# break any later re-run of the merge cell.
#
# NOTE: openai.ChatCompletion is the pre-1.0 openai-python interface; this
# cell needs openai<1.0 (or a port to the v1 client) to run.
summaries = []
for definition in tqdm(cip_merged_4['CIPDefinition_concat']):
    # Only the first 4000 characters are sent -- presumably to keep the
    # prompt within the model's context limit.
    prompt = f"Summarize this program description in fewer than 30 words: {definition[:4000]}"
    response = openai.ChatCompletion.create(
        model=model_id,
        messages=[{"role": "system", "content": prompt}],
    )
    summary = response.choices[0].message['content']
    summaries.append(summary)

In [56]:
# Attach the generated summaries as a new column; `summaries` was built by
# iterating over this frame's CIPDefinition_concat column, so it is in row order.
cip_merged_4['CIPDefinition_summary'] = summaries

In [61]:
# Write the code / URL / summary table to disk without the row index.
cip_merged_4.to_csv(
    path_or_buf="cip_url_summary.csv",
    index=False,
)

In [79]:
# Display the final table (pandas truncates the middle rows in the rendered output).
cip_merged_4

Unnamed: 0,index,CIPCode,CIPTitle,url,CIPDefinition_concat,CIPDefinition_summary
0,1,01.00,"Agriculture, General.",https://nces.ed.gov/ipeds/cipcode/cipdetail.as...,A program that focuses on the general princip...,This program provides knowledge of agricultura...
1,3,01.01,Agricultural Business and Management.,https://nces.ed.gov/ipeds/cipcode/cipdetail.as...,A general program that focuses on modern bus...,This program prepares individuals to manage va...
2,11,01.02,Agricultural Mechanization.,https://nces.ed.gov/ipeds/cipcode/cipdetail.as...,A program that generally prepares individuals...,"This program prepares individuals to sell, sel..."
3,16,01.03,Agricultural Production Operations.,https://nces.ed.gov/ipeds/cipcode/cipdetail.as...,A program that focuses on the general plannin...,"Programs in agriculture, livestock care, aquat..."
4,26,01.04,Agricultural and Food Products Processing.,https://nces.ed.gov/ipeds/cipcode/cipdetail.as...,A program that prepares individuals to receiv...,"This program prepares individuals to store, pr..."
...,...,...,...,...,...,...
383,1997,60.01,Dental Residency Programs.,https://nces.ed.gov/ipeds/cipcode/cipdetail.as...,A residency training program that prepares de...,Dental residency programs prepare dentists in ...
384,2008,60.03,Veterinary Residency Programs.,https://nces.ed.gov/ipeds/cipcode/cipdetail.as...,A residency training program that prepares ve...,Residency training programs in veterinary medi...
385,2030,60.04,Medical Residency Programs - General Certifica...,https://nces.ed.gov/ipeds/cipcode/cipdetail.as...,A residency training program that prepares ph...,"Numerous medical residency training programs, ..."
386,2066,60.05,Medical Residency Programs - Subspecialty Cert...,https://nces.ed.gov/ipeds/cipcode/cipdetail.as...,A residency training program that prepares ph...,Various residency training programs in special...
