<a href="https://colab.research.google.com/github/iyoob-utexas/ds4e/blob/main/notebooks/generate_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Resident Match - Generate Metrics
This code generates the metrics for Resident Matching.

## Initialize

### Install Libraries

In [None]:
!pip install PyMuPDF
!pip install openai
!pip install pandas
!pip install colab-env -qU

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for colab-env (setup.py) ... [?25l[?25hdone


In [None]:
import fitz  # PyMuPDF for PDF processing
import pandas as pd
import re
import os
import openai

### Load Credentials

In [None]:
import colab_env

# Take the OpenAI key from the OPENAI_API_KEY environment variable;
# os.getenv yields None when the variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

Mounted at /content/gdrive


### Helper Functions

In [None]:
# Function to load openai api credentials if stored in json (NON-COLAB)
def load_credentials(path="config/credentials.json"):
    """Read the OpenAI API settings from a JSON file.

    Args:
        path: Location of the credentials file. Defaults to
            "config/credentials.json", the path this notebook has always used.

    Returns:
        The parsed JSON document. The cell that calls this function reads the
        keys "api_type", "api_base", "api_version" and "api_key" from it.

    Raises:
        FileNotFoundError: if `path` does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Imported here because the notebook's import cell does not import json,
    # which made the original json.load call fail with a NameError.
    import json

    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# Apply file-based credentials only when the file is present. Without this
# guard the cell raises whenever config/credentials.json is missing (e.g. in
# Colab, where the key is read from the environment instead), which stops a
# "Run all" before any metric is produced.
if os.path.isfile("config/credentials.json"):
    credentials = load_credentials()
    openai.api_type = credentials["api_type"]
    openai.api_base = credentials["api_base"]
    openai.api_version = credentials["api_version"]
    openai.api_key = credentials["api_key"]

In [None]:
# Function to send a prompt to ChatGPT
def ChatAPI(prompt):
    """Send one user prompt to the chat completions endpoint and return the reply text.

    The request goes through the module-level `openai.chat.completions` API,
    so it uses whatever configuration was set on the `openai` module earlier
    in the notebook (e.g. `openai.api_key`). The unused `openai.OpenAI()`
    client the function used to build has been removed: it played no part in
    the request.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The `content` of the first choice in the response.
    """
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an AI assistant that helps people find information."},
            {"role": "user", "content": prompt},
        ],
        # temperature=0 keeps the answers as repeatable as the API allows.
        temperature=0,
        max_tokens=1000,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return response.choices[0].message.content

## Load Data

In [None]:
# Reference year: employment_metrics uses it as the end year for jobs listed as ongoing ("present").
current_year = 2024

In [None]:
# File paths (locations under the mounted /content/gdrive folder).
# Each value ends with '/' because file names are appended to it by plain string concatenation.
input_path = '/content/gdrive/MyDrive/Colab Notebooks/dellmc/data/'
ref_path = '/content/gdrive/MyDrive/Colab Notebooks/dellmc/ref/'
output_path = '/content/gdrive/MyDrive/Colab Notebooks/dellmc/out/'

# Relative-path alternatives, kept commented out (presumably for runs outside Colab — TODO confirm):
#input_path = 'raw/'
#ref_path = 'ref/'
#output_path = 'output/'

In [None]:
# Collect the input file names, leaving out the '.DS_Store' entry
files = [name for name in os.listdir(input_path) if name != '.DS_Store']

# Pair each name with a sort key rebuilt from fixed character positions
# (chars 6-9, then 0-1, then 3-4, joined with '-') and order by that key.
# NOTE(review): this presumably turns an MM?DD?YYYY file-name prefix into YYYY-MM-DD — confirm.
files_sort = sorted(
    ([name, name[6:10] + '-' + name[0:2] + '-' + name[3:5]] for name in files),
    key=lambda pair: pair[1],
)
files = [pair[0] for pair in files_sort]

In [None]:
# Load the reference table of states and their regions
state_regions = pd.read_csv(ref_path + 'States_and_Regions_Categorization.csv')

### Extract Text

In [None]:
# Extract text from the PDF
def extract_text_from_pdf(doc):
    """Return the text of every page of `doc`, concatenated in page order.

    `doc` must support len() and load_page(index), and each loaded page must
    provide get_text().
    """
    return "".join(doc.load_page(index).get_text() for index in range(len(doc)))

In [None]:
%%time
# Read every input PDF once: keep its text keyed by candidate id and remember
# which file each candidate came from.
pdf_text_dict = {}
candidate_records = []
for file in files:
    # The candidate id is the third underscore-separated token from the end of the file name.
    candidate_id = file.split('_')[-3]
    pdf_path = input_path + file
    # The context manager closes the document once its text has been read;
    # previously every opened document was left open.
    with fitz.open(pdf_path) as doc:
        pdf_text = extract_text_from_pdf(doc)

    # Adding candidate PDF text to pdf_text_dict
    pdf_text_dict[candidate_id] = pdf_text
    candidate_records.append({'candidate_id': candidate_id, 'candidate_doc': file})

# Build the frame in one step instead of appending one row per loop iteration.
candidates_df = pd.DataFrame(candidate_records, columns=['candidate_id', 'candidate_doc'])

CPU times: user 968 ms, sys: 67 ms, total: 1.03 s
Wall time: 8.47 s


## Generate Metrics

In [None]:
# Function to extract state from an address
def address_state_region(address):
    """Return the region of the state mentioned in a free-text address.

    Every value of state_regions['State'] is looked up as a case-sensitive
    substring of `address`. When several states are found, the one whose
    occurrence ends last in the string is taken to be the actual state
    (earlier hits are typically street or city names).

    Args:
        address: Address text, or None / empty string.

    Returns:
        The 'Region' value of the matched state, 'NE' for an address that
        mentions the District of Columbia, or None when the address is empty
        or no state name is found.
    """
    if not address:
        return None

    # TODO: add District of Columbia to the region file; until then it is mapped here.
    if 'district of columbia' in address.lower():
        return 'NE'

    state_matches = [
        state_name
        for state_name in state_regions['State'].unique()
        if state_name in address
    ]
    if not state_matches:
        return None

    # Rank by where the last occurrence ENDS, breaking ties by length. Ranking
    # by start position (as before) made "West Virginia" lose to the
    # "Virginia" it contains, because the shorter name starts later.
    best_state = max(
        state_matches,
        key=lambda name: (address.rfind(name) + len(name), len(name)),
    )
    return state_regions.loc[state_regions['State'] == best_state, 'Region'].iloc[0]

### Candidate Metadata

In [None]:
def candidate_metadata(candidate_id):
    """Pull the contact fields of one candidate out of the PDF text.

    Only the text before the EXAMINATIONS header is searched. Returns a
    pd.Series with, in order: name, email, phone, address, and the region of
    that address. A field that cannot be found is None; all five are None
    when the EXAMINATIONS header itself is missing.
    """
    def first_capture(pattern, text):
        # First captured group of `pattern` in `text`, or None when there is no match.
        found = re.search(pattern, text)
        return found.group(1) if found else None

    name = email = phone = None
    candidate_address = candidate_address_region = None

    # Including text until EXAMINATIONS section for the basic info section in case text is out of place
    header_match = re.search(r'(.*?)\nEXAMINATIONS', pdf_text_dict[candidate_id], re.DOTALL)
    if header_match:
        basic_info_text = header_match.group(1)

        # Each field sits between two fixed labels of the basic info section
        name = first_capture(r'\n(.*?)\s*ID:', basic_info_text)
        email = first_capture(r'Email:\n(.*?)\s*\nCell Phone', basic_info_text)
        phone = first_capture(r'Cell Phone:\n(.*?)\s*\nPronouns', basic_info_text)

        # The region is only looked up when an address was actually found
        candidate_address = first_capture(r'Address:\n(.*?)\s*\nPermanent Address', basic_info_text)
        if candidate_address is not None:
            candidate_address_region = address_state_region(candidate_address)

    return pd.Series([name, email, phone, candidate_address, candidate_address_region])

In [None]:
# Add the parsed metadata of every candidate, then put the columns in export order
metadata_columns = ['candidate_name', 'candidate_email', 'candidate_phone', 'candidate_address', 'candidate_address_region']
candidates_df[metadata_columns] = candidates_df.apply(lambda row: candidate_metadata(row['candidate_id']), axis=1)
candidates_df = candidates_df[['candidate_id', *metadata_columns, 'candidate_doc']]

In [None]:
# Write the candidate metadata table without the index column
candidates_df.to_csv(output_path + 'candidate.csv', index=False)

In [None]:
# Start over with a frame that holds only the candidate ids (one row per entry of pdf_text_dict)
candidates_df = pd.DataFrame({'candidate_id': list(pdf_text_dict.keys())})

### Education

In [None]:
def education_metrics(candidate_id):
    """Extract education metrics for one candidate from the PDF text.

    Only the text between the EDUCATION and EXAMINATIONS headers is searched.

    Returns:
        pd.Series with, in order:
        - undergrad GPA (float, the last "GPA:" value in the section) or None
        - AOA flag (1 when the AOA line contains "I have been selected" or "AOA", else 0)
        - medical school name or None
        - medical school address or None
        - region of that address (via address_state_region) or None
        When the section is missing the values are None, 0, None, None, None.
    """
    pdf_text = pdf_text_dict[candidate_id]
    education_match = re.search(r'EDUCATION\n(.*?)\nEXAMINATIONS', pdf_text, re.DOTALL)

    undergrad_gpa = None
    aoa = 0
    medical_school_name = None
    medical_school_address = None
    medical_school_region = None

    if education_match:
        education_text = education_match.group(1)

        # Take the last GPA found in the education text, as this will be the undergrad GPA - confirm this logic
        gpa_values = re.findall(r'GPA:\s*(\d+\.\d+)', education_text)
        if gpa_values:
            undergrad_gpa = float(gpa_values[-1])

        # The AOA answer runs to the end of its line. The line may also be the
        # last one of the section, in which case no newline follows it; the
        # previous pattern required the newline and so missed that case.
        aoa_line_match = re.search(r'AOA\s?:\s?(.*?)(?:\n|$)', education_text)
        if aoa_line_match and re.search(r'\b(I have been selected|AOA)\b', aoa_line_match.group(1)):
            aoa = 1

        name_match = re.search(r'Medical School\n(.*?)\nAddress', education_text, re.DOTALL)
        if name_match:
            medical_school_name = name_match.group(1)

        address_match = re.search(r'Medical School.*?Address:\s?(.*?)\nAAMC', education_text, re.DOTALL)
        if address_match:
            medical_school_address = address_match.group(1)
            medical_school_region = address_state_region(medical_school_address)

    return pd.Series([undergrad_gpa, aoa, medical_school_name, medical_school_address, medical_school_region])

In [None]:
# Education Output: one row per candidate, written to education.csv
education_columns = ['education_undergrad_gpa', 'education_aoa', 'education_medschool', 'education_medical_school_address', 'education_medical_school_region']
education_df = candidates_df.copy()
education_df[education_columns] = education_df.apply(lambda row: education_metrics(row['candidate_id']), axis=1)
education_df.to_csv(output_path + 'education.csv', index=False)

### Examinations

In [None]:
# Extracting Examinations Information
# Run this block to extract the examination-related information
def examinations_metrics(candidate_id):
    """Return the candidate's USMLE Step 2 CK score as an int, or None when it is not found.

    The score is the three digits at the start of the second line after a
    line ending in "Step 2 CK" in the candidate's PDF text.
    """
    score_match = re.search(r'Step 2 CK\n.*\n(\d{3})', pdf_text_dict[candidate_id])
    if score_match is None:
        return None
    return int(score_match.group(1))

In [None]:
# Examinations Output: one row per candidate, written to examinations.csv
examinations_df = candidates_df.copy()
step2_scores = examinations_df.apply(lambda row: examinations_metrics(row['candidate_id']), axis=1)
examinations_df['examinations_step2_score'] = step2_scores
examinations_df.to_csv(output_path + 'examinations.csv', index=False)

### Employment

In [None]:
# Extracting Employment Information
# Run this block to extract the employment-related information
def employment_metrics(candidate_id):
    """Extract employment metrics for one candidate from the PDF text.

    Only the text between the EMPLOYMENT and PUBLICATIONS headers is searched.

    Returns:
        pd.Series with, in order:
        - years of experience: latest year minus earliest year over all dated
          jobs (1 when both are equal, 0 when no dates are found); the latest
          year is `current_year` when any job is listed as ongoing ("present")
        - number of jobs: count of "YYYY - ... YYYY" ranges plus count of
          "YYYY - ... present" entries
        - service-industry flag: the integer answer returned by ChatAPI for
          the job descriptions, or 0 when there are no descriptions or the
          answer is not an integer
        All three are 0 when the EMPLOYMENT section is missing.
    """
    pdf_text = pdf_text_dict[candidate_id]
    employment_match = re.search(r'EMPLOYMENT\n(.*?)\nPUBLICATIONS', pdf_text, re.DOTALL)
    if not employment_match:
        return pd.Series([0, 0, 0])

    employment_text = employment_match.group(1)

    # Finished jobs: start and end year on the same line.
    date_ranges = re.findall(r'(\d{4}) - .*?(\d{4})', employment_text)
    # Check for current jobs (present); the start year is captured so that it
    # can take part in the experience calculation.
    present_start_years = re.findall(r'(\d{4}) - .*?present', employment_text)

    known_years = [int(year) for date_range in date_ranges for year in date_range]
    known_years += [int(year) for year in present_start_years]
    if known_years:
        first_year_experience = min(known_years)
        # If candidate has a current job, set latest year of experience to current year
        latest_year_experience = current_year if present_start_years else max(known_years)
        if latest_year_experience == first_year_experience:
            years_experience = 1
        else:
            years_experience = latest_year_experience - first_year_experience
    else:
        # Previously a candidate with only ongoing jobs also ended up here,
        # because the start years of "present" entries were ignored.
        years_experience = 0

    number_of_jobs = len(date_ranges) + len(present_start_years)

    # A job description is the line directly above an "Employer name" line.
    # (?:^|\n) also accepts a description on the very first line of the section.
    job_descriptions = re.findall(r'(?:^|\n)(.*?)\nEmployer name', employment_text)

    service_experience = 0
    if job_descriptions:  # nothing to classify otherwise, so no API call is made
        service_experience_gpt_call = 'From the following list of job descriptions, are any of the jobs service industry jobs, for example server or cashier? Return either 1 for yes or 0 for No.'
        reply = ChatAPI(service_experience_gpt_call + str(job_descriptions))
        try:
            service_experience = int(reply)
        except (TypeError, ValueError):
            # The model answered with something other than a bare integer.
            service_experience = 0

    return pd.Series([years_experience, number_of_jobs, service_experience])

In [None]:
# Employment Output: one row per candidate, written to employment.csv
employment_columns = ['employment_years_experience', 'employment_number_of_jobs', 'employment_service_experience']
employment_df = candidates_df.copy()
employment_df[employment_columns] = employment_df.apply(lambda row: employment_metrics(row['candidate_id']), axis=1)
employment_df.to_csv(output_path + 'employment.csv', index=False)

### Publications

In [None]:
# Extracting Publications Information
# Run this block to extract the publications information
def publications_metrics(candidate_id):
    """Count a candidate's published peer-reviewed articles and first/senior authorships.

    Returns:
        pd.Series with, in order:
        - number of "Type: Peer-Reviewed Article (Published)" entries
        - number of those entries whose "First/Senior Author:" answer is "yes"
          (case-insensitive)
        Both are 0 when the searched section is not found.
    """
    pdf_text = pdf_text_dict[candidate_id]
    # Expanding text window to start at education section, because some of the papers appeared before the PUBLICATIONS header in the pdf text
    publications_match = re.search(r'EDUCATION\n(.*?)\nHONORS AND INTERESTS', pdf_text, re.DOTALL)
    if not publications_match:
        return pd.Series([0, 0])

    publications_text = publications_match.group(1)

    peer_reviewed_publications = len(
        re.findall(r'Type:\s?Peer-Reviewed Article \(Published\)', publications_text)
    )

    # Only finding number of first authors for peer reviewed published articles.
    # The answer may be the very last token of the section, so end-of-text is
    # accepted in place of the trailing newline.
    first_author_pattern = r'Type:\s?Peer-Reviewed Article \(Published\).*?First/Senior Author:\s?([a-zA-Z]+)(?:\n|$)'
    first_author_answers = re.findall(first_author_pattern, publications_text, flags=re.DOTALL)
    first_authors = sum(1 for answer in first_author_answers if answer.lower() == 'yes')

    return pd.Series([peer_reviewed_publications, first_authors])

In [None]:
# Publications Output: one row per candidate, written to publications.csv
publications_columns = ['publications_peer_reviewed', 'publications_first_authors']
publications_df = candidates_df.copy()
publications_df[publications_columns] = publications_df.apply(lambda row: publications_metrics(row['candidate_id']), axis=1)
publications_df.to_csv(output_path + 'publications.csv', index=False)

### Honors and Interests

In [None]:
# Extracting Honors and Interests Information
# Run this block to extract the honors and interests information
def honors_interests_metrics(candidate_id):
    """Flag whether a candidate's honors section mentions a grant or scholarship.

    Only the text between the HONORS AND INTERESTS and REQUIRED SUPPLEMENTAL
    FORM headers is searched, case-insensitively, for the whole words
    "grant(s)" or "scholarship(s)".

    Returns:
        pd.Series with a single value: 1 when such a word is present, else 0
        (also 0 when the section is missing).

    Note: the output is deliberately binary. Counting occurrences would
    probably count some awards twice, so only presence is reported. Hobbies
    are not extracted; the earlier unfinished hobbies parsing never fed into
    the result and has been removed.
    """
    pdf_text = pdf_text_dict[candidate_id]
    honors_interests_match = re.search(
        r'HONORS AND INTERESTS\n(.*?)\nREQUIRED SUPPLEMENTAL FORM', pdf_text, re.DOTALL
    )

    grants_scholarships = 0
    if honors_interests_match:
        honors_interests_text = honors_interests_match.group(1)
        # Keywords that indicate monetary value; the optional "s" also covers
        # the plural forms, which the previous whole-word pattern rejected.
        if re.search(r'\b(?:grant|scholarship)s?\b', honors_interests_text, flags=re.IGNORECASE):
            grants_scholarships = 1

    return pd.Series([grants_scholarships])

In [None]:
# Honors and Interests Output: one row per candidate, written to honors_interests.csv
honors_interests_columns = ['honors_interests_grants_scholarships']
honors_interests_df = candidates_df.copy()
honors_interests_df[honors_interests_columns] = honors_interests_df.apply(lambda row: honors_interests_metrics(row['candidate_id']), axis=1)
honors_interests_df.to_csv(output_path + 'honors_interests.csv', index=False)