In [None]:
import pandas as pd
from pathlib import Path
import re
# This notebook builds a new CSV that merges the parsed resume data
# with the survey data containing the assigned internship labels.

# Survey columns referenced by later cells.
SURVEY_ID_COLUMN = "Response ID"
SURVEY_ANSWER_COLUMN = "What years have you participated in an internship? (Check all that apply) - Selected Choice"

# Input files (relative paths, resolved against the working directory).
PARSED = Path("parsed_resumes_all.csv")
SURVEY = Path("Merged_Survey_Data_Aman.xlsx")

# Load both inputs into DataFrames.
survey_df = pd.read_excel(SURVEY)
parsed_df = pd.read_csv(PARSED)


In [38]:
#extract response ids from resume filenames
def extract_response_id(filename):
    """Return the response ID that a resume filename starts with.

    The value is converted with ``str()`` first, so non-string inputs are
    accepted. An ID is ``R_`` followed by one or more ASCII letters or
    digits, and it must sit at the very beginning of the name.

    Returns the ID string, or ``None`` when the name does not start with one.
    """
    found = re.search(r"^(R_[A-Za-z0-9]+)", str(filename))
    if found is None:
        return None
    return found.group(1)

# Store the extracted ID under the same column name the survey uses, so the
# two frames can later be joined on it. Filenames that do not start with an
# ID get None in this column.
parsed_df[SURVEY_ID_COLUMN] = parsed_df["filename"].apply(extract_response_id)

In [39]:
#label survey answers as yes or no internship
def convert_internship_response(text):
    """Map a survey internship answer to a binary label.

    Parameters
    ----------
    text : object
        Raw cell value from the survey answer column.

    Returns
    -------
    int or None
        0 when the whole answer (case-insensitive, surrounding whitespace
        ignored) is exactly "I have not interned yet".
        1 for any other non-empty answer, including combined selections such
        as "I have not interned yet,I am interning this summer ...".
        None when the answer is missing (None/NaN) or blank, so unanswered
        rows are not silently counted as internships.
    """
    # Missing cells must be caught before str(): str(nan) is the text "nan",
    # which would otherwise fall through to the positive label.
    if pd.isna(text):
        return None

    answer = str(text).lower().strip()
    if not answer:
        return None

    return 0 if answer == "i have not interned yet" else 1

# Derive the internship label from each survey answer.
internship_labels = survey_df[SURVEY_ANSWER_COLUMN].map(convert_internship_response)
survey_df["got_internship"] = internship_labels

# Preview the first 15 rows to sanity-check the labelling.
preview_columns = [SURVEY_ID_COLUMN, SURVEY_ANSWER_COLUMN, "got_internship"]
survey_df[preview_columns].head(15)

Unnamed: 0,Response ID,What years have you participated in an internship? (Check all that apply) - Selected Choice,got_internship
0,R_vDnloj7X9FlZ5dv,I have not interned yet,0
1,R_1Cxr8EqR6R5Nwy8,I have not interned yet,0
2,R_1JR3WstNWsiLNzz,"I have not interned yet,I am interning this su...",1
3,R_XmQCFHTUYKEOkN3,I have not interned yet,0
4,R_3k4CtlRwArrK8cK,I have not interned yet,0
5,R_pc6A2m7aKYFOhBD,I have not interned yet,0
6,R_3OeURbnOEVcERJk,I am interning this summer for the first time,1
7,R_O2vYdFrkC9gSAO5,Freshman,1
8,R_3ssvroQgoWxeLzR,I have not interned yet,0
9,R_1jwvmeHUnrs6a3P,Freshman,1


In [40]:
#keep only the columns we need from Aman's survey
# Keep only survey rows that have both a response ID and a label.
# pandas matches null merge keys against each other, so a blank ID on both
# sides would otherwise pair an unidentifiable resume with an unrelated
# survey row.
survey_keep = (
    survey_df[[SURVEY_ID_COLUMN, "got_internship"]]
    .dropna()
    .drop_duplicates(SURVEY_ID_COLUMN)  # keeps the first row per response ID
    .astype({"got_internship": int})
)

# Resumes whose filename yielded no response ID cannot be joined; dropping
# them also keeps repeated null keys from tripping the one_to_one validation.
parsed_with_id = parsed_df.dropna(subset=[SURVEY_ID_COLUMN])

merged_df = parsed_with_id.merge(
    survey_keep, on=SURVEY_ID_COLUMN, how="inner", validate="one_to_one"
)
print("parsed resumes: ", len(parsed_df))
print("survey rows(unique ids): ", len(survey_keep))
print("merged usable rows: ", len(merged_df))

parsed resumes:  213
survey rows(unique ids):  741
merged usable rows:  162


In [41]:
# Combine the free-text resume fields into a single "text" column for model training.
TEXT_COLS = [
    c
    for c in ["summary", "skills", "experience", "projects", "education", "research", "relevant_classwork"]
    if c in merged_df.columns
]
# Without at least one text column there is nothing to join, so fail loudly
# rather than save a dataset with no usable text.
if not TEXT_COLS:
    raise ValueError("none of the expected resume text columns are present in merged_df")

# Blank out missing sections and force every cell to str: " ".join raises
# TypeError on non-string values (e.g. a section parsed as a number).
merged_df[TEXT_COLS] = merged_df[TEXT_COLS].fillna("").astype(str)
merged_df["text"] = merged_df[TEXT_COLS].agg(" ".join, axis=1)

# Save the labelled dataset as a new CSV for training.
output_path = "resumes_with_labels.csv"
merged_df.to_csv(output_path, index=False)