In [1]:
import numpy as np
import pandas as pd
import random
import math

In [2]:
'''
A list of the name of major cities, zip code, job title, college,
'''

cities = [
    "New York", "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia",
    "San Antonio", "San Diego", "Dallas", "San Jose", "Austin", "Jacksonville",
    "Fort Worth", "Columbus", "Charlotte", "San Francisco", "Providence", "Boston", "Miami"
]

zip_codes = [
    "02802", "02804", "02806", "02807", "02808", "02809", "02812", "02813", "02814", "02815",
    "02816", "02817", "02818", "02822", "02823", "02824", "02825", "02826", "02827", "02828",
    "02829", "02830", "02831", "02832", "02833", "02835", "02836", "02837", "02838", "02839",
    "02840", "02841", "02842", "02852", "02857", "02858", "02859", "02860", "02861", "02863",
    "02864", "02865", "02871", "02872", "02873", "02874", "02875", "02876", "02878", "02879",
    "02881", "02882", "02885", "02886", "02888", "02889", "02891", "02892", "02893", "02894",
    "02895", "02896", "02898", "02903", "02904", "02905", "02906", "02907", "02908", "02909",
    "02910", "02911", "02912", "02914", "02915", "02916", "02917", "02918", "02919", "02920",
    "02921"
]

job_titles = [
    "Software Engineer", "Data Scientist",
    "Lawyer", "Machine Learning Engineer",
    "Research Assistant", "Product Manager",
    "Marketing Coordinator", "Financial Analyst", "Human Resources Manager",
    "Sales Representative", "Graphic Designer", "Web Developer", "N/A"
]

rhode_island_colleges = [
    "Brown University",
    "University of Rhode Island",
    "Rhode Island School of Design",
    "Providence College",
    "Bryant University",
    "Roger Williams University",
    "Johnson & Wales University",
    "Rhode Island College",
    "Salve Regina University",
    "New England Institute of Technology"
]


dictionary = {
    "School Name" :   rhode_island_colleges,
    "Degree" :        ["Bachelor's", "Master's", "PhD"],
    "Location" :      cities,
    "Gender" :        ["F", "M", "N/A"],
    "Veteran status": [1, 0 , "N/A"], #1 for Yes, 0 for No, N/A for not provided
    "Work authorization" : [1, 0],
    "Disability" : [1, 0 , "N/A"],
    "Ethnicity" : ['White', 'Black', 'Native American', 'Asian American & Pacific Islander', 'Other'],
    "Role" : job_titles

}

In [3]:
def parse_date(date_str):
    '''
    Parse the date string formatted as mm/yy into a datetime object assuming the first of the month

    Input: mm/yy
    Output: mm-yy-01
    '''
    if date_str == "N/A":
        return None
    month, year = map(int, date_str.split('/'))
    return np.datetime64(f"20{year:02d}-{month:02d}-01")

# def random_date(start_year, end_year):
#     """
#     Generate a random month and year within the range, formatted as mm/yy.
#     """
#     year = np.random.choice(range(start_year, end_year))
#     month = np.random.choice(range(1, 13))
#     return f"{month:02d}/{year % 100:02d}"

def generate_period(start_year, end_year, prev_end_date=None):
    """ Generate a start and end date, ensuring the start date is after prev_end_date if provided. """
    if prev_end_date and prev_end_date != "N/A":
        prev_end_date = parse_date(prev_end_date)  # Convert previous end date to datetime if not None
        prev_year = prev_end_date.astype('datetime64[Y]').astype(int) + 1970
        prev_month = prev_end_date.astype('datetime64[M]').astype(int) % 12 + 1

        # Start the new period in the next month of the previous end date
        if prev_month == 12:
            start_year = prev_year + 1
            start_month = 1
        else:
            start_year = prev_year
            start_month = prev_month + 1
    else:
        # Choose a random start month and year if no previous end date is provided
        start_year = np.random.choice(range(start_year, end_year))
        start_month = np.random.choice(range(1, 13))

    start_date = f"{start_month}/{start_year % 100:02d}"
    end_month = np.random.choice(range(start_month, 13))
    end_year = start_year
    end_date = f"{end_month}/{end_year % 100:02d}"

    return start_date, end_date


In [4]:
import pandas as pd
import numpy as np

# Ensure dictionary and job_titles are properly set up
# Example:
# dictionary = {"School Name": ["School A", "School B"], "Degree": ["BS", "MS"], "Location": ["Location A", "Location B"], "Gender": ["Male", "Female"], "Veteran status": ["Yes", "No"], "Work authorization": ["Authorized", "Not Authorized"], "Disability": ["Yes", "No"], "Ethnicity": ["Ethnicity A", "Ethnicity B"]}
# job_titles = ["Engineer", "Developer", "Manager"]

num_entries = 500
# last_applicant_id = existing_data['Applicant ID'].max()

# Generate synthetic data
new_data = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': np.random.choice(dictionary["School Name"], num_entries),
    'GPA': np.clip(np.round(np.random.normal(3.1, 0.4, num_entries), 2), 0.0, 4.0),
    'Degree': np.random.choice(dictionary["Degree"], num_entries),
    'Location': np.random.choice(dictionary["Location"], num_entries),
    'Gender': np.random.choice(dictionary["Gender"], num_entries),
    'Veteran status': np.random.choice(dictionary["Veteran status"], num_entries),
    'Work authorization': np.random.choice(dictionary["Work authorization"], num_entries),
    'Disability': np.random.choice(dictionary["Disability"], num_entries),
    'Ethnicity': np.random.choice(dictionary["Ethnicity"], num_entries),
})


# Assign roles and dates ensuring dependencies

for index in new_data.index:
    prev_end_date = None
    for i in range(1, 4):
        role_column = f'Role {i}'
        start_column = f'Start {i}'
        end_column = f'End {i}'

        # Check if previous role exists before setting current role
        if i == 1 or str(new_data.at[index, f'Role {i-1}']) != "N/A":
            new_data.at[index, role_column] = np.random.choice(job_titles + ["N/A"]*i)
            if str(new_data.at[index, role_column]) != "N/A":
              start_date, end_date = generate_period(2015, 2023, prev_end_date)
            else :
              start_date, end_date = "N/A", "N/A"
            new_data.at[index, start_column] = start_date
            new_data.at[index, end_column] = end_date
            prev_end_date = end_date  # Update previous end date to the current end date
        else:
            new_data.at[index, role_column] = "N/A"
            new_data.at[index, start_column] = "N/A"
            new_data.at[index, end_column] = "N/A"

# Combine and save data
generated_data = new_data
generated_data.to_csv('generated_data.csv', index=False)


In [5]:
d = pd.read_csv('generated_data.csv')
d

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3
0,1,Rhode Island College,2.68,Master's,New York,F,,1,,White,Lawyer,8/22,9/22,,,,,,
1,2,Bryant University,3.32,Master's,Phoenix,F,,0,0.0,Native American,Product Manager,9/17,12/17,Financial Analyst,1/18,10/18,,,
2,3,Providence College,2.96,Master's,San Francisco,,0.0,0,1.0,White,Product Manager,6/17,11/17,Software Engineer,12/17,12/17,,,
3,4,New England Institute of Technology,3.29,Master's,Fort Worth,,1.0,1,,Black,Sales Representative,10/18,10/18,Data Scientist,11/18,12/18,Graphic Designer,1/19,10/19
4,5,Johnson & Wales University,3.34,Bachelor's,Chicago,F,1.0,1,1.0,White,Web Developer,7/21,12/21,Web Developer,1/22,2/22,Lawyer,3/22,5/22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,Providence College,3.06,PhD,Boston,F,0.0,1,1.0,White,Research Assistant,2/20,12/20,Financial Analyst,1/21,2/21,Web Developer,3/21,3/21
496,497,New England Institute of Technology,3.48,PhD,Houston,F,,0,,Asian American & Pacific Islander,Product Manager,12/22,12/22,Marketing Coordinator,1/23,11/23,Financial Analyst,12/23,12/23
497,498,Rhode Island School of Design,3.26,Master's,San Diego,,0.0,0,0.0,Asian American & Pacific Islander,Financial Analyst,11/18,12/18,Web Developer,1/19,12/19,,,
498,499,Rhode Island College,3.55,Master's,Los Angeles,,1.0,0,1.0,Asian American & Pacific Islander,,,,,,,,,


In [None]:
import requests
import pandas as pd

def upload_csv_to_api(file_path, api_url):
    try:
        # Load CSV file
        data = pd.read_csv(file_path)
        # Convert DataFrame to JSON
        json_data = data.to_json(orient='records')

        # Send POST request to the API
        headers = {'Content-Type': 'application/json'}
        response = requests.post(api_url, data=json_data, headers=headers)
        print('response', response)

        # Check if the request was successful
        if response.status_code == 200:
            print("File uploaded successfully and processed.")
            return response.json()  # Assuming the API returns JSON with error details or results
        else:
            print("Failed to process file. Status code:", response.status_code)
            print("Response:", response.text)
    except Exception as e:
        print("An error occurred:", e)

def analyze_errors(api_response):
    # This function needs to be customized based on how the API response looks.
    # Assuming the API returns a list of errors with row indices or similar identifiers.
    print("Analyzing errors...")
    if 'errors' in api_response:
        errors = api_response['errors']
        for error in errors:
            print(f"Error in row {error['row']}: {error['message']}")
    else:
        print("No errors found, or API response format is unexpected.")


In [None]:
import json

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'generated_data.csv'
response = upload_csv_to_api(file_path, api_url)
if response:
    analyze_errors(response)

predict = json.loads(response['prediction'])
generated_data['score'] = pd.DataFrame(predict)['score']
generated_data.head()

File uploaded successfully and processed.
Analyzing errors...
No errors found, or API response format is unexpected.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3,score
0,1,Bryant University,3.44,Master's,Charlotte,M,1.0,1,,Native American,Product Manager,12/16,12/16,Web Developer,1/17,4/17,Human Resources Manager,5/17,9/17,4.36
1,2,Salve Regina University,3.48,PhD,Fort Worth,M,1.0,0,0.0,Black,Data Scientist,4/18,8/18,,,,,,,0.28
2,3,Roger Williams University,3.57,PhD,San Jose,,,0,0.0,White,Sales Representative,5/17,7/17,Web Developer,8/17,10/17,,,,2.91
3,4,Roger Williams University,2.42,Bachelor's,San Jose,M,0.0,0,1.0,Black,Marketing Coordinator,5/16,7/16,Data Scientist,8/16,10/16,Software Engineer,11/16,12/16,1.55
4,5,Rhode Island College,3.51,PhD,New York,M,,1,0.0,White,,,,,,,,,,8.35


In [None]:
d.to_csv('generated_data_with_score.csv', index=False)

# Test the resume score generator by changing only one feature at a time

Test 1: All five applicants, each from a different race, are new graduate Software Engineers who began working in August 2023. They all graduated from Brown University with a 3.5 GPA, live in Providence, are male, do not have disabilities, possess work authorization, and are not veterans.

In [None]:
# Generate synthetic data
num_entries = 5
test_data1 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Brown University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ["Bachelor's"]*num_entries,
    'Location': ['Providence']*num_entries,
    'Gender': ['M']*num_entries,
    'Veteran status': [0]*num_entries,
    'Work authorization': [1]*num_entries,
    'Disability': [0]*num_entries,
    'Ethnicity': ['White', 'Black', 'Native American', 'Asian American & Pacific Islander', 'Other'],
    'Role 1': ['Software Engineer']*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['12/23']*num_entries,
    'Role 2': ['Data Scientist']*num_entries,
    'Start 2': ['1/24']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data1.to_csv('test_data1.csv', index=False)
test_data1

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3
0,1,Brown University,3.5,Bachelor's,Providence,M,0,1,0,White,Software Engineer,8/23,12/23,Data Scientist,1/24,,,,
1,2,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Black,Software Engineer,8/23,12/23,Data Scientist,1/24,,,,
2,3,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Native American,Software Engineer,8/23,12/23,Data Scientist,1/24,,,,
3,4,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Asian American & Pacific Islander,Software Engineer,8/23,12/23,Data Scientist,1/24,,,,
4,5,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Other,Software Engineer,8/23,12/23,Data Scientist,1/24,,,,


In [None]:
# Canddiate scorer

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data1.csv'
response = upload_csv_to_api(file_path, api_url)
if response:
    analyze_errors(response)

predict = json.loads(response['prediction'])
test_data1['Resume score'] = pd.DataFrame(predict)['score']

test_data1.to_csv("result.csv", index=False)

response <Response [200]>
File uploaded successfully and processed.
Analyzing errors...
No errors found, or API response format is unexpected.


In [None]:
test_data1

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3,Resume score
0,1,Brown University,3.5,Bachelor's,Providence,M,0,1,0,White,Software Engineer,8/23,12/23,Data Scientist,1/24,,,,,3.8
1,2,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Black,Software Engineer,8/23,12/23,Data Scientist,1/24,,,,,1.76
2,3,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Native American,Software Engineer,8/23,12/23,Data Scientist,1/24,,,,,9.94
3,4,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Asian American & Pacific Islander,Software Engineer,8/23,12/23,Data Scientist,1/24,,,,,4.2
4,5,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Other,Software Engineer,8/23,12/23,Data Scientist,1/24,,,,,0.61


In [None]:
candidate_url = 'https://heonlee.pythonanywhere.com/'
file_path = "candidate1.csv"
response_cand = upload_csv_to_api(file_path, candidate_url)

response_cand

# predict = json.loads(response['prediction'])
# test_data1['Resume score'] = pd.DataFrame(predict)['score']

# test_data1.to_csv("candidate_data1.csv", index=False)

response <Response [500]>
Failed to process file. Status code: 500
Response: <!doctype html>
<html lang=en>
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>



First run

In [None]:
import json

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data1.csv'
# response = upload_csv_to_api(file_path, api_url)
# if response:
#     analyze_errors(response)

for i in range(1, 6):
  response = upload_csv_to_api(file_path, api_url)
  predict = json.loads(response['prediction'])
  test_data1[f'score {i}'] = pd.DataFrame(predict)['score']

test_data1

File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 2,End 2,Role 3,Start 3,End 3,score 1,score 2,score 3,score 4,score 5
0,1,Brown University,3.5,Bachelor's,Providence,M,0,1,0,White,...,1/24,,,,,5.83,5.06,6.04,6.99,5.65
1,2,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Black,...,1/24,,,,,1.09,1.85,2.65,9.15,4.23
2,3,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Native American,...,1/24,,,,,3.49,2.81,9.52,7.79,5.58
3,4,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Asian American & Pacific Islander,...,1/24,,,,,2.65,1.0,2.7,9.81,5.56
4,5,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Other,...,1/24,,,,,0.79,0.2,0.54,0.41,3.26


It clearly shows that the 2nd applicant who is in the Black ethnicity group scored very low compared to other applicants.
Other > Asian American & Pacific Islander > White > Native American > Black.

Test2 : Compare Black female and Black male applicant

In [None]:
# Generate synthetic data
num_entries = 3
test_data2 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Brown University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ["Bachelor's"]*num_entries,
    'Location': ['Providence']*num_entries,
    'Gender': ['M','F','N/A'],
    'Veteran status': [0]*num_entries,
    'Work authorization': [1]*num_entries,
    'Disability': [0]*num_entries,
    'Ethnicity': ['Black']*num_entries,
    'Role 1': ['Software Engineer']*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['N/A']*num_entries,
    'Role 2': ['N/A']*num_entries,
    'Start 2': ['N/A']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data2.to_csv('test_data2.csv', index=False)

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data2.csv'
# response = upload_csv_to_api(file_path, api_url)
# if response:
#     analyze_errors(response)

for i in range(1, 6):
  response = upload_csv_to_api(file_path, api_url)
  predict = json.loads(response['prediction'])
  test_data2[f'score {i}'] = pd.DataFrame(predict)['score']

test_data2


File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 2,End 2,Role 3,Start 3,End 3,score 1,score 2,score 3,score 4,score 5
0,1,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Black,...,,,,,,2.0,6.78,0.02,9.44,6.48
1,2,Brown University,3.5,Bachelor's,Providence,F,0,1,0,Black,...,,,,,,3.47,3.08,4.83,7.11,2.59
2,3,Brown University,3.5,Bachelor's,Providence,,0,1,0,Black,...,,,,,,1.17,3.61,0.39,3.42,6.89


In [None]:
# White ethnicity group - changing only gener
num_entries = 3
test_data2 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Brown University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ["Bachelor's"]*num_entries,
    'Location': ['Providence']*num_entries,
    'Gender': ['M','F','N/A'],
    'Veteran status': [0]*num_entries,
    'Work authorization': [1]*num_entries,
    'Disability': [0]*num_entries,
    'Ethnicity': ['White']*num_entries,
    'Role 1': ['Software Engineer']*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['N/A']*num_entries,
    'Role 2': ['N/A']*num_entries,
    'Start 2': ['N/A']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data2.to_csv('test_data2.csv', index=False)

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data2.csv'
# response = upload_csv_to_api(file_path, api_url)
# if response:
#     analyze_errors(response)

for i in range(1, 6):
  response = upload_csv_to_api(file_path, api_url)
  predict = json.loads(response['prediction'])
  test_data2[f'score {i}'] = pd.DataFrame(predict)['score']

test_data2


File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 2,End 2,Role 3,Start 3,End 3,score 1,score 2,score 3,score 4,score 5
0,1,Brown University,3.5,Bachelor's,Providence,M,0,1,0,White,...,,,,,,7.16,8.1,6.11,5.19,7.0
1,2,Brown University,3.5,Bachelor's,Providence,F,0,1,0,White,...,,,,,,4.46,6.28,8.31,9.95,5.44
2,3,Brown University,3.5,Bachelor's,Providence,,0,1,0,White,...,,,,,,4.96,7.53,2.65,4.4,3.15


In [None]:
# Native American ethnicity group - changing only gener
num_entries = 3
test_data2 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Brown University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ["Bachelor's"]*num_entries,
    'Location': ['Providence']*num_entries,
    'Gender': ['M','F','N/A'],
    'Veteran status': [0]*num_entries,
    'Work authorization': [1]*num_entries,
    'Disability': [0]*num_entries,
    'Ethnicity': ['Native American']*num_entries,
    'Role 1': ['Software Engineer']*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['N/A']*num_entries,
    'Role 2': ['N/A']*num_entries,
    'Start 2': ['N/A']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data2.to_csv('test_data2.csv', index=False)

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data2.csv'
# response = upload_csv_to_api(file_path, api_url)
# if response:
#     analyze_errors(response)

for i in range(1, 6):
  response = upload_csv_to_api(file_path, api_url)
  predict = json.loads(response['prediction'])
  test_data2[f'score {i}'] = pd.DataFrame(predict)['score']

test_data2


File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 2,End 2,Role 3,Start 3,End 3,score 1,score 2,score 3,score 4,score 5
0,1,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Native American,...,,,,,,3.56,6.84,5.38,3.17,5.8
1,2,Brown University,3.5,Bachelor's,Providence,F,0,1,0,Native American,...,,,,,,8.87,9.32,7.59,9.66,0.03
2,3,Brown University,3.5,Bachelor's,Providence,,0,1,0,Native American,...,,,,,,1.97,5.48,3.31,3.3,8.09


In [None]:
# Native American ethnicity group - changing only gener
num_entries = 3
test_data2 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Brown University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ["Bachelor's"]*num_entries,
    'Location': ['Providence']*num_entries,
    'Gender': ['M','F','N/A'],
    'Veteran status': [0]*num_entries,
    'Work authorization': [1]*num_entries,
    'Disability': [0]*num_entries,
    'Ethnicity': ['Asian American & Pacific Islander']*num_entries,
    'Role 1': ['Software Engineer']*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['N/A']*num_entries,
    'Role 2': ['N/A']*num_entries,
    'Start 2': ['N/A']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data2.to_csv('test_data2.csv', index=False)

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data2.csv'
# response = upload_csv_to_api(file_path, api_url)
# if response:
#     analyze_errors(response)

for i in range(1, 6):
  response = upload_csv_to_api(file_path, api_url)
  predict = json.loads(response['prediction'])
  test_data2[f'score {i}'] = pd.DataFrame(predict)['score']

test_data2


File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 2,End 2,Role 3,Start 3,End 3,score 1,score 2,score 3,score 4,score 5
0,1,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Asian American & Pacific Islander,...,,,,,,6.27,3.79,5.09,4.32,7.26
1,2,Brown University,3.5,Bachelor's,Providence,F,0,1,0,Asian American & Pacific Islander,...,,,,,,9.05,4.73,1.61,9.61,0.66
2,3,Brown University,3.5,Bachelor's,Providence,,0,1,0,Asian American & Pacific Islander,...,,,,,,5.94,7.91,8.01,4.32,5.05


In [None]:
# Pacific Islander/Asian American ethnicity group - changing only gener
num_entries = 3
test_data2 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Brown University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ["Bachelor's"]*num_entries,
    'Location': ['Providence']*num_entries,
    'Gender': ['M','F','N/A'],
    'Veteran status': [0]*num_entries,
    'Work authorization': [1]*num_entries,
    'Disability': [0]*num_entries,
    'Ethnicity': ['Other']*num_entries,
    'Role 1': ['Software Engineer']*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['N/A']*num_entries,
    'Role 2': ['N/A']*num_entries,
    'Start 2': ['N/A']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data2.to_csv('test_data2.csv', index=False)

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data2.csv'
# response = upload_csv_to_api(file_path, api_url)
# if response:
#     analyze_errors(response)

for i in range(1, 6):
  response = upload_csv_to_api(file_path, api_url)
  predict = json.loads(response['prediction'])
  test_data2[f'score {i}'] = pd.DataFrame(predict)['score']

test_data2


File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 2,End 2,Role 3,Start 3,End 3,score 1,score 2,score 3,score 4,score 5
0,1,Brown University,3.5,Bachelor's,Providence,M,0,1,0,Other,...,,,,,,2.41,2.64,6.63,6.78,8.92
1,2,Brown University,3.5,Bachelor's,Providence,F,0,1,0,Other,...,,,,,,6.47,5.31,7.81,2.46,2.93
2,3,Brown University,3.5,Bachelor's,Providence,,0,1,0,Other,...,,,,,,0.44,2.11,2.57,3.85,8.35


Test3 : Without Work authorization

In [None]:
# Generate synthetic data
num_entries = 2
test_data3 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Brown University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ['Bachelors']*num_entries,
    'Location': ['Providence']*num_entries,
    'Gender': ['M']*num_entries,
    'Veteran status': [0]*num_entries,
    'Work authorization': [0,1],
    'Disability': [0]*num_entries,
    'Ethnicity': ['Black']*num_entries,
    'Role 1': ['Software Engineer']*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['N/A']*num_entries,
    'Role 2': ['N/A']*num_entries,
    'Start 2': ['N/A']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data3.to_csv('test_data3.csv', index=False)

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data3.csv'

for i in range(1, 6):
  response = upload_csv_to_api(file_path, api_url)
  predict = json.loads(response['prediction'])
  test_data3[f'score {i}'] = pd.DataFrame(predict)['score']

test_data3


File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 2,End 2,Role 3,Start 3,End 3,score 1,score 2,score 3,score 4,score 5
0,1,Brown University,3.5,Bachelors,Providence,M,0,0,0,Black,...,,,,,,9.15,9.73,7.24,2.11,9.81
1,2,Brown University,3.5,Bachelors,Providence,M,0,1,0,Black,...,,,,,,7.21,8.01,5.06,1.76,9.82


Having work authorization improves resume score by 0.12

Test4: Different Degree

In [None]:
# Generate synthetic data
num_entries = 3
test_data4 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Brown University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ["Bachelor's", "Master's", "PhD"],
    'Location': ['Providence']*num_entries,
    'Gender': ['M']*num_entries,
    'Veteran status': [0]*num_entries,
    'Work authorization': [1]*num_entries,
    'Disability': [0]*num_entries,
    'Ethnicity': ['White']*num_entries,
    'Role 1': ['Software Engineer']*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['N/A']*num_entries,
    'Role 2': ['N/A']*num_entries,
    'Start 2': ['N/A']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data4.to_csv('test_data4.csv', index=False)

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data4.csv'

for i in range(1, 6):
  response = upload_csv_to_api(file_path, api_url)
  predict = json.loads(response['prediction'])
  test_data4[f'score {i}'] = pd.DataFrame(predict)['score']

test_data4


File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 2,End 2,Role 3,Start 3,End 3,score 1,score 2,score 3,score 4,score 5
0,1,Brown University,3.5,Bachelor's,Providence,M,0,1,0,White,...,,,,,,4.21,3.33,2.87,3.13,6.06
1,2,Brown University,3.5,Master's,Providence,M,0,1,0,White,...,,,,,,8.14,9.41,9.12,6.49,4.54
2,3,Brown University,3.5,PhD,Providence,M,0,1,0,White,...,,,,,,0.86,9.08,1.19,3.41,6.97


Financial Analyst

In [None]:
# Generate synthetic data
num_entries = 3
test_data5 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Central University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ["Bachelor's"]*num_entries,
    'Location': ['Providence']*num_entries,
    'Gender': ['M','F','N/A'],
    'Veteran status': [0]*num_entries,
    'Work authorization': [1]*num_entries,
    'Disability': [0]*num_entries,
    'Ethnicity': ['White']*num_entries,
    'Role 1': ["Financial Analyst"]*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['N/A']*num_entries,
    'Role 2': ['N/A']*num_entries,
    'Start 2': ['N/A']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data5.to_csv('test_data5.csv', index=False)

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data5.csv'
# response = upload_csv_to_api(file_path, api_url)
# if response:
#     analyze_errors(response)

for i in range(1, 6):
  response = upload_csv_to_api(file_path, api_url)
  predict = json.loads(response['prediction'])
  test_data5[f'score {i}'] = pd.DataFrame(predict)['score']

test_data5


File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.
File uploaded successfully and processed.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 2,End 2,Role 3,Start 3,End 3,score 1,score 2,score 3,score 4,score 5
0,1,Central University,3.5,Bachelor's,Providence,M,0,1,0,White,...,,,,,,3.26,7.58,2.11,2.22,2.84
1,2,Central University,3.5,Bachelor's,Providence,F,0,1,0,White,...,,,,,,7.07,5.76,2.61,4.81,1.58
2,3,Central University,3.5,Bachelor's,Providence,,0,1,0,White,...,,,,,,5.79,9.19,6.25,6.37,2.87


Candidate Scorer

In [None]:
# Generate synthetic data
num_entries = 3
test_data5 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Central University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ["Bachelor's"]*num_entries,
    'Location': ['Providence']*num_entries,
    'Gender': ['M','F','N/A'],
    'Veteran status': [0]*num_entries,
    'Work authorization': [1]*num_entries,
    'Disability': [0]*num_entries,
    'Ethnicity': ['White']*num_entries,
    'Role 1': ["Financial Analyst"]*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['N/A']*num_entries,
    'Role 2': ['N/A']*num_entries,
    'Start 2': ['N/A']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data5.to_csv('test_data5.csv', index=False)

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data5.csv'
# response = upload_csv_to_api(file_path, api_url)
# if response:
#     analyze_errors(response)

predict = json.loads(response['prediction'])
test_data5['Resume score'] = pd.DataFrame(predict)['score']

test_data5.to_csv("candidate_data5.csv", index=False)


In [None]:
test_data5

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3,Resume score
0,1,Central University,3.5,Bachelor's,Providence,M,0,1,0,White,Financial Analyst,8/23,,,,,,,,2.84
1,2,Central University,3.5,Bachelor's,Providence,F,0,1,0,White,Financial Analyst,8/23,,,,,,,,1.58
2,3,Central University,3.5,Bachelor's,Providence,,0,1,0,White,Financial Analyst,8/23,,,,,,,,2.87


In [None]:
test_data5['Resume score'] = [3,3,3]
test_data5

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3,Resume score
0,1,Central University,3.5,Bachelor's,Providence,M,0,1,0,White,Financial Analyst,8/23,,,,,,,,3
1,2,Central University,3.5,Bachelor's,Providence,F,0,1,0,White,Financial Analyst,8/23,,,,,,,,3
2,3,Central University,3.5,Bachelor's,Providence,,0,1,0,White,Financial Analyst,8/23,,,,,,,,3


In [None]:
test_data5.to_csv("tweaked_score.csv", index=False)

Candidate scorer- Male SWEs, different ethnicity group

In [None]:
# Generate synthetic data
num_entries = 10
test_data6 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Central University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ["Bachelor's", "Master's", "Bachelor's", "Master's", "Bachelor's", "Master's",
               "Bachelor's", "Master's", "Bachelor's", "Master's"],
    'Location': ['Providence']*num_entries,
    'Gender': ['M']*num_entries,
    'Veteran status': [0]*num_entries,
    'Work authorization': [1]*num_entries,
    'Disability': [0]*num_entries,
    'Ethnicity': ['White', 'White',"Black","Black","Native American", "Native American",
                  "Asian American & Pacific Islander", "Asian American & Pacific Islander",
                  'Other', 'Other'],
    'Role 1': ["Financial Analyst"]*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['N/A']*num_entries,
    'Role 2': ['N/A']*num_entries,
    'Start 2': ['N/A']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data6.to_csv('test_data6.csv', index=False)

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data6.csv'
response = upload_csv_to_api(file_path, api_url)
if response:
    analyze_errors(response)

predict = json.loads(response['prediction'])
test_data6['Resume score'] = pd.DataFrame(predict)['score']

test_data6.to_csv("candidate_data6.csv", index=False)
test_data6

response <Response [200]>
File uploaded successfully and processed.
Analyzing errors...
No errors found, or API response format is unexpected.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3,Resume score
0,1,Central University,3.5,Bachelor's,Providence,M,0,1,0,White,Financial Analyst,8/23,,,,,,,,9.02
1,2,Central University,3.5,Master's,Providence,M,0,1,0,White,Financial Analyst,8/23,,,,,,,,1.41
2,3,Central University,3.5,Bachelor's,Providence,M,0,1,0,Black,Financial Analyst,8/23,,,,,,,,6.01
3,4,Central University,3.5,Master's,Providence,M,0,1,0,Black,Financial Analyst,8/23,,,,,,,,3.28
4,5,Central University,3.5,Bachelor's,Providence,M,0,1,0,Native American,Financial Analyst,8/23,,,,,,,,7.63
5,6,Central University,3.5,Master's,Providence,M,0,1,0,Native American,Financial Analyst,8/23,,,,,,,,2.96
6,7,Central University,3.5,Bachelor's,Providence,M,0,1,0,Asian American & Pacific Islander,Financial Analyst,8/23,,,,,,,,8.16
7,8,Central University,3.5,Master's,Providence,M,0,1,0,Asian American & Pacific Islander,Financial Analyst,8/23,,,,,,,,1.59
8,9,Central University,3.5,Bachelor's,Providence,M,0,1,0,Other,Financial Analyst,8/23,,,,,,,,4.0
9,10,Central University,3.5,Master's,Providence,M,0,1,0,Other,Financial Analyst,8/23,,,,,,,,9.45


In [None]:
# Female

# Generate synthetic data
num_entries = 10
test_data7 = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': ["Central University"]*num_entries,
    'GPA': [3.5]*num_entries,
    'Degree': ["Bachelor's", "Master's", "Bachelor's", "Master's", "Bachelor's", "Master's",
               "Bachelor's", "Master's", "Bachelor's", "Master's"],
    'Location': ['Providence']*num_entries,
    'Gender': ['F']*num_entries,
    'Veteran status': [0]*num_entries,
    'Work authorization': [1]*num_entries,
    'Disability': [0]*num_entries,
    'Ethnicity': ['White', 'White',"Black","Black","Native American", "Native American",
                  "Asian American & Pacific Islander", "Asian American & Pacific Islander",
                  'Other', 'Other'],
    'Role 1': ["Financial Analyst"]*num_entries,
    'Start 1': ['8/23']*num_entries,
    'End 1': ['N/A']*num_entries,
    'Role 2': ['N/A']*num_entries,
    'Start 2': ['N/A']*num_entries,
    'End 2': ['N/A']*num_entries,
    'Role 3': ['N/A']*num_entries,
    'Start 3': ['N/A']*num_entries,
    'End 3': ['N/A']*num_entries
})


test_data7.to_csv('test_data7.csv', index=False)

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'test_data7.csv'
response = upload_csv_to_api(file_path, api_url)
if response:
    analyze_errors(response)

predict = json.loads(response['prediction'])
test_data7['Resume score'] = pd.DataFrame(predict)['score']

test_data7.to_csv("candidate_data7.csv", index=False)
test_data7

response <Response [200]>
File uploaded successfully and processed.
Analyzing errors...
No errors found, or API response format is unexpected.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3,Resume score
0,1,Central University,3.5,Bachelor's,Providence,F,0,1,0,White,Financial Analyst,8/23,,,,,,,,2.72
1,2,Central University,3.5,Master's,Providence,F,0,1,0,White,Financial Analyst,8/23,,,,,,,,9.49
2,3,Central University,3.5,Bachelor's,Providence,F,0,1,0,Black,Financial Analyst,8/23,,,,,,,,5.9
3,4,Central University,3.5,Master's,Providence,F,0,1,0,Black,Financial Analyst,8/23,,,,,,,,4.58
4,5,Central University,3.5,Bachelor's,Providence,F,0,1,0,Native American,Financial Analyst,8/23,,,,,,,,6.32
5,6,Central University,3.5,Master's,Providence,F,0,1,0,Native American,Financial Analyst,8/23,,,,,,,,8.79
6,7,Central University,3.5,Bachelor's,Providence,F,0,1,0,Asian American & Pacific Islander,Financial Analyst,8/23,,,,,,,,1.81
7,8,Central University,3.5,Master's,Providence,F,0,1,0,Asian American & Pacific Islander,Financial Analyst,8/23,,,,,,,,5.18
8,9,Central University,3.5,Bachelor's,Providence,F,0,1,0,Other,Financial Analyst,8/23,,,,,,,,0.99
9,10,Central University,3.5,Master's,Providence,F,0,1,0,Other,Financial Analyst,8/23,,,,,,,,9.86


WOrking with API

In [None]:
import requests

# Trying to access the documentation
doc_url = 'https://heonlee.pythonanywhere.com/docs'
response = requests.get(doc_url)
if response.status_code == 200:
    print("Found documentation at:", doc_url)
else:
    print("Documentation not found at:", doc_url)


Documentation not found at: https://heonlee.pythonanywhere.com/docs


In [None]:
# Making a simple GET request
base_url = 'https://heonlee.pythonanywhere.com/'
response = requests.get(base_url)
print("Status Code:", response.status_code)
print("Response Body:", response.text)


Status Code: 405
Response Body: <!doctype html>
<html lang=en>
<title>405 Method Not Allowed</title>
<h1>Method Not Allowed</h1>
<p>The method is not allowed for the requested URL.</p>



In [None]:
# Check if API key is needed by inspecting the headers or response body
response = requests.get(base_url)
if 'api_key' in response.text.lower() or response.status_code == 401:
    print("API key might be required")
else:
    print("API key might not be required, or different authentication method used")


API key might not be required, or different authentication method used


In [None]:
# Inspect response headers
response = requests.get(base_url)
print("Response Headers:", response.headers)


Response Headers: {'Date': 'Thu, 18 Apr 2024 20:15:43 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Content-Length': '153', 'Connection': 'keep-alive', 'Allow': 'OPTIONS, POST', 'Access-Control-Allow-Origin': '*', 'Server': 'PythonAnywhere'}
