In [9]:
import numpy as np
import pandas as pd
import requests


In [3]:
'''
Lookup lists used to generate synthetic applicants: major city names, ZIP codes, job titles, and colleges.
'''

# City names an applicant's "Location" is sampled from.
cities = [
    "New York", "Los Angeles", "Chicago", "Houston",
    "Phoenix", "Philadelphia", "San Antonio", "San Diego",
    "Dallas", "San Jose", "Austin", "Jacksonville",
    "Fort Worth", "Columbus", "Charlotte", "San Francisco",
    "Providence", "Boston", "Miami",
]

# ZIP codes are kept as strings so the leading zero is preserved.
zip_codes = """
    02802 02804 02806 02807 02808 02809 02812 02813 02814 02815
    02816 02817 02818 02822 02823 02824 02825 02826 02827 02828
    02829 02830 02831 02832 02833 02835 02836 02837 02838 02839
    02840 02841 02842 02852 02857 02858 02859 02860 02861 02863
    02864 02865 02871 02872 02873 02874 02875 02876 02878 02879
    02881 02882 02885 02886 02888 02889 02891 02892 02893 02894
    02895 02896 02898 02903 02904 02905 02906 02907 02908 02909
    02910 02911 02912 02914 02915 02916 02917 02918 02919 02920
    02921
""".split()

# Job titles; the trailing "N/A" entry stands for "no role".
job_titles = [
    "Software Engineer",
    "Data Scientist",
    "Lawyer",
    "Machine Learning Engineer",
    "Research Assistant",
    "Product Manager",
    "Marketing Coordinator",
    "Financial Analyst",
    "Human Resources Manager",
    "Sales Representative",
    "Graphic Designer",
    "Web Developer",
    "N/A",
]

# Schools an applicant's "School Name" is sampled from.
rhode_island_colleges = [
    "Brown University", "University of Rhode Island",
    "Rhode Island School of Design", "Providence College",
    "Bryant University", "Roger Williams University",
    "Johnson & Wales University", "Rhode Island College",
    "Salve Regina University", "New England Institute of Technology",
]

# Maps each categorical column name to the list of values it may take.
dictionary = {
    "School Name": rhode_island_colleges,
    "Degree": ["Bachelor's", "Master's", "PhD"],
    "Location": cities,
    "Gender": ["F", "M", "N/A"],
    # 1 for Yes, 0 for No, N/A for not provided
    "Veteran status": [1, 0, "N/A"],
    "Work authorization": [1, 0],
    "Disability": [1, 0, "N/A"],
    "Ethnicity": [
        'White',
        'Black',
        'Native American',
        'Asian American & Pacific Islander',
        'Other',
    ],
    "Role": job_titles,
}

In [6]:
def parse_date(date_str):
    '''
    Convert an "mm/yy" string into a numpy datetime64 set to the first day of that month.

    The two-digit year is interpreted as 20yy. The sentinel "N/A" yields None.
    '''
    if date_str != "N/A":
        month_text, year_text = date_str.split('/')
        mm, yy = int(month_text), int(year_text)
        return np.datetime64(f"20{yy:02d}-{mm:02d}-01")
    return None

def generate_period(start_year, end_year, prev_end_date=None):
    '''
    Produce a (start, end) pair of "m/yy" strings for one employment period.

    With a usable prev_end_date (neither empty nor "N/A"), the period starts in the
    month right after it. Otherwise the start year is drawn from
    range(start_year, end_year) and the start month from 1-12.
    The end month is drawn from start month through December, and the end year
    always equals the start year.
    '''
    if not prev_end_date or prev_end_date == "N/A":
        # No earlier period to follow: draw the year first, then the month
        start_year = np.random.choice(range(start_year, end_year))
        start_month = np.random.choice(range(1, 13))
    else:
        previous = parse_date(prev_end_date)
        # datetime64[Y] counts years since 1970; datetime64[M] counts months since 1970-01
        prev_year = previous.astype('datetime64[Y]').astype(int) + 1970
        prev_month = previous.astype('datetime64[M]').astype(int) % 12 + 1

        # Advance one month; December rolls over to January of the following year
        start_year = prev_year + prev_month // 12
        start_month = prev_month % 12 + 1

    end_month = np.random.choice(range(start_month, 13))
    end_year = start_year

    period_start = f"{start_month}/{start_year % 100:02d}"
    period_end = f"{end_month}/{end_year % 100:02d}"
    return period_start, period_end


In [7]:
# Set the number of candidates you wish to generate
num_entries = 500

# Generation settings
RANDOM_SEED = 42         # fixed seed so re-running this cell reproduces the same applicants
MAX_ROLES = 3            # number of Role/Start/End column triples per applicant
FIRST_START_YEAR = 2015  # lower bound handed to generate_period
LAST_START_YEAR = 2023   # upper bound handed to generate_period

np.random.seed(RANDOM_SEED)

# Generate synthetic data
new_data = pd.DataFrame({
    'Applicant ID': range(1, 1 + num_entries),
    'School Name': np.random.choice(dictionary["School Name"], num_entries),
    # GPA ~ Normal(3.1, 0.4), rounded to 2 decimals and clipped to [0.0, 4.0]
    'GPA': np.clip(np.round(np.random.normal(3.1, 0.4, num_entries), 2), 0.0, 4.0),
    'Degree': np.random.choice(dictionary["Degree"], num_entries),
    'Location': np.random.choice(dictionary["Location"], num_entries),
    'Gender': np.random.choice(dictionary["Gender"], num_entries),
    'Veteran status': np.random.choice(dictionary["Veteran status"], num_entries),
    'Work authorization': np.random.choice(dictionary["Work authorization"], num_entries),
    'Disability': np.random.choice(dictionary["Disability"], num_entries),
    'Ethnicity': np.random.choice(dictionary["Ethnicity"], num_entries),
})

# Assign roles and dates ensuring dependencies:
# a role slot is only filled when the previous slot holds a real role, and each
# period starts after the previous one ended. Records are collected in a list and
# turned into a frame once, instead of enlarging the frame cell by cell.
work_history = []
for _ in range(num_entries):
    applicant_roles = {}
    prev_end_date = None
    has_more_roles = True
    for i in range(1, MAX_ROLES + 1):
        role, start_date, end_date = "N/A", "N/A", "N/A"
        if has_more_roles:
            # i extra "N/A" entries are appended, so later slots are more likely to be empty
            role = str(np.random.choice(job_titles + ["N/A"] * i))
            if role == "N/A":
                # Once a slot is empty, every later slot stays empty as well
                has_more_roles = False
            else:
                start_date, end_date = generate_period(
                    FIRST_START_YEAR, LAST_START_YEAR, prev_end_date
                )
                prev_end_date = end_date  # the next period must start after this one
        applicant_roles[f'Role {i}'] = role
        applicant_roles[f'Start {i}'] = start_date
        applicant_roles[f'End {i}'] = end_date
    work_history.append(applicant_roles)

new_data = pd.concat(
    [new_data, pd.DataFrame(work_history, index=new_data.index)],
    axis=1,
)

# Combine and save data
# Copy so that later edits to generated_data do not silently change new_data.
generated_data = new_data.copy()
generated_data.to_csv('generated_data.csv', index=False)


In [8]:
# Preview the generated applicants; the rendered output below is truncated to the first and last rows.
generated_data

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3
0,1,Providence College,2.70,Bachelor's,Dallas,,0,1,1,Native American,Financial Analyst,8/22,11/22,Graphic Designer,12/22,12/22,Sales Representative,1/23,12/23
1,2,Rhode Island School of Design,2.97,Bachelor's,Dallas,,1,0,1,Native American,Data Scientist,7/15,9/15,Graphic Designer,10/15,11/15,Software Engineer,12/15,12/15
2,3,Salve Regina University,3.68,Master's,San Jose,,0,1,1,Black,Research Assistant,9/18,12/18,Financial Analyst,1/19,4/19,Machine Learning Engineer,5/19,9/19
3,4,Providence College,2.45,Master's,San Jose,F,1,0,0,Native American,Marketing Coordinator,1/17,11/17,Financial Analyst,12/17,12/17,,,
4,5,University of Rhode Island,2.85,Master's,Phoenix,,,0,1,White,Machine Learning Engineer,8/20,8/20,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,Bryant University,3.58,Master's,Los Angeles,,1,1,0,Black,Software Engineer,6/22,7/22,Web Developer,8/22,10/22,Data Scientist,11/22,12/22
496,497,Providence College,4.00,Bachelor's,Miami,,0,0,,Asian American & Pacific Islander,Product Manager,5/20,8/20,Web Developer,9/20,11/20,Product Manager,12/20,12/20
497,498,University of Rhode Island,3.26,Bachelor's,Fort Worth,M,,1,0,Asian American & Pacific Islander,Web Developer,11/21,12/21,Sales Representative,1/22,7/22,Human Resources Manager,8/22,11/22
498,499,Rhode Island School of Design,2.75,PhD,Los Angeles,M,0,1,,Other,Lawyer,12/19,12/19,Machine Learning Engineer,1/20,3/20,Marketing Coordinator,4/20,6/20


Append Resume score to dataset

In [10]:
# Use the Candidate scorer API

import requests
import pandas as pd

def upload_csv_to_api(file_path, api_url, timeout=30):
    '''
    Read a CSV file, POST its rows as a JSON array of records, and return the decoded reply.

    Parameters:
        file_path: path of the CSV file, loaded with pandas.
        api_url:   URL that receives the POST request.
        timeout:   seconds to wait for the server before giving up; without a timeout
                   an unresponsive server would block this call indefinitely.

    Returns:
        The parsed JSON body when the server answers with status 200, otherwise None.
        Failures (unreadable file, network error, timeout, non-JSON body, non-200
        status) are printed rather than raised.
    '''
    try:
        # Load CSV file
        data = pd.read_csv(file_path)
        # Convert DataFrame to JSON
        json_data = data.to_json(orient='records')

        # Send POST request to the API
        headers = {'Content-Type': 'application/json'}
        response = requests.post(api_url, data=json_data, headers=headers, timeout=timeout)
        print('response', response)

        # Check if the request was successful
        if response.status_code == 200:
            print("File uploaded successfully and processed.")
            return response.json()  # Assuming the API returns JSON with error details or results

        print("Failed to process file. Status code:", response.status_code)
        print("Response:", response.text)
    except Exception as e:
        # Best-effort helper: report the problem and let the caller handle None
        print("An error occurred:", e)
    return None

def analyze_errors(api_response):
    '''
    Print every error entry found under the 'errors' key of the API response.

    Each entry is expected to provide 'row' and 'message'. This needs to be
    customized to match the actual shape of the API response.
    '''
    print("Analyzing errors...")
    if 'errors' not in api_response:
        print("No errors found, or API response format is unexpected.")
        return
    for entry in api_response['errors']:
        print(f"Error in row {entry['row']}: {entry['message']}")


In [11]:
import json

api_url = 'https://jennjwang.pythonanywhere.com'
file_path = 'generated_data.csv'

# upload_csv_to_api returns None when the upload or the request fails
response = upload_csv_to_api(file_path, api_url)
if response:
    analyze_errors(response)

# Only attach scores when the API actually returned predictions; otherwise
# indexing the response would raise right after the failure was already reported.
if response and 'prediction' in response:
    # 'prediction' holds a JSON-encoded string that decodes into records with a 'score' field
    predict = json.loads(response['prediction'])
    # Assignment aligns on the index of the decoded frame
    generated_data['score'] = pd.DataFrame(predict)['score']
else:
    print("No predictions returned; the 'score' column was not added.")

generated_data.head()

response <Response [200]>
File uploaded successfully and processed.
Analyzing errors...
No errors found, or API response format is unexpected.


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3,score
0,1,Providence College,2.7,Bachelor's,Dallas,,0.0,1,1,Native American,Financial Analyst,8/22,11/22,Graphic Designer,12/22,12/22,Sales Representative,1/23,12/23,6.22
1,2,Rhode Island School of Design,2.97,Bachelor's,Dallas,,1.0,0,1,Native American,Data Scientist,7/15,9/15,Graphic Designer,10/15,11/15,Software Engineer,12/15,12/15,7.06
2,3,Salve Regina University,3.68,Master's,San Jose,,0.0,1,1,Black,Research Assistant,9/18,12/18,Financial Analyst,1/19,4/19,Machine Learning Engineer,5/19,9/19,4.02
3,4,Providence College,2.45,Master's,San Jose,F,1.0,0,0,Native American,Marketing Coordinator,1/17,11/17,Financial Analyst,12/17,12/17,,,,1.27
4,5,University of Rhode Island,2.85,Master's,Phoenix,,,0,1,White,Machine Learning Engineer,8/20,8/20,,,,,,,8.59
