<center><h1>Datapoint Generation for AI Hiring System</center></h1>

In [14]:
import numpy as np
import pandas as pd

## Features for Datapoint Generation

In [15]:
# potential schools for datapoints 
schools = ['Brown University', 'Columbia University', 'SUNY Binghamton University', 'SUNY New Paltz',
           'Providence College','Rhode Island School of Design', 'Bentley University', 'Colgate University']

# potential gpas for datapoints 
gpa = np.arange(1.2,4.1,0.1,dtype=float)

# degree options for datapoints 
degrees = ['Bachelors','Masters','Phd']

# potential locations for datapoints 
locations = ['Providence','Boston','New York City','Los Angeles','Miami','Chicago', 'Detroit', 'Washington D.C.']

# potential genders 
gender = ['M', 'F', 'N/A']

# potential veteran status 
veteran = ['0','1','N/A']

# potential work authorization
work_ath = ['0','1']

# potential value for disability 
disability = ['0','1','N/A']

# potential ethnicities 
ethnicity = ['0','1','2','3','4']

# potential roles 
roles = ['Junior SWE', 'Senior SWE','Data scientist','Lawyer','ML Engineer','N/A', 'Chef','Bus Driver']

# master list for all attributes not include roles 
candidate_atts = [schools, gpa, degrees, locations, gender, veteran, work_ath, disability, ethnicity]

# columns for dataframe 
COLUMNS = ['School Name','GPA','Degree','Location','Gender',
        'Veteran status','Work authorization','Disability','Ethnicity',
        'Role 1', 'Start 1', 'End 1','Role 2','Start 2','End 2','Role 3','Start 3','End 3']


In [28]:
import numpy as np
import pandas as pd

MONTHS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
COLUMNS = ['School Name', 'GPA', 'Degree', 'Location', 'Gender',
           'Veteran status', 'Work authorization', 'Disability', 'Ethnicity',
           'Role 1', 'Start 1', 'End 1', 'Role 2', 'Start 2', 'End 2', 'Role 3', 'Start 3', 'End 3']

def generate_data(n_samples: int, candidate_atts: list, roles: list, random_seed: int) -> pd.DataFrame:
    np.random.seed(random_seed)
    all_candidates = []  # List for all datapoints

    # Generate for each datapoint
    for i in range(n_samples):
        candidate = [i + 1]  # New candidate starting with Applicant ID

        # Loop through each attribute type for given candidate
        for attribute in candidate_atts:
            # Pick random attribute from list
            candidate.append(attribute[np.random.randint(0, len(attribute))])

        # Generate role types
        cand_roles = [roles[np.random.randint(0, len(roles))] for _ in range(3)]
        role_history = []  # Candidates job history

        # Loop through candidate roles to populate dates
        for index, role in enumerate(cand_roles):
            if role == 'N/A':
                while len(role_history) != 9:
                    role_history.append('N/A')
                break
            else:
                start_month = MONTHS[np.random.randint(0, len(MONTHS))]
                end_month = start_month  # Initialize end_month same as start_month

                start_year = np.random.randint(10, 24)
                range_high = 23 - start_year
                if range_high <= 0:  # Safeguard against low >= high
                    end_year = 23  # Set to max year if no range is available
                else:
                    end_year = start_year + np.random.randint(0, range_high)

                # If the start year equals the end year, adjust the end month to be after the start month
                if start_year == end_year:
                    if start_month == 12:  # Special case where start month is December
                        end_month = 1
                        end_year += 1  # Increment the year if end month cannot be later
                    else:
                        end_month = np.random.randint(start_month + 1, 13)

                start = f"{start_month}/{start_year}"
                if index == 0 and np.random.rand() < 0.1:  # 10% chance to end as 'N/A'
                    end = 'N/A'
                else:
                    end = f"{end_month}/{end_year}"

                role_history.append(role)
                role_history.append(start)
                role_history.append(end)

        candidate.extend(role_history)
        all_candidates.append(candidate)

    # Insert columns including Applicant ID at the start
    columns_with_id = ['Applicant ID'] + COLUMNS
    df = pd.DataFrame(all_candidates, columns=columns_with_id)

    return df

# Generate the DataFrame
df = generate_data(4000, candidate_atts, roles, 1951)

In [29]:
df

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3
0,1,Rhode Island School of Design,3.1,Bachelors,Miami,M,1,0,0,2,ML Engineer,7/16,7/22,Junior SWE,11/19,11/22,Senior SWE,12/21,12/22
1,2,Bentley University,1.9,Phd,New York City,,0,0,1,0,Bus Driver,11/13,11/18,ML Engineer,11/14,12/14,,,
2,3,SUNY New Paltz,1.7,Masters,Washington D.C.,M,1,1,1,1,Data scientist,4/22,6/22,,,,,,
3,4,Brown University,2.8,Bachelors,Los Angeles,F,1,0,1,2,Senior SWE,6/22,11/22,Lawyer,1/12,4/12,Lawyer,8/13,8/14
4,5,SUNY Binghamton University,2.8,Masters,Los Angeles,,1,1,,4,Lawyer,6/11,,Junior SWE,6/10,7/10,Junior SWE,7/23,10/23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3996,Rhode Island School of Design,2.3,Masters,Detroit,F,1,0,,0,ML Engineer,3/15,3/18,Lawyer,5/21,5/22,,,
3996,3997,SUNY Binghamton University,1.6,Phd,Providence,,0,0,,0,Lawyer,10/17,10/18,Data scientist,5/21,5/22,Junior SWE,5/16,5/22
3997,3998,SUNY New Paltz,1.2,Bachelors,New York City,F,0,1,,4,Chef,8/10,8/17,,,,,,
3998,3999,Brown University,1.8,Bachelors,Washington D.C.,,0,1,1,4,Bus Driver,1/16,1/20,Chef,9/11,9/13,Senior SWE,9/14,9/22


In [30]:
# save data to local drive
csv_file_path = '../data/candidate_data_1.csv'  
df.to_csv(csv_file_path, index=False)