In [2]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

def extract_job_details(job_description):
    """Parse structured hiring requirements out of a free-text job description.

    Matching is case-insensitive. Each requirement is looked up with one regular
    expression; a requirement that is not found falls back to a neutral default.

    Parameters
    ----------
    job_description : str
        The full posting text.

    Returns
    -------
    tuple
        ``(coding_experience, languages, degree, majors, internship_required,
        industry_experience)`` where

        * ``coding_experience`` / ``industry_experience`` are ints (0 if absent),
        * ``languages`` / ``majors`` are the captured text with surrounding
          whitespace removed ("" if absent),
        * ``degree`` is one of the four known degree names in canonical
          capitalisation ("" if absent),
        * ``internship_required`` is ``'Yes'`` only when the text says the
          internship experience is "required", otherwise ``'No'``.
    """
    known_degrees = (
        "High School Diploma",
        "Associate's Degree",
        "Bachelor's Degree",
        "Master's Degree",
    )

    def _capture(pattern, group=1):
        # Search once and return the requested group, or None when nothing matches.
        found = re.search(pattern, job_description, re.IGNORECASE)
        return found.group(group) if found else None

    years_text = _capture(r"(\d+)\s+years?\s+of\s+coding\s+experience")
    languages_text = _capture(r"\b(?:with|using|in)\s+([\w\s,]+)\s+programming\s+language")
    degree_text = _capture(
        r"\b(High School Diploma|Associate's Degree|Bachelor's Degree|Master's Degree)\b"
    )
    majors_text = _capture(r"\b(?:major in|degree in)\s+([\w\s,]+)")
    internship_text = _capture(r"\b(internship experience is)\s+(required|not required)", group=2)
    industry_text = _capture(r"(\d+)\s+years?\s+of\s+desired\s+industry\s+experience")

    coding_experience = int(years_text) if years_text is not None else 0
    industry_experience = int(industry_text) if industry_text is not None else 0

    # The greedy character classes can pick up padding around the list; strip it
    # so that exact-match comparisons against the individual items work.
    languages = languages_text.strip() if languages_text is not None else ""
    majors = majors_text.strip() if majors_text is not None else ""

    # The match is case-insensitive, so map whatever casing the posting used back
    # to the canonical spelling that exact-case lookups expect.
    if degree_text is None:
        degree = ""
    else:
        canonical = {name.lower(): name for name in known_degrees}
        degree = canonical.get(degree_text.lower(), degree_text)

    internship_required = (
        'Yes' if internship_text is not None and internship_text.lower() == 'required' else 'No'
    )

    return coding_experience, languages, degree, majors, internship_required, industry_experience

def get_job_description():
    """Prompt for a multi-line job description and return it as one string.

    Lines are read from standard input until the first empty line; the lines
    entered before it are joined with single spaces.
    """
    print("Enter the job description: ")
    # Two-argument iter() keeps calling input() until it returns the sentinel "".
    entered_lines = iter(input, "")
    return " ".join(entered_lines)

# Get the job description from the user (read interactively, ends at the first blank line)
job_description = get_job_description()

# Extract details from the job description.
# Unpacked in the order extract_job_details returns them: coding years, languages text,
# degree, majors text, internship flag ('Yes'/'No'), industry years.
ce_y, ce_l, ed, m, i, ie = extract_job_details(job_description)

# Load the dataset of candidates; qualification_rating iterates over this frame
data = pd.read_csv("../Data/INFO 498 I Mock Data Prep - OpenAI ChatGPT.csv")
# Alternative dataset: uncomment the next line (and comment the one above) to use it instead
#data = pd.read_csv("../Data/INFO 498 I Mock Data Prep - Meta Llama 2.csv")

# Function to calculate qualification rating and likelihood
def qualification_rating(cy, cl, ed, m, i, ie, candidates=None):
    """Score every candidate row against the job requirements.

    Parameters
    ----------
    cy : int
        Required years of coding experience.
    cl : str
        Required languages as a ``", "``-separated string.
    ed : str
        Required degree name; a value not present in the education table
        (for example ``""`` when no degree was detected) is treated as level -1.
    m : str
        Accepted majors as a ``", "``-separated string.
    i : str
        ``"Yes"`` / ``"No"``: whether internship experience is required.
    ie : int
        Required years of industry experience.
    candidates : pandas.DataFrame, optional
        Frame to score. Defaults to the notebook-level ``data`` frame.

    Returns
    -------
    tuple[list, list]
        ``(ratings, likelihood)``, one entry per row in iteration order.
        ``likelihood`` is 1 when the rating is at least 4, otherwise 0.
    """
    if candidates is None:
        candidates = data

    # Requirement-side values do not change per row, so compute them once.
    wanted_languages = cl.split(", ")
    accepted_majors = m.split(", ")
    ed_vals = {"High School Diploma": 0, "Associate's Degree": 1, "Bachelor's Degree": 2, "Master's Degree": 3}
    required_level = ed_vals.get(ed, -1)

    ratings = []
    likelihood = []

    for _, row in candidates.iterrows():
        curr_rating = 0

        # Coding Experience (Years): surplus years add to the score, a shortfall costs 1
        if row["Coding Experience (Years)"] >= cy:
            curr_rating += row["Coding Experience (Years)"] - cy
        else:
            curr_rating -= 1

        # Coding Experience (Languages): +1 for any overlap, -1 otherwise.
        # A non-string cell (e.g. a missing value) counts as no overlap.
        known_languages = row["Coding Experience (Languages)"]
        if isinstance(known_languages, str) and any(
            language in known_languages.split(", ") for language in wanted_languages
        ):
            curr_rating += 1
        else:
            curr_rating -= 1

        # Education: use the same -1 fallback for the arithmetic as for the
        # comparison, so an unknown degree on either side cannot raise KeyError.
        candidate_level = ed_vals.get(row["Education"], -1)
        if candidate_level >= required_level:
            curr_rating += candidate_level - required_level + 1

        # Major
        if row["Major"] in accepted_majors:
            curr_rating += 1
        else:
            curr_rating -= 1

        # Internship
        # NOTE(review): this also penalises a candidate who HAS an internship when
        # none is required; behaviour kept as-is, confirm whether that is intended.
        if (i == "No" and row["Internship"] == "Yes") or (i == "Yes" and row["Internship"] == "No"):
            curr_rating -= 1

        # Industry Experience: same surplus / shortfall rule as coding years
        if row["Industry Experience"] >= ie:
            curr_rating += row["Industry Experience"] - ie
        else:
            curr_rating -= 1

        ratings.append(curr_rating)
        likelihood.append(1 if curr_rating >= 4 else 0)

    return ratings, likelihood

# Score every candidate and attach the rating and the 0/1 likelihood flag to the frame
data["Qualified"], data["Likelihood"] = qualification_rating(ce_y, ce_l, ed, m, i, ie)

# Keep the 20 rows with the highest qualification rating
top_candidates = data.sort_values(by='Qualified', ascending=False).head(20)

# (printed label, source column) pairs shown under each candidate's headline
report_fields = [
    ("Likelihood", "Likelihood"),
    ("Gender", "Gender"),
    ("Ethnicity", "Ethnicity"),
    ("Disability", "Disabilities"),
    ("Veteran", "Veteran"),
    ("Marital Status", "Marital Status"),
]

# One paragraph per candidate: "First Last: rating", the detail lines, then a blank line
for index, candidate in top_candidates.iterrows():
    print(f"{candidate['First Name']} {candidate['Last Name']}: {candidate['Qualified']}")
    for label, column in report_fields:
        print(f"{label}: {candidate[column]}")
    print()

# Candidate attributes and the derived label (the model below is fit on the split frames)
X = data[['Coding Experience (Years)', 'Coding Experience (Languages)', 'Education', 'Major', 'Internship', 'Industry Experience', 'Gender', 'Ethnicity', 'Disabilities', 'Veteran', 'Marital Status']]
y = data['Likelihood']

# 80% of the rows for training; the remaining 20% is halved into validation and test
train_data, test_and_validation_data = train_test_split(data, test_size=0.2, random_state=3)
validation_data, test_data = train_test_split(test_and_validation_data, test_size=0.5, random_state=3)

# Model inputs, grouped by the preprocessing they receive
numeric_features = ['Coding Experience (Years)', 'Industry Experience']
categorical_features = ['Gender', 'Ethnicity', 'Disabilities', 'Veteran', 'Marital Status', 'Education', 'Major', 'Internship']

# Missing values: column mean for numbers, most frequent value for categories
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Numbers are imputed then standardised; categories are imputed then one-hot encoded
numeric_transformer = Pipeline([
    ('imputer', numeric_imputer),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline([
    ('imputer', categorical_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by logistic regression
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Same column selection for both splits
model_columns = numeric_features + categorical_features
X_train, y_train = train_data[model_columns], train_data['Likelihood']
X_test, y_test = test_data[model_columns], test_data['Likelihood']

# Fit on the training split, then predict the held-out test split
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Share of test rows whose predicted label equals the stored one
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Enter the job description: 


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc

# Predicted probability of the second class in clf.classes_ for each test row
y_prob = clf.predict_proba(X_test)[:, 1]

# Confusion matrix of stored vs. predicted labels on the test split
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

# ROC curve and the area under it
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax.legend(loc="lower right")
plt.show()
