# Determining FLSA Status with NLP

Jackson Aquino

In [1]:
import pandas

In [92]:
jds = pandas.read_csv('All JDs.csv')

In [94]:
exemption = pandas.read_csv('US_ExemptionStatus.csv')

Need to create a key to merge the two dataframes

In [96]:
jds['key'] = jds['Job Family'] + jds['Career Level']

In [97]:
exemption['key'] = exemption['Job Family'] + exemption['Management Level']

Now merging the two dataframes:

In [98]:
dfexemptionMerge = exemption.drop(columns=['Management Level','Job Family','Job Category','Sales Indicator'])

In [99]:
df = jds.merge(dfexemptionMerge, on='key',how = 'left')

Dropping duplicates

In [101]:
df = df.drop_duplicates(subset=['key'])

Dealing with NANs

In [102]:
df = df.dropna(subset=['Job Profile Summary'])


In [103]:
df = df.fillna('')

Creating a single JD column that will include the whole job description instead of having different fields

In [104]:
df.columns

Index(['Job Family', 'Job Code', 'Career Level', 'Job Title', 'Job Profile',
       'Job Profile Summary', 'Job Specific Skills and Certifications',
       'Responsibilities', 'Education and Experience',
       'Additional Education and Experience', 'key', 'Job Family Group',
       'Exempt Status'],
      dtype='object')

In [105]:
df['JD'] = df['Job Profile Summary'] + '\n' + df['Responsibilities'] + '\n' + df['Education and Experience'] + '\n' + df['Additional Education and Experience'] + '\n' + df['Job Specific Skills and Certifications']

In [106]:
df = df.drop(columns=['Job Profile Summary','Responsibilities','Education and Experience','Additional Education and Experience','Job Specific Skills and Certifications'])

In [107]:
df.columns

Index(['Job Family', 'Job Code', 'Career Level', 'Job Title', 'Job Profile',
       'key', 'Job Family Group', 'Exempt Status', 'JD'],
      dtype='object')

In [108]:
jd_df = df[['Job Family','Career Level','JD','Exempt Status']]

Checking the length of the data frame

In [110]:
len(jd_df)

2025

Checking for NANs

In [111]:
any(jd_df['Exempt Status'].isna())

False

Checking for blanks

In [112]:
jd_df['Exempt Status'].value_counts()

Exempt Status
Exempt        1723
Non-Exempt     246
                56
Name: count, dtype: int64

Dropping blanks

In [113]:
jd_df = jd_df[jd_df['Exempt Status']!='']

In [114]:
len(jd_df)

1969

In [115]:
jd_df['Exempt Status'].value_counts()

Exempt Status
Exempt        1723
Non-Exempt     246
Name: count, dtype: int64

Now the job descriptions are ready to be analyzed

In [116]:
jd_df['Exempt Status'].values[0]

'Non-Exempt'

In [117]:
jd_df.columns

Index(['Job Family', 'Career Level', 'JD', 'Exempt Status'], dtype='object')

In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Feature Extraction
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

X = tfidf_vectorizer.fit_transform(jd_df['JD'])
y = jd_df['Exempt Status']

# Model Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Prediction ( test for new job descriptions)
new_jd_text = "Your new job description text here..."
new_jd_vectorized = tfidf_vectorizer.transform([new_jd_text])
predicted_status = model.predict(new_jd_vectorized)
print("Predicted Exempt Status:", predicted_status)

Accuracy: 0.9314720812182741
Predicted Exempt Status: ['Exempt']


In [120]:
jd_df['Exempt Status'].values[0]

'Non-Exempt'

In [121]:
from sklearn.tree import DecisionTreeClassifier

model_reg = DecisionTreeClassifier()
model_reg.fit(X_train, y_train)

# Model Evaluation
y_pred_reg = model_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_reg)
print("Accuracy:", accuracy)

# Prediction (for new job descriptions)
new_jd_text = jd_df['JD'].values[0]
new_jd_vectorized = tfidf_vectorizer.transform([new_jd_text])
predicted_status_reg = model_reg.predict(new_jd_vectorized)
print("Predicted Exempt Status:", predicted_status_reg)


Accuracy: 0.9289340101522843
Predicted Exempt Status: ['Non-Exempt']


In [122]:
# Prediction (for new job descriptions)
new_jd_text = jd_df['JD'].values[0]
new_jd_vectorized = tfidf_vectorizer.transform([new_jd_text])
predicted_status = model.predict(new_jd_vectorized)
print("Predicted Exempt Status:", predicted_status)

Predicted Exempt Status: ['Exempt']


Getting models from previous project

In [123]:
jobs = jd_df[['Job Family','Career Level','Exempt Status']]

In [125]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Define a dictionary to store label encoders and mappings
label_encoders = {}
label_encoder_mappings = {}

# Encode non-numerical variables and store label encoders and mappings
for column in jobs.columns:
    if jobs[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        jobs[column] = label_encoders[column].fit_transform(jobs[column])

        # Storing the mapping between original categories and encoded values
        label_encoder_mappings[column] = dict(zip(label_encoders[column].classes_, label_encoders[column].transform(label_encoders[column].classes_)))
        print(dict(zip(label_encoders[column].classes_, label_encoders[column].transform(label_encoders[column].classes_))))
# Define features (X) and target (y)
X = jobs.drop('Exempt Status', axis=1)
y = jobs['Exempt Status']

# Split dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the decision tree model
model_1 = DecisionTreeClassifier(random_state=42)
model_1.fit(X_train, y_train)

# Predict Exempt Status on the test set
y_pred_1 = model_1.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred_1)
print("Accuracy of the decision tree model:", accuracy)

# Reverse the encoding using the label encoders
reverse_label_encoders = {col: {label: encoded for encoded, label in mapping.items()} for col, mapping in label_encoder_mappings.items()}


Accuracy of the decision tree model: 0.9441624365482234


In [126]:
# Mapping dictionary
label_map = {0: "Exempt", 1: "Non-Exempt"}

# Replace values in the array with labels
y_pred_1_list = [label_map[value] for value in y_pred_1]

In [129]:
#Now training a logistic regression
from sklearn.linear_model import LogisticRegression

# Train the logistic regression model
logistic_regression_model = LogisticRegression(random_state=42)
logistic_regression_model.fit(X_train, y_train)

# Predict Exempt Status on the test set using logistic regression
y_pred_logistic_regression = logistic_regression_model.predict(X_test)

# Calculate the accuracy of the logistic regression model
accuracy_logistic_regression = accuracy_score(y_test, y_pred_logistic_regression)
print("Accuracy of the logistic regression model:", accuracy_logistic_regression)

Accuracy of the logistic regression model: 0.883248730964467


In [130]:
# Replace values in the array with labels
y_pred_2_list = [label_map[value] for value in y_pred_logistic_regression]

In [134]:
# Replace values in the array with labels
y_test_list = [label_map[value] for value in y_test]

In [135]:
outputdf = pandas.DataFrame([y_pred,y_pred_reg])

In [137]:
outputdf = outputdf.transpose()

In [139]:
outputdf.columns = ['nlp dt', 'nlp reg']

In [140]:
outputdf['dt'] = y_pred_1_list

In [141]:
outputdf['reg'] = y_pred_2_list

In [142]:
outputdf['exempt status'] = y_test_list

In [144]:
outputdf.head()

Unnamed: 0,nlp dt,nlp reg,dt,reg,exempt status
0,Exempt,Exempt,Exempt,Exempt,Exempt
1,Exempt,Exempt,Exempt,Exempt,Exempt
2,Exempt,Exempt,Exempt,Exempt,Exempt
3,Exempt,Exempt,Exempt,Exempt,Exempt
4,Exempt,Exempt,Exempt,Exempt,Exempt


In [145]:
outputdf.to_csv('output.csv',index=False)

In [44]:
outputdf[outputdf['y_pred']!=outputdf['y_pred']]

Unnamed: 0,y_pred,y_pred_reg,y_test
0,Exempt,Exempt,Exempt
1,Exempt,Exempt,Exempt
2,Exempt,Exempt,Exempt
3,Exempt,Exempt,Exempt
4,Exempt,Exempt,Exempt
...,...,...,...
389,Non-Exempt,Non-Exempt,Non-Exempt
390,Exempt,Exempt,Exempt
391,Exempt,Exempt,Exempt
392,Exempt,Exempt,Exempt
