# Data Loading and Cleaning

In [32]:
import pandas as pd
import code_util.data_cleaning as dc
import importlib

from nltk.stem import WordNetLemmatizer

# File paths
raw_data_path = './data/job_descriptions_20k.csv'
preprocessed_data_path = raw_data_path.replace('.csv', '_preprocessed.csv')

lemmatizer = WordNetLemmatizer()

# Define desired database parameters. Set to -1 to load all data.
database_size = -1

# Optionally force data to be regenerated
force_data_regeneration = False

# Fixed seed for the optional subsample so that "Restart & Run All" selects
# the same rows on every run.
sample_seed = 42

# Try the cached preprocessed file first. `df` stays None whenever the cache
# cannot be used, which triggers regeneration from the raw file below.
df = None
if force_data_regeneration:
    print('Forcing data regeneration.')
else:
    try:
        cached_df = pd.read_csv(preprocessed_data_path)
    except FileNotFoundError:
        # No cache has been written yet.
        cached_df = None
    except ValueError as read_error:
        # pandas read failures such as EmptyDataError / ParserError are
        # ValueError subclasses. Keep the best-effort fallback to regeneration,
        # but report the reason instead of swallowing it silently.
        print(f'Preprocessed file could not be read ({read_error}). Reloading data.')
        cached_df = None

    if cached_df is not None:
        if database_size != -1 and len(cached_df) < database_size:
            # The cache holds fewer rows than requested, so rebuild it.
            print('Preprocessed file is not the expected size. Reloading data.')
        else:
            df = cached_df
            print('Preprocessed file found and loaded.')

if df is None:
    # Load the raw job-description dataset
    df = pd.read_csv(raw_data_path)

    # Clean the text data
    print('Cleaning text data...')
    for source_column in ('Job Description', 'Responsibilities'):
        df[f'Preprocessed {source_column}'] = df[source_column].apply(
            lambda text: dc.preprocess_document(text, lemmatizer))

    # Save the full preprocessed data (deduplicated) so later runs can reuse it
    df = df.drop_duplicates()
    df.to_csv(preprocessed_data_path, index=False)
    print('File preprocessing completed and saved.')

# Apply the requested size limit on both the cached and the regenerated path.
# The file on disk always keeps every row; only the in-memory frame is sampled.
if database_size != -1 and len(df) > database_size:
    df = df.sample(n=database_size, random_state=sample_seed)

Preprocessed file found and loaded.


In [2]:
import code_util.text_embedding as te
importlib.reload(dc)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models import KeyedVectors

# Define feature columns
text_features = ['Preprocessed Job Description', 'Preprocessed Responsibilities']
categorical_features = ['Qualifications', 'Job Title']
numerical_features = ['Company Size']

# Define target columns
target_columns = ['skills']

# Remove all unused columns. The explicit .copy() makes `df` an independent
# frame, so later column assignments on it do not raise SettingWithCopyWarning.
df = df[text_features + categorical_features + numerical_features + target_columns].copy()

# NOTE(review): every transformer below is fitted on the full dataset, i.e.
# before the train/test split performed later in the notebook. Confirm this
# is acceptable for the evaluation being reported.

# Preprocess the categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
categorical_transformer.fit(df[categorical_features])

# Preprocess the numerical data. Fit on `numerical_features` (not a hard-coded
# column name) so the scaler stays in sync with the feature list that is passed
# to the dataset helpers. `fit` returns the scaler itself, so
# `numerical_transformer` is the same object as `scaler`.
scaler = StandardScaler()
numerical_transformer = scaler.fit(df[numerical_features])

# Prepare the text vectorizers
word_vectors = KeyedVectors.load_word2vec_format('./models/GoogleNews-vectors-negative300.bin', binary=True)
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=5,
                                   max_features=5000,
                                   ngram_range=(1, 3))
# NOTE(review): the vocabulary is learned from the job description column only,
# although `text_features` also lists the responsibilities column - confirm
# this matches what dc.prepare_dataset expects.
tfidf_vectorizer.fit(df['Preprocessed Job Description'])

In [3]:
from sklearn.model_selection import train_test_split
importlib.reload(te)
importlib.reload(dc)

# Prepare the two feature matrices
word2Vec_X = te.embed_w2v_dataframe(df, word_vectors, categorical_transformer, scaler, categorical_features, numerical_features)
tfidf_X = dc.prepare_dataset(df, tfidf_vectorizer, categorical_transformer, scaler, text_features, categorical_features, numerical_features)

# Itemize the skills in the skills column. The result is kept in its own
# variable instead of overwriting df['skills'], so re-running this cell does
# not feed already-preprocessed values back into dc.preprocess_skills.
skill_lists = df['skills'].apply(dc.preprocess_skills)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(skill_lists)

# Split the dataset into training and testing sets (80% train, 20% test).
# All three arrays go through a single call so the Word2Vec features, TF-IDF
# features and labels are guaranteed to share the same row partition, rather
# than relying on two separate calls with matching seeds.
(X_w2v_train, X_w2v_test,
 X_tfidf_train, X_tfidf_test,
 y_train, y_test) = train_test_split(word2Vec_X, tfidf_X, y, test_size=0.2, random_state=42)

## Model Definition & Tuning

In [4]:
import code_util.model_util as mu
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

# Grid-search space for the Word2Vec-feature model. The `estimator__` prefix
# routes each setting to the forest wrapped by OneVsRestClassifier.
w2v_param_grid = dict(
    estimator__n_estimators=[50, 100, 200],
    estimator__max_depth=[None, 5, 10, 15],
)

# Grid-search space for the TF-IDF-feature model (currently the same values).
tfidf_param_grid = dict(
    estimator__n_estimators=[50, 100, 200],
    estimator__max_depth=[None, 5, 10, 15],
)

# Settings handed to mu.load_model for both models
scoring_metric = 'f1_micro'
folds = 3
force_reload = False


def _new_one_vs_rest_forest():
    """Build an unfitted one-vs-rest random forest with the notebook defaults."""
    base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
    return OneVsRestClassifier(base_forest, verbose=-1)


w2v_model = _new_one_vs_rest_forest()
tfidf_model = _new_one_vs_rest_forest()

# Load the models from disk if they exist, otherwise train new models
w2v_model = mu.load_model(
    w2v_model, './models/w2v_model.pkl', w2v_param_grid,
    X_w2v_train, y_train, scoring_metric, folds, force_reload,
)
tfidf_model = mu.load_model(
    tfidf_model, './models/tfidf_model.pkl', tfidf_param_grid,
    X_tfidf_train, y_train, scoring_metric, folds, force_reload,
)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END estimator__max_depth=None, estimator__n_estimators=50; total time=64.7min
[CV] END estimator__max_depth=None, estimator__n_estimators=50; total time=64.8min
[CV] END estimator__max_depth=None, estimator__n_estimators=50; total time=64.8min
[CV] END estimator__max_depth=None, estimator__n_estimators=100; total time=162.2min
[CV] END estimator__max_depth=None, estimator__n_estimators=100; total time=162.2min
[CV] END estimator__max_depth=None, estimator__n_estimators=100; total time=162.3min
[CV] END .estimator__max_depth=5, estimator__n_estimators=50; total time=98.9min
[CV] END .estimator__max_depth=5, estimator__n_estimators=50; total time=99.0min
[CV] END .estimator__max_depth=5, estimator__n_estimators=50; total time=26.3min
[CV] END estimator__max_depth=None, estimator__n_estimators=200; total time=219.6min
[CV] END estimator__max_depth=None, estimator__n_estimators=200; total time=220.0min
[CV] END estimator__ma

## Model Evaluation

In [5]:
from sklearn.metrics import classification_report

# Run both fitted models on their held-out test features
y_w2v_pred = w2v_model.predict(X_w2v_test)
y_tfidf_pred = tfidf_model.predict(X_tfidf_test)

# Print one per-label classification report per model, Word2Vec first
reports = (
    ("Word2Vec Classification Report:", y_w2v_pred),
    ("\nTF-IDF Classification Report:", y_tfidf_pred),
)
for heading, predictions in reports:
    print(heading)
    print(classification_report(y_test, predictions, target_names=mlb.classes_))

Word2Vec Classification Report:
                                                             precision    recall  f1-score   support

                                                A/B testing       1.00      1.00      1.00        29
                               A/B testing and optimization       1.00      1.00      1.00         6
                                                       AJAX       1.00      1.00      1.00         7
                                 API design and development       1.00      1.00      1.00         9
                                            API development       1.00      1.00      1.00        35
                                            API integration       1.00      1.00      1.00        12
                                              API knowledge       1.00      1.00      1.00        16
                                                       APIs       1.00      1.00      1.00         8
                          APIs and web services integratio

## Custom Examples

In [39]:
importlib.reload(dc)
importlib.reload(te)
import pprint

def eval_sample_data(data):
    """Predict skills for one job posting with both models and print the results.

    The posting is turned into a one-row DataFrame, its 'Job Description' and
    'Responsibilities' texts are preprocessed with dc.preprocess_document, and
    the row is converted to TF-IDF and Word2Vec features using the notebook-level
    transformers (tfidf_vectorizer, word_vectors, categorical_transformer,
    scaler). The input dict and both models' decoded predictions are printed;
    nothing is returned.

    Args:
        data (dict): A single posting. Must contain 'Job Description' and
            'Responsibilities'; presumably also the columns named in
            categorical_features and numerical_features, which are forwarded
            to the feature helpers - verify against those helpers.
    """
    posting = pd.DataFrame([data])

    def _clean(text):
        return dc.preprocess_document(text, lemmatizer)

    # Same derived column names as the training frame: 'Preprocessed <column>'
    for raw_column in ('Job Description', 'Responsibilities'):
        posting['Preprocessed ' + raw_column] = posting[raw_column].apply(_clean)

    tfidf_features = dc.prepare_dataset(
        posting, tfidf_vectorizer, categorical_transformer, scaler,
        text_features, categorical_features, numerical_features,
    )
    w2v_features = te.embed_w2v_dataframe(
        posting, word_vectors, categorical_transformer, scaler,
        categorical_features, numerical_features,
    )

    # Predict, then map the binary label rows back to skill names via `mlb`
    w2v_pred_skills = mlb.inverse_transform(w2v_model.predict(w2v_features))
    tfidf_pred_skills = mlb.inverse_transform(tfidf_model.predict(tfidf_features))

    pprint.pprint(data)
    print("Word2Vec Predicted Skills:", w2v_pred_skills)
    print("TF-IDF Predicted Skills:", tfidf_pred_skills)

In [40]:
importlib.reload(dc)
importlib.reload(te)

# Hand-written sample postings used to spot-check both models.
# 'Work Type', 'Preference' and 'Role' are not listed in text_features,
# categorical_features or numerical_features.
data_points = [
    {
        'Job Description': 'Family Law Attorneys deal with legal matters related to relationships.',
        'Responsibilities': 'Specialize in family law matters, such as divorce, child custody, and adoption.',
        'Qualifications': 'PhD',
        'Work Type': 'Full-Time',
        'Preference': 'Female',
        'Job Title': 'Litigation Attorney',
        'Role': 'Family Law Attorney',
        'Company Size': 10000,
    },
    {
        'Job Description': 'Urgently hiring a educational specialist in early-childhood development and special education curriculums.',
        'Responsibilities': 'Develop and implement educational programs for children with special needs.',
        'Qualifications': 'Masters',
        'Work Type': 'Part-Time',
        'Preference': 'Both',
        'Job Title': 'Special Education Teacher',
        'Role': 'Educational Specialist',
        'Company Size': 500,
    },
]

# Evaluate each posting in turn, separated by blank lines in the output
for sample_point in data_points:
    eval_sample_data(sample_point)
    print('\n')

{'Company Size': 10000,
 'Job Description': 'Family Law Attorneys deal with legal matters related to '
                    'relationships.',
 'Job Title': 'Litigation Attorney',
 'Preference': 'Female',
 'Qualifications': 'PhD',
 'Responsibilities': 'Specialize in family law matters, such as divorce, child '
                     'custody, and adoption.',
 'Role': 'Family Law Attorney',
 'Work Type': 'Full-Time'}
Word2Vec Predicted Skills: [('Court representation', 'Family law', 'Mediation')]
TF-IDF Predicted Skills: [()]


{'Company Size': 500,
 'Job Description': 'Urgently hiring a educational specialist in '
                    'early-childhood development and special education '
                    'curriculums.',
 'Job Title': 'Special Education Teacher',
 'Preference': 'Both',
 'Qualifications': 'Masters',
 'Responsibilities': 'Develop and implement educational programs for children '
                     'with special needs.',
 'Role': 'Educational Specialist',
 'Work Type': 'Par