## Imported functions from DataCleaning

### TO-DO:
- Try Word2Vec instead of TF-IDF
- Flesh out the simple model to use KMeans clustering. Include hyperparameter optimization
- Consider using a pre-trained GPT model
- Evaluate model on training/validation data instead of randomly generated data

# Data Loading and Cleaning

In [3]:
import pandas as pd
import data_cleaning as dc

from nltk.stem import WordNetLemmatizer

# File paths
raw_data_path = './data/job_descriptions.csv'
preprocessed_data_path = raw_data_path.replace('.csv', '_preprocessed.csv')

lemmatizer = WordNetLemmatizer()

# Define desired database parameters. Set to -1 to load all data.
database_size = 15000

# Seed for the row subsample. Without a fixed seed every run draws a different
# subset, which makes results irreproducible and lets rows that a cached model
# was trained on end up in a later run's test split.
sample_seed = 42

# Optionally force data to be regenerated
force_data_regeneration = False

try:
    # If force_data_regeneration is set, force an exception to reload the data
    if force_data_regeneration:
        print('Forcing data regeneration.')
        raise ValueError('Forcing data regeneration.')

    # Load the preprocessed data if it exists
    df = pd.read_csv(preprocessed_data_path)

    # A cache smaller than the requested size cannot satisfy the request, so rebuild it.
    # NOTE(review): if the raw file itself has fewer than database_size unique rows,
    # this branch will fire on every run -- confirm the raw dataset is large enough.
    if database_size != -1 and len(df) < database_size:
        print('Preprocessed file is not the expected size. Reloading data.')
        raise ValueError('Preprocessed file is not the expected size.')

    print('Preprocessed file found and loaded.')
except (FileNotFoundError, ValueError):
    # Load the raw dataset and drop exact duplicate rows *before* the expensive
    # text cleaning so duplicated rows are not preprocessed needlessly.
    raw_df = pd.read_csv(raw_data_path).drop_duplicates()

    # Clean the text data
    print('Cleaning text data...')
    df = raw_df.assign(**{
        'Preprocessed Job Description': raw_df['Job Description'].apply(
            lambda x: dc.preprocess_document(x, lemmatizer)),
        'Preprocessed Responsibilities': raw_df['Responsibilities'].apply(
            lambda x: dc.preprocess_document(x, lemmatizer)),
    })

    # Save the full preprocessed data so any database_size can be served from the cache
    df.to_csv(preprocessed_data_path, index=False)
    print('File preprocessing completed and saved.')

# Apply the size limit on BOTH paths (cache hit and regeneration) so downstream
# cells always see the same number of rows, and seed it so they see the same rows.
if database_size != -1 and len(df) > database_size:
    df = df.sample(n=database_size, random_state=sample_seed)

Preprocessed file found and loaded.


In [4]:
import text_embedding as te
import importlib
importlib.reload(dc)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models import KeyedVectors

# Define feature columns
text_features = ['Preprocessed Job Description', 'Preprocessed Responsibilities']
categorical_features = ['Qualifications', 'Job Title']
numerical_features = ['Company Size']

# Define target columns
target_columns = ['skills']

# Remove all unused columns. The explicit .copy() makes df an independent frame,
# so assigning columns to it in later cells does not raise SettingWithCopyWarning.
df = df[text_features + categorical_features + numerical_features + target_columns].copy()

# NOTE(review): every transformer below is fit on the full dataframe, i.e. before
# the train/test split performed in a later cell. Test rows therefore influence the
# fitted vocabulary/statistics -- consider fitting on the training rows only.

# Preprocess the categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
categorical_transformer.fit(df[categorical_features])

# Preprocess the numerical data (driven by numerical_features rather than a
# hard-coded column name, so the config list above stays the single source of truth)
scaler = StandardScaler()
numerical_transformer = scaler.fit(df[numerical_features])

# Prepare the text vectorizers
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=5, 
                                  max_features=5000, 
                                  ngram_range=(1, 3))
# NOTE(review): the vocabulary is learned from the job-description column only, while
# text_features also lists 'Preprocessed Responsibilities' -- confirm this is intended
# given how dc.prepare_dataset applies the vectorizer.
tfidf_vectorizer.fit(df['Preprocessed Job Description'])

In [5]:
from sklearn.model_selection import train_test_split

# Prepare the two feature matrices
word2Vec_X = te.embed_w2v_dataframe(df, word_vectors, categorical_transformer, numerical_transformer, categorical_features, numerical_features)
tfidf_X = dc.prepare_dataset(df, tfidf_vectorizer, categorical_transformer, numerical_transformer, text_features, categorical_features, numerical_features)

# Itemize the skills column into a separate series instead of overwriting
# df['skills']: the cell stays idempotent, so re-running it never feeds
# already-itemized values back through dc.preprocess_skills.
skill_lists = df['skills'].apply(dc.preprocess_skills)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(skill_lists)

# Split the dataset into training and testing sets (80% train, 20% test).
# A single call splits both feature matrices and the labels with the same row
# indices, so y_train / y_test are guaranteed to line up with both matrices
# (previously two calls relied on an identical random_state and overwrote y_*).
(X_w2v_train, X_w2v_test,
 X_tfidf_train, X_tfidf_test,
 y_train, y_test) = train_test_split(word2Vec_X, tfidf_X, y, test_size=0.2, random_state=42)

## Model Definition & Tuning

In [6]:
import model_util as mu
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

# Hyperparameter search space for the Word2Vec-feature model. The keys carry the
# 'estimator__' prefix because the random forest is wrapped in OneVsRestClassifier.
w2v_param_grid = dict(
    estimator__n_estimators=[50, 100, 200],
    estimator__max_depth=[None, 5, 10, 15],
)

# Hyperparameter search space for the TF-IDF-feature model (currently the same values)
tfidf_param_grid = dict(
    estimator__n_estimators=[50, 100, 200],
    estimator__max_depth=[None, 5, 10, 15],
)

# Settings handed to mu.load_model
scoring_metric = 'f1_samples'
folds = 3
force_reload = False

# One-vs-rest random forests: one binary forest per skill label
w2v_model = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42)
)
tfidf_model = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# mu.load_model is expected to return a previously saved model from the given
# path when one exists and otherwise to run the search and fit a new one.
w2v_model = mu.load_model(
    w2v_model, './models/w2v_model.pkl', w2v_param_grid,
    X_w2v_train, y_train, scoring_metric, folds, force_reload,
)
tfidf_model = mu.load_model(
    tfidf_model, './models/tfidf_model.pkl', tfidf_param_grid,
    X_tfidf_train, y_train, scoring_metric, folds, force_reload,
)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END estimator__max_depth=None, estimator__n_estimators=50; total time=19.9min
[CV] END estimator__max_depth=None, estimator__n_estimators=50; total time=19.9min
[CV] END estimator__max_depth=None, estimator__n_estimators=50; total time=20.0min
[CV] END estimator__max_depth=None, estimator__n_estimators=100; total time=38.4min
[CV] END estimator__max_depth=None, estimator__n_estimators=100; total time=38.5min
[CV] END estimator__max_depth=None, estimator__n_estimators=100; total time=38.6min
[CV] END .estimator__max_depth=5, estimator__n_estimators=50; total time=19.5min
[CV] END .estimator__max_depth=5, estimator__n_estimators=50; total time=19.5min
[CV] END .estimator__max_depth=5, estimator__n_estimators=50; total time=27.1min
[CV] END estimator__max_depth=None, estimator__n_estimators=200; total time=186.0min
[CV] END estimator__max_depth=None, estimator__n_estimators=200; total time=186.0min
[CV] END estimator__max_d

## Model Evaluation

In [7]:
from sklearn.metrics import classification_report

# Score both fitted models on their held-out feature matrices
y_w2v_pred = w2v_model.predict(X_w2v_test)
y_tfidf_pred = tfidf_model.predict(X_tfidf_test)

# Print a per-skill precision/recall/F1 table for each model, labelled with
# the skill names learned by the MultiLabelBinarizer.
for heading, predictions in (
    ("Word2Vec Classification Report:", y_w2v_pred),
    ("\nTF-IDF Classification Report:", y_tfidf_pred),
):
    print(heading)
    print(classification_report(y_test, predictions, target_names=mlb.classes_))

Word2Vec Classification Report:
                                                             precision    recall  f1-score   support

                                                A/B testing       1.00      1.00      1.00        19
                               A/B testing and optimization       1.00      1.00      1.00         4
                                                       AJAX       1.00      1.00      1.00         7
                                 API design and development       1.00      1.00      1.00        11
                                            API development       1.00      1.00      1.00        27
                                            API integration       1.00      1.00      1.00         9
                                              API knowledge       1.00      1.00      1.00        18
                                                       APIs       1.00      1.00      1.00         6
                          APIs and web services integratio

## Custom Examples

In [8]:
# Pick up any edits made to the helper modules since they were first imported.
# Both modules are used below, so both are reloaded (previously dc was reloaded
# twice and te not at all).
importlib.reload(dc)
importlib.reload(te)

# Define a sample datapoint
data = {
    'Job Description': 'As a staff member of the Compute Infrastructure team at LinkedIn, you will be charged with building the next-generation infrastructure and platforms for LinkedIn.',
    'Responsibilities': 'Develop predictive models to solve business problems',
    'Qualifications': 'PhD',
    'Work Type': 'Full-Time',
    'Preference': 'Male',
    'Job Title': 'Data Scientist',
    'Role': 'Data Scientist',
    'Company Size': 33428,
}
sample_df = pd.DataFrame([data])

# Apply the same text cleaning that was used to build the training data
sample_df['Preprocessed Job Description'] = sample_df['Job Description'].apply(lambda x: dc.preprocess_document(x, lemmatizer))
sample_df['Preprocessed Responsibilities'] = sample_df['Responsibilities'].apply(lambda x: dc.preprocess_document(x, lemmatizer))

# Build one feature matrix per model, reusing the transformers fitted on the training data
sample_tfidf_X = dc.prepare_dataset(sample_df, tfidf_vectorizer, categorical_transformer, numerical_transformer, text_features, categorical_features, numerical_features)
sample_w2v_X = te.embed_w2v_dataframe(sample_df, word_vectors, categorical_transformer, numerical_transformer, categorical_features, numerical_features)

In [9]:
# Word2Vec model: predict the label-indicator row for the sample, then map it
# back to skill names with the fitted MultiLabelBinarizer.
y_sample_pred_w2v = w2v_model.predict(sample_w2v_X)
w2v_pred_skills = mlb.inverse_transform(y_sample_pred_w2v)

# TF-IDF model: same two steps on the TF-IDF feature matrix
y_sample_pred_tfidf = tfidf_model.predict(sample_tfidf_X)
tfidf_pred_skills = mlb.inverse_transform(y_sample_pred_tfidf)

# Show the decoded skills for each model
print("Word2Vec Predicted Skills:", w2v_pred_skills)
print("TF-IDF Predicted Skills:", tfidf_pred_skills)

Word2Vec Predicted Skills: [()]
TF-IDF Predicted Skills: [()]
