In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json


Training

In [None]:
# Load the raw data set. JSON text is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default, and `json.load`
# parses straight from the file handle without an intermediate string.
with open('new_data_set.json', encoding='utf-8') as f:
    d = json.load(f)

In [None]:
# Merge every top-level group of `d` into one flat mapping.
# If the same inner key occurs in more than one group, the last group wins.
d2 = {}
for group in d.values():
    d2.update(group)

In [None]:
# Collect every distinct skill that appears in any entry of d2.
# The result is stored as a *sorted list* (despite the historical name): a plain
# set of strings iterates in a different order on every interpreter start
# (string hash randomization), and this order defines the feature-column order
# of the one-hot rows, the DataFrame columns and `list_of_all_skills`.
# A stable order keeps weights saved in one session valid in the next one.
set_of_all_skills = sorted(set().union(*d2.values()))


In [None]:
# One-hot encode each entry of d2 against the full skill vocabulary:
# position i is 1 when the i-th skill of set_of_all_skills belongs to the entry.
d3 = {}
for key, value in d2.items():
    # Hash-based membership instead of rescanning `value` once per vocabulary skill.
    owned = set(value)
    d3[key] = [1 if skill in owned else 0 for skill in set_of_all_skills]


In [None]:
# One row per key of d3, one column per skill (same order as the encoded rows).
df = pd.DataFrame.from_dict(
    data=d3,
    orient='index',
    columns=list(set_of_all_skills),
)

In [None]:
# Expose the row labels as a regular 'jobs' column.
df = df.assign(jobs=df.index)

In [None]:
# Features: every column except the label column.
skills_encoded = df.drop(columns=['jobs'])
# Targets: one indicator column per distinct value of 'jobs'.
jobs_encoded = pd.get_dummies(df.loc[:, 'jobs'])

In [None]:
# Hold out 20% of the rows for validation. The seed is fixed so that a
# Restart & Run All produces the same partition every time.
# NOTE(review): `jobs_encoded` is derived from the DataFrame index, which comes
# from dict keys and is therefore unique -- every job label occurs in exactly one
# row, so each held-out job has no positive example in the training split.
# Confirm that validating on unseen classes is intended.
SPLIT_SEED = 42
jobs_train, jobs_test = train_test_split(jobs_encoded, test_size=.2, random_state=SPLIT_SEED)

In [None]:
# Pick the skill rows that carry the same labels as each jobs split.
skills_train, skills_test = (
    skills_encoded.loc[split.index.to_list()]
    for split in (jobs_train, jobs_test)
)

In [None]:
# Both splits must expose the same columns before the widths are used to size
# the network. (`assert(a, b)` tests a non-empty tuple, which is always true,
# so the comparisons are written out explicitly.)
assert jobs_train.shape[1] == jobs_test.shape[1], "job column count differs between train and test"
assert skills_train.shape[1] == skills_test.shape[1], "skill column count differs between train and test"
job_count = jobs_train.shape[1]     # width of the softmax output layer
skill_count = skills_test.shape[1]  # width of the input layer

In [None]:
# Plain NumPy arrays of each split, as passed to `model.fit` in the training cell.
jobs_train_np, jobs_test_np = jobs_train.to_numpy(), jobs_test.to_numpy()
skills_train_np, skills_test_np = skills_train.to_numpy(), skills_test.to_numpy()

In [None]:
# Callbacks handed to every `fit` call.
# No `monitor` is passed, so each callback watches its library-default metric.
moniters = [
    # Writes to ./best_weights only when the monitored metric improves.
    tf.keras.callbacks.ModelCheckpoint(filepath='./best_weights', save_best_only=True),
    # Stops after 5 epochs without an improvement of at least 1e-4 and rolls the
    # model back to the weights of its best epoch.
    tf.keras.callbacks.EarlyStopping(
        restore_best_weights=True,
        patience=5,
        min_delta=1e-4,
    ),
]

In [None]:
NUM_OF_ITERATIONS = 10

# Train NUM_OF_ITERATIONS freshly built networks on the same split.
# NOTE(review): the callback objects in `moniters` are shared by all runs --
# presumably so the checkpoint keeps the best model over every run; confirm
# against the Keras callback state-reset behavior.
for i in range(NUM_OF_ITERATIONS):
    model = tf.keras.models.Sequential([
        # Linear layer as wide as the input, then two narrowing ReLU layers.
        tf.keras.layers.Dense(skill_count, input_dim=skill_count),
        tf.keras.layers.Dense(skill_count // 4, activation='relu'),
        tf.keras.layers.Dense(skill_count // 8, activation='relu'),
        # Output: one softmax unit per job column.
        tf.keras.layers.Dense(job_count, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy')

    model.fit(
        skills_train_np,
        jobs_train_np,
        validation_data=(skills_test_np, jobs_test_np),
        callbacks=moniters,
        epochs=100,
        verbose=2,
    )

Production Model

In [None]:
# Feature order of the network inputs and column order of the one-hot targets.
list_of_all_skills = [*set_of_all_skills]
list_of_all_jobs = jobs_test.columns.to_list()

In [None]:
# Serialize both vocabularies to one JSON string (shown as the cell output).
json.dumps(dict(list_of_all_jobs=list_of_all_jobs, list_of_all_skills=list_of_all_skills))

In [45]:
class Prod:
    """Inference wrapper around the trained job-recommendation network.

    Rebuilds the same layer stack as the training cell, loads saved weights and
    ranks jobs for a list of skills. It reads the notebook-level names
    ``skill_count``, ``job_count``, ``list_of_all_skills`` and
    ``list_of_all_jobs``, so the cells defining them must have run first.
    """

    def __init__(self, model_path : str = './best_weights'):
        """Build the network and load its weights from ``model_path``."""
        self.model = self.create_model()
        self.model.load_weights(model_path)

    @staticmethod
    def create_model() -> tf.keras.models.Sequential:
        """Return a compiled, untrained network.

        Layer widths are derived from the notebook-level ``skill_count`` and
        ``job_count``. The stack has to stay identical to the one built in the
        training loop, otherwise the saved weights will not fit.
        """
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(skill_count, input_dim=skill_count))
        model.add(tf.keras.layers.Dense(int(skill_count/4), activation='relu') )
        model.add(tf.keras.layers.Dense(int(skill_count/8), activation='relu') )

        # Output
        model.add(tf.keras.layers.Dense(job_count, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        return model

    def get_job_list(self, skills: list[str], job_count: int = 10):
        """Return the names of the highest-scoring jobs for ``skills``.

        Args:
            skills: Skill names. Names that are not in ``list_of_all_skills``
                contribute nothing to the encoded input. A bare string is
                treated as a single skill.
            job_count: Maximum number of jobs to return. Inside this method the
                parameter shadows the notebook-level ``job_count``.

        Returns:
            Job names ordered from highest to lowest predicted score.
        """
        # A bare string would otherwise be split into characters by set().
        if isinstance(skills, str):
            skills = [skills]

        # One-hot encode the skills; a set gives constant-time membership
        # instead of rescanning the input list for every known skill.
        known = set(skills)
        arr = np.array([[1 if skill in known else 0 for skill in list_of_all_skills]])

        # Send it to the model (a batch holding a single row)
        pred = self.model.predict(arr)

        # Label the scores of that single row with the job names
        scores = pd.Series(pred[0], index=list_of_all_jobs)

        # Return the list of careers with the largest scores, best first
        return list(scores.nlargest(job_count).index)

    

In [46]:
# Smoke test: load the saved weights and rank jobs for a handful of skills.
prod = Prod()
prod.get_job_list(
    skills=[
        'Depiction Software Deco-Con Estimator',
        'Patient management software',
        'Biometrics video game software',
        'DuPont Spies Hecker Wizard',
    ],
)



['Network and Computer Systems Administrators',
 'Insurance Sales Agents',
 'Critical Care Nurses',
 'Training and Development Specialists',
 'Biofuels/Biodiesel Technology and Product Development Managers',
 'Information Security Analysts',
 'Court Reporters and Simultaneous Captioners',
 'Actuaries',
 'Communications Teachers, Postsecondary',
 'Computer User Support Specialists']