In [4]:
# MLflow tracking store and the run whose artifacts this notebook loads
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_RUN_ID = "22a5af9a6b784757b2151eb7ef1eed47"

# File names of the pickled artifacts, joined onto the run's artifact location
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

# YAML file describing which skills belong to which skills cluster
CLUSTERS_YAML_PATH = "../data/processed/features_skills_clusters_description.yaml"

In [5]:
import os 
import sklearn
import pickle
import yaml

import pandas as pd

import mlflow
from mlflow.tracking import MlflowClient

# Initialize

# 1. MLflow

In [6]:
# Initialize client and experiment
# Point MLflow at the tracking store configured in MLFLOW_TRACKING_URI.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# NOTE(review): `client` is not used in the cells visible here — confirm it is still needed.
client = MlflowClient()

# Look up the logged run and keep the location of its artifacts.
run = mlflow.get_run(MLFLOW_RUN_ID)
# NOTE(review): `artificats_path` is a misspelling of "artifacts"; the name is kept
# because the model- and data-loading cells reference it.
# Those cells pass this value to os.path.join/open, which assumes it is a plain
# filesystem path rather than a URI with a scheme — TODO confirm for this store.
artificats_path = run.info.artifact_uri

## Load model

In [7]:
# Load model
# The artifact is unpickled directly; pickle.load can execute arbitrary code,
# so only load artifacts from runs you trust.
model_path = os.path.join(artificats_path, LOG_MODEL_PKL)
with open(model_path, "rb") as f:
    model = pickle.load(f)

# Display the loaded object (a dict; its 'model_object' entry is read in a later cell).
model

{'model_description': 'Random Forest: with PCA + Hyperparamter tuning',
 'model_details': "GridSearchCV(estimator=Pipeline(steps=[('robustscaler', RobustScaler()),\n                                       ('pca', PCA()),\n                                       ('randomforestclassifier',\n                                        RandomForestClassifier(n_jobs=8,\n                                                               random_state=0,\n                                                               verbose=1))]),\n             param_grid=[{'pca__n_components': [0.7, 0.85, 0.95],\n                          'randomforestclassifier__max_depth': [3, 10, None],\n                          'randomforestclassifier__n_estimators': [250, 500]}])",
 'model_object': GridSearchCV(estimator=Pipeline(steps=[('robustscaler', RobustScaler()),
                                        ('pca', PCA()),
                                        ('randomforestclassifier',
                                      

In [8]:
# Load data pkl
# The artifact is unpickled directly; pickle.load can execute arbitrary code,
# so only load artifacts from runs you trust.
data_path  = os.path.join(artificats_path, LOG_DATA_PKL)
with open(data_path, 'rb') as handle:
    data = pickle.load(handle)

# Show which entries the logged data dict provides.
data.keys()

dict_keys(['data_path', 'training_indices', 'test_indices', 'features_names', 'targets_names'])

In [9]:
# Pull the estimator out of the model dict and wrap the feature/target
# labels from the data dict as Series
classifier = model["model_object"]
features_names = pd.Series(data["features_names"])
targets_names = pd.Series(data["targets_names"])

In [10]:
# Display the estimator taken from model['model_object']
classifier

______

## Load skills clusters

In [11]:
# Load skills clusters: a mapping of cluster name -> list of skill names.
# The file is opened explicitly as UTF-8 so that decoding does not depend on
# the platform's locale encoding.
with open(CLUSTERS_YAML_PATH, "r", encoding="utf-8") as stream:
    clusters_config = yaml.safe_load(stream)

clusters_config

{'skills_group_0': ['Go',
  'Rust',
  'DynamoDB',
  'PostgreSQL',
  'Ansible',
  'Docker',
  'Git',
  'Kubernetes',
  'Terraform'],
 'skills_group_1': ['PHP',
  'MariaDB',
  'MySQL',
  'SQLite',
  'Drupal',
  'Laravel',
  'Symfony',
  'Vue.js'],
 'skills_group_10': ['SQL', 'VBA', 'Microsoft SQL Server'],
 'skills_group_11': ['Cassandra', 'Couchbase', 'Elasticsearch', 'Redis'],
 'skills_group_12': ['Svelte', 'Deno'],
 'skills_group_13': ['Groovy', 'Java', 'Kotlin', 'Oracle', 'Spring'],
 'skills_group_14': ['TypeScript', 'Angular', 'Angular.js', 'Cordova'],
 'skills_group_15': ['Ruby', 'Ruby on Rails'],
 'skills_group_16': ['Clojure', 'Haskell', 'LISP'],
 'skills_group_17': ['Scala', 'Apache Spark', 'Hadoop'],
 'skills_group_18': ['Bash/Shell', 'Perl'],
 'skills_group_19': ['Dart', 'Firebase', 'Flutter'],
 'skills_group_2': ['C#',
  'F#',
  'PowerShell',
  'ASP.NET',
  'ASP.NET Core ',
  '.NET Core / .NET 5',
  '.NET Framework',
  'Xamarin'],
 'skills_group_20': ['Objective-C', 'Swift'],

In [12]:
# Flatten the {cluster: [skills]} mapping into one (cluster, skill) pair per row
molten_clusters = [
    (group, member)
    for group in clusters_config
    for member in clusters_config[group]
]

clusters_df = pd.DataFrame(data=molten_clusters, columns=["cluster_name", "skill"])
clusters_df

Unnamed: 0,cluster_name,skill
0,skills_group_0,Go
1,skills_group_0,Rust
2,skills_group_0,DynamoDB
3,skills_group_0,PostgreSQL
4,skills_group_0,Ansible
...,...,...
92,skills_group_8,IBM DB2
93,skills_group_9,Assembly
94,skills_group_9,C
95,skills_group_9,C++


________

## Predict sample entry

In [13]:
# Skills of the sample entry to score
sample_skills = ["Scala", "Hadoop", "Python"]

In [14]:
# Verify that every sample skill is one of the names in `features_names`.
# A skill that is not listed there cannot get a one-hot column further down,
# so stop here instead of predicting on a silently truncated profile.
sample_skills_known = pd.Series(sample_skills).isin(features_names)
unknown_skills = [
    skill for skill, known in zip(sample_skills, sample_skills_known) if not known
]
assert not unknown_skills, f"Skills not found in features_names: {unknown_skills}"

sample_skills_known

0    True
1    True
2    True
dtype: bool

### 1. Recreate cluster features

In [15]:
# Mark, for each (cluster, skill) row, whether the skill is one of the sample's skills
sample_clusters = clusters_df.assign(
    sample_skills=clusters_df["skill"].isin(sample_skills)
)
sample_clusters

Unnamed: 0,cluster_name,skill,sample_skills
0,skills_group_0,Go,False
1,skills_group_0,Rust,False
2,skills_group_0,DynamoDB,False
3,skills_group_0,PostgreSQL,False
4,skills_group_0,Ansible,False
...,...,...,...
92,skills_group_8,IBM DB2,False
93,skills_group_9,Assembly,False
94,skills_group_9,C,False
95,skills_group_9,C++,False


In [16]:
# Count, per cluster, how many of its skills appear in the sample
cluster_features = (
    sample_clusters["sample_skills"]
    .groupby(sample_clusters["cluster_name"])
    .sum()
)
cluster_features

cluster_name
skills_group_0     0
skills_group_1     0
skills_group_10    0
skills_group_11    0
skills_group_12    0
skills_group_13    0
skills_group_14    0
skills_group_15    0
skills_group_16    0
skills_group_17    2
skills_group_18    0
skills_group_19    0
skills_group_2     0
skills_group_20    0
skills_group_21    0
skills_group_22    0
skills_group_23    0
skills_group_3     0
skills_group_4     1
skills_group_5     0
skills_group_6     0
skills_group_7     0
skills_group_8     0
skills_group_9     0
Name: sample_skills, dtype: int64

### 2. Create one-hot encoded skills

In [17]:
# Feature names that are not cluster names are the individual skills
is_cluster_feature = features_names.isin(cluster_features.index)
skills_names = features_names.loc[~is_cluster_feature]
skills_names

0                     APL
1                Assembly
2              Bash/Shell
3                       C
4                      C#
              ...        
120                   Vim
121         Visual Studio
122    Visual Studio Code
123              Webstorm
124                 Xcode
Length: 125, dtype: object

In [18]:
# 1 where the skill is part of the sample, 0 otherwise, labelled by skill name
in_sample = skills_names.isin(sample_skills)
ohe_skills = pd.Series(
    data=in_sample.astype(int).tolist(),
    index=skills_names,
)
ohe_skills

APL                   0
Assembly              0
Bash/Shell            0
C                     0
C#                    0
                     ..
Vim                   0
Visual Studio         0
Visual Studio Code    0
Webstorm              0
Xcode                 0
Length: 125, dtype: int64

### 3. Combine features

In [19]:
# Stack the per-skill indicators and the per-cluster counts into one labelled vector
feature_parts = [ohe_skills, cluster_features]
features = pd.concat(feature_parts)

In [20]:
# Sort columns
# Reorder the entries by label so they follow the order of `features_names`
# (presumably the column order used at training time — TODO confirm).
features = features.loc[features_names]
features

APL               0
Assembly          0
Bash/Shell        0
C                 0
C#                0
                 ..
skills_group_5    0
skills_group_6    0
skills_group_7    0
skills_group_8    0
skills_group_9    0
Length: 149, dtype: int64

### 4. Predict

In [21]:
# Shape the single sample as a one-row 2-D array before asking for probabilities
sample_row = features.to_numpy().reshape(1, -1)
predictions = classifier.predict_proba(sample_row)
predictions

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.2s finished


[array([[0.83, 0.17]]),
 array([[0.846, 0.154]]),
 array([[0.876, 0.124]]),
 array([[0.978, 0.022]]),
 array([[0.972, 0.028]]),
 array([[0.858, 0.142]]),
 array([[0.68466667, 0.31533333]]),
 array([[0.876, 0.124]]),
 array([[0.95066667, 0.04933333]]),
 array([[0.968, 0.032]]),
 array([[0.946, 0.054]]),
 array([[0.94, 0.06]]),
 array([[0.97, 0.03]]),
 array([[0.66466667, 0.33533333]]),
 array([[0.87133333, 0.12866667]]),
 array([[0.972, 0.028]])]

In [22]:
# Each entry of `predictions` is a 1x2 array for one target; take the second
# column of its only row (presumably the positive class — TODO confirm).
positive_probs = [target_probs[0, 1] for target_probs in predictions]

# Label by target name and show the most likely targets first
target_ranking = pd.Series(positive_probs, index=targets_names)
target_ranking.sort_values(ascending=False)

Engineer, data                                   0.335333
Developer, back-end                              0.315333
Academic researcher                              0.170000
Data or business analyst                         0.154000
Developer, QA or test                            0.142000
Scientist                                        0.128667
Data scientist or machine learning specialist    0.124000
Developer, desktop or enterprise applications    0.124000
Developer, game or graphics                      0.060000
Developer, full-stack                            0.054000
Developer, embedded applications or devices      0.049333
Developer, front-end                             0.032000
Developer, mobile                                0.030000
DevOps specialist                                0.028000
System administrator                             0.028000
Database administrator                           0.022000
dtype: float64