In [42]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Input data location, relative to the notebook's working directory.
# Forward slashes are used because they resolve on Windows, Linux and macOS
# alike, whereas a backslash-separated path only works on Windows.
Data_PATH = "../data/processed/onehot_2.pkl"
# Absolute alternative when working in a codespace:
# Data_PATH = r"/workspaces/stackoverflowDeveloper/data/processed/onehot_2.pkl"

# NOTE(review): EXPORT_PATH points at the same file as Data_PATH, so writing to
# it would overwrite the input -- confirm this is intended.
EXPORT_PATH = "../data/processed/onehot_2.pkl"

# First-level column labels that group the technology (skill) indicators.
TECH_COL = ["MiscTechWorkedWith", "NEWCollabToolsWorkedWith", "DatabaseWorkedWith", "WebframeWorkedWith", "LanguageWorkedWith", "PlatformWorkedWith"]
# First-level column label that groups the job-title indicators.
JOB_COL = "DevType"
# Every first-level column label used in this notebook (skills + job titles).
COL_READ = TECH_COL + [JOB_COL]

# Load the data

In [3]:
# Read the pre-processed one-hot frame from disk and preview the first rows.
# NOTE: unpickling executes arbitrary code, so only load files from a trusted source.
df = pd.read_pickle(filepath_or_buffer=Data_PATH)
df.head(n=5)

Unnamed: 0_level_0,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,...,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType
Unnamed: 0_level_1,.NET,.NET Core,Ansible,Apache Spark,Chef,Cordova,Flutter,Hadoop,Keras,Node.js,...,"Developer, mobile",Educator,"Engineer, data","Engineer, site reliability",Engineering manager,Marketing or sales professional,Product manager,Scientist,Senior executive/VP,System administrator
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def get_skills_names(dataFrame, tech_cols=None):
    """Return the skill names found under the given technology column groups.

    The names are read straight from the column labels, so the result does not
    depend on the rows of ``dataFrame`` or on any particular job column
    being present.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame with two-level columns: the first level is the technology
        group, the second level is the individual skill name.
    tech_cols : iterable of str, optional
        First-level column labels to collect skills from, in the order the
        skills should appear. Defaults to the notebook-level ``TECH_COL``.

    Returns
    -------
    list
        Second-level column labels of every requested group, concatenated in
        the order of ``tech_cols``.

    Raises
    ------
    KeyError
        If one of the requested groups is not a column of ``dataFrame``.
    """
    if tech_cols is None:
        tech_cols = TECH_COL

    skills = []
    for tech_col in tech_cols:
        # Selecting a first-level label yields a frame whose columns are the
        # second-level labels, i.e. the skill names of that group.
        skills.extend(dataFrame[tech_col].columns)

    return skills

# Collect every skill name once, then preview the first ten.
skills = get_skills_names(dataFrame=df)
skills[0:10]

['.NET',
 '.NET Core',
 'Ansible',
 'Apache Spark',
 'Chef',
 'Cordova',
 'Flutter',
 'Hadoop',
 'Keras',
 'Node.js']

In [5]:
# Job titles are the second-level labels under the job-title column group.
# JOB_COL is used instead of repeating the "DevType" literal so the group name
# is defined in a single place.
jobs = df[JOB_COL].columns.tolist()
jobs[:10]

['Academic researcher',
 'Data or business analyst',
 'Data scientist or machine learning specialist',
 'Database administrator',
 'Designer',
 'DevOps specialist',
 'Developer, QA or test',
 'Developer, back-end',
 'Developer, desktop or enterprise applications',
 'Developer, embedded applications or devices']

In [6]:
# Keep only the skill indicators by dropping every column under the job-title
# group. DataFrame.drop returns a new frame and leaves `df` untouched, so no
# explicit .copy() is needed beforehand.
df_skills = df.drop(columns=JOB_COL)

# Standardize each skill column to zero mean and unit variance.
# Note: despite its name, the result is a NumPy array, not a DataFrame.
df_skills_scaled = StandardScaler().fit_transform(df_skills)
df_skills_scaled

array([[ 1.88612836,  2.23463294, -0.21918963, ...,  1.11994429,
        -0.36706528, -0.33665307],
       [-0.53018661, -0.44750079, -0.21918963, ..., -0.89290155,
        -0.36706528,  2.97041698],
       [-0.53018661, -0.44750079, -0.21918963, ..., -0.89290155,
        -0.36706528, -0.33665307],
       ...,
       [-0.53018661, -0.44750079, -0.21918963, ..., -0.89290155,
        -0.36706528, -0.33665307],
       [-0.53018661, -0.44750079, -0.21918963, ...,  1.11994429,
        -0.36706528, -0.33665307],
       [ 1.88612836,  2.23463294, -0.21918963, ...,  1.11994429,
        -0.36706528, -0.33665307]])

In [7]:
# Sanity check: (rows, columns) of the standardized skills array.
df_skills_scaled.shape

(64461, 101)

# Use t-SNE to visualize the data in 2 dimensions

In [52]:
# Embed the skills in 2-D with t-SNE. The scaled matrix is transposed so that
# each skill (a column of the original matrix) becomes one sample to embed.
RANDOM_STATE = 42  # fixed seed so the embedding is reproducible across runs

tsne = TSNE(n_components=2,
            perplexity=3,
            # NOTE(review): this is effectively "no iteration cap"; the run
            # presumably ends through the optimizer's own stopping criteria.
            # Consider an explicit, realistic cap -- confirm before changing.
            max_iter=10**10,
            init="pca",
            # NOTE(review): the scikit-learn docs describe the usual learning
            # rate range as [10.0, 1000.0] (or "auto"); 0.01 is far below it.
            # Left unchanged to preserve the tuned result -- confirm.
            learning_rate=0.01,
            random_state=RANDOM_STATE,
            )
skills_tsne = tsne.fit_transform(df_skills_scaled.T)

In [53]:
# Sanity check: one row per embedded skill, one column per t-SNE component.
skills_tsne.shape

(101, 2)

In [54]:
# Wrap the raw embedding in a DataFrame: rows are labelled with the original
# skill columns and the two t-SNE components are named "x" and "y".
df_skills_tsne = pd.DataFrame(
    data=skills_tsne,
    columns=["x", "y"],
    index=df_skills.columns,
)
df_skills_tsne.head()

Unnamed: 0,Unnamed: 1,x,y
MiscTechWorkedWith,.NET,-28.451099,5.894094
MiscTechWorkedWith,.NET Core,-30.989491,8.044797
MiscTechWorkedWith,Ansible,22.282867,5.071269
MiscTechWorkedWith,Apache Spark,23.707905,21.706453
MiscTechWorkedWith,Chef,26.442413,4.329106


In [55]:
# Interactive scatter plot of the embedded skills. Each point is labelled with
# its skill name, i.e. the index with its first (technology group) level dropped.
fig = px.scatter(data_frame=df_skills_tsne, x="x", y="y",
                 text=df_skills_tsne.droplevel(0).index)
# update_layout returns the figure, so leaving it as the last expression
# renders the plot in the notebook.
fig.update_layout(title="skills in 2 dimensions",
                  width=1000,
                  height=1000)

# Feature (dimensionality) reduction and clustering