In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
%matplotlib inline

import json

In [2]:
data = []
with open('skills.json') as f:
    for line in f:
        data.append(json.loads(line))

In [3]:
len(data)

11629

In [4]:
data[3]

{'id': '20950ab0-560d-3822-8666-d449aea1c28b',
 'skills': [{'src': {'at': 1475105660000, 'name': 'WEB', 'url': 'v2'},
   'val': 'Management'},
  {'src': {'at': 1475105660000, 'name': 'WEB', 'url': 'v2'},
   'val': 'Staff Development'},
  {'src': {'at': 1524956221000, 'name': 'web', 'url': 'v3'},
   'val': 'Management'},
  {'src': {'at': 1524956221000, 'name': 'web', 'url': 'v3'},
   'val': 'Staff Development'},
  {'src': {'at': 1526941719000, 'name': 'web', 'url': 'v4'},
   'val': 'Management'},
  {'src': {'at': 1526941719000, 'name': 'web', 'url': 'v4'},
   'val': 'Staff Development'},
  {'src': {'at': 1526941719000, 'name': 'web', 'url': 'v4'},
   'val': 'Management'},
  {'src': {'at': 1526941719000, 'name': 'web', 'url': 'v4'},
   'val': 'Staff Development'}],
 'ver': [2, 3, 4, 5]}

In [5]:
# dict mapping each user to the skills it has at each particular time stamp
user_skills = {} # str -> int -> str
for user in data:
    curr_user_skills = {}
    curr_skills = set()
    curr_time = user['skills'][0]['src']['at']
    for s in user['skills']:
        time_stamp = s['src']['at']
        if time_stamp != curr_time:
            curr_user_skills[curr_time] = curr_skills
            curr_skills = set()
            curr_time = time_stamp
        curr_skills.add(s['val'])
    curr_user_skills[curr_time] = list(curr_skills)
    user_skills[user['id']] = curr_user_skills

In [6]:
counter = 0
for user in user_skills:
    if len(user_skills[user]) > 1:
        counter += 1
print(counter)

4540


In [7]:
user_skills['20950ab0-560d-3822-8666-d449aea1c28b']

{1475105660000: {'Management', 'Staff Development'},
 1524956221000: {'Management', 'Staff Development'},
 1526941719000: ['Staff Development', 'Management']}

In [40]:
# dict mapping skill with directed edge to all other skills of weight based on # changes made
skills_added = {}
skills_deleted = {}
skills_changes = {} # str -> str -> int
count = 0
for user_id, skills in user_skills.items():
    ids = list(skills.keys())
    for s in skills[ids[0]]:
        if s not in skills_changes:
            skills_changes[s] = {}
            skills_added[s] = {}
            skills_deleted[s] = {}
    for i in range(1, len(ids)):
        curr = ids[i]
        prev = ids[i-1]
        for s in skills[curr]:
            if s not in skills_changes:
                skills_added[s] = {}
                skills_deleted[s] = {}
                skills_changes[s] = {}
            # skill added between prev and curr
            if s not in skills[prev]:
                for temp in skills[prev]:
                    if s in skills_changes[temp]:
                        skills_changes[temp][s] += 1
                    else:
                        skills_changes[temp][s] = 1
                    if s in skills_added[temp]:
                        skills_added[temp][s] += 1
                    else:
                        skills_added[temp][s] = 1
        for s in skills[prev]:
            # skill deleted between prev and curr
            if s not in skills[curr]:
                for temp in skills[prev]:
                    if temp == s:
                        continue
                    if s in skills_changes[temp]:
                        skills_changes[temp][s] -= 1
                    else:
                        skills_changes[temp][s] = -1
                    if s in skills_deleted[temp]:
                        skills_deleted[temp][s] -= 1
                    else:
                        skills_deleted[temp][s] = -1
    

In [41]:
print(len(skills_changes))
print(len(skills_added))
print(len(skills_deleted))

22862
22862
22862


In [42]:
skills_changes['Tensorflow']

{'Algorithms': -1,
 'Amazon RDS': -1,
 'Apache': -1,
 'CSS': -1,
 'Data Extraction': -1,
 'Data Structures': -1,
 'Django': 0,
 'Git': -1,
 'HTML': -1,
 'Jasmine': -1,
 'Java': -1,
 'JavaScript': -1,
 'Linux': 0,
 'Maven': -1,
 'MySQL': 0,
 'Nginx': -1,
 'Node.js': -1,
 'Python': 0,
 'React': 0,
 'Scikit-Learn': -1}

In [43]:
skills_added['Tensorflow']

{'Django': 1, 'Linux': 1, 'MySQL': 1, 'Python': 1, 'React': 1}

In [44]:
skills_deleted['Tensorflow']

{'Algorithms': -1,
 'Amazon RDS': -1,
 'Apache': -1,
 'CSS': -1,
 'Data Extraction': -1,
 'Data Structures': -1,
 'Django': -1,
 'Git': -1,
 'HTML': -1,
 'Jasmine': -1,
 'Java': -1,
 'JavaScript': -1,
 'Linux': -1,
 'Maven': -1,
 'MySQL': -1,
 'Nginx': -1,
 'Node.js': -1,
 'Python': -1,
 'React': -1,
 'Scikit-Learn': -1}

In [50]:
def draw_skill_graph(skill):
    G = nx.DiGraph()
    for node, weight in skills_added[skill].items():
        if weight not in [-1, 0, 1]:
            G.add_edges_from([(skill, node)], weight=weight)
    return G

In [None]:
def draw_graph():
    G = nx.DiGraph()
    for skill in skills_added:
        for node, weight in skills_deleted[skill].items():
            if weight < -20:
                G.add_edges_from([(skill, node)], weight=weight)
    return G

In [None]:
G = draw_graph()
pos = nx.spring_layout(G)
nx.draw(G, pos, with_labels=True)
edge_labels = nx.get_edge_attributes(G,'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)