In [1]:
import pandas as pd
import numpy as np
from math import log2
from scipy import sparse
import markov_clustering as mc
import feather

# import igraph as ig
import networkx as nx

import plotly as py
from plotly.graph_objs import *

# Switch plotly into offline notebook mode for this kernel session.
# NOTE(review): connected=True presumably makes the notebook load plotly.js
# from the network instead of embedding it — confirm against the plotly docs.
py.offline.init_notebook_mode(connected=True)

In [2]:
# Location of the skill co-occurrence matrix (feather format).
# Alternative location left disabled by the author:
#   '/run/media/boonteck/FAEF-9B11/Skill List Building/co_occurrenceMat.feather'
cooccurrence_path = 'E:/Skill List Building/co_occurrenceMat.feather'

# Read the matrix and use its own column labels as the row index, so rows and
# columns share the same keys (this only works if the frame has as many rows
# as columns).
raw_matrix = feather.read_dataframe(cooccurrence_path)
co_occurenceMat = raw_matrix.set_index(raw_matrix.columns)

In [119]:
# Normalise the raw co-occurrence values into two similarity matrices:
#   undirected[i, j]   = a_ij / (row_sum_i + col_sum_j - a_ij)
#   out_directed[i, j] = a_ij / row_sum_i
# where a_ij is the entry in row i, column j of the co-occurrence matrix.
# A larger proportion means a stronger relationship; the conversion to a
# weight / distance is done separately by neg_log / dir_conv below.
# The arithmetic runs on the underlying numpy array (the author noted that
# doing it on the pandas frame misbehaved on parts of the matrix) and the
# result is wrapped back into a DataFrame keyed by the matrix index.
co_occur_np = co_occurenceMat.values


def _positive_finite(x):
    """Return True when x is strictly positive and not infinite (NaN gives False)."""
    return bool(x > 0) and not np.isinf(x)


def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, giving 0 wherever the denominator is 0.

    Plain division would yield NaN (0/0) for such cells and emit a
    RuntimeWarning; NaN cells would then be kept as explicit entries when the
    matrix is converted to a sparse matrix for clustering.
    """
    numerator, denominator = np.broadcast_arrays(
        np.asarray(numerator, dtype = float),
        np.asarray(denominator, dtype = float)
    )
    ratio = np.zeros(numerator.shape, dtype = float)
    np.divide(numerator, denominator, out = ratio, where = denominator != 0)
    return ratio


# similarity -> weight: -log2(x), i.e. lower match, higher weight.
# Non-positive, NaN and infinite inputs map to 0.
neg_log = np.vectorize(lambda x: -1*log2(x) if _positive_finite(x) else 0, otypes = [float])
# similarity -> distance: 1 - x, with the same fallback to 0.
dir_conv = np.vectorize(lambda x: 1-x if _positive_finite(x) else 0, otypes = [float])

row_totals = co_occur_np.sum(axis = 1, keepdims = True)
col_totals = co_occur_np.sum(axis = 0, keepdims = True)

undirected = pd.DataFrame(
    _safe_ratio(co_occur_np, row_totals + col_totals - co_occur_np),
    index = co_occurenceMat.index,
    columns = co_occurenceMat.index
)

out_directed = pd.DataFrame(
    _safe_ratio(co_occur_np, row_totals),
    index = co_occurenceMat.index,
    columns = co_occurenceMat.index
)

In [266]:
# Markov Clustering (MCL) on the undirected similarity network.
# Inflation is kept in the range (1, 2].
# NOTE: MCL documents inflation as the granularity control — a higher
# inflation gives finer (more, smaller) clusters, a lower one gives coarser
# (fewer, larger) clusters.
result = mc.run_mcl(sparse.csr_matrix(undirected.values), inflation = 1.11)

# Flatten the clusters into (matrix position, cluster id) pairs; cluster ids
# are assigned in the order get_clusters returns the clusters.
found_clusters = mc.get_clusters(result)
clust_match = [
    (skill, cluster_id)
    for cluster_id, members in enumerate(found_clusters)
    for skill in members
]
clust_num = len(found_clusters)  # total number of clusters

clusters = pd.DataFrame(clust_match, columns = ['index', 'cluster'])
# Translate every matrix position into its skill label with one array lookup.
clusters['skill'] = undirected.index.values[clusters['index'].values.astype(int)]

In [247]:
# The most generic skills end up packed together in a single cluster; take the
# cluster that contains 'Teamwork' as that generic cluster.
# The current generic / technical split uses entropy across job industries and
# hub score, but this cluster could be a viable alternative as well.
teamwork_cluster_ids = clusters.loc[clusters.skill == 'Teamwork', 'cluster'].tolist()
generic = clusters[clusters.cluster == teamwork_cluster_ids[0]]

In [330]:
# Skill(s) whose cluster(s) are extracted for visualisation.
observe_skill = ['Adobe Photoshop']

# Long-format edge list (source, target, weight) holding every non-zero
# similarity. Zeros are turned into NaN and dropped; the explicit dropna()
# keeps the result independent of whether stack() discards NaN on its own,
# and rename_axis avoids relying on the implicit 'level_0'/'level_1' names.
skill_net = (
    undirected
    .replace(to_replace = 0, value = np.nan)
    .stack()
    .dropna()
    .rename_axis(['source', 'target'])
    .reset_index(name = 'weight')
)

# neg_log is already vectorised, so give it the whole column at once rather
# than calling it once per element through Series.apply.
skill_net['weight'] = neg_log(skill_net['weight'].values)

# Every skill that shares a cluster with one of the observed skills.
observed_clusters = clusters.loc[clusters.skill.isin(observe_skill), 'cluster'].tolist()
subset_skills = clusters.loc[clusters.cluster.isin(observed_clusters), 'skill'].tolist()

# Keep only the edges whose two endpoints are both inside that subset.
inside_subset = skill_net.source.isin(subset_skills) & skill_net.target.isin(subset_skills)
skill_net = skill_net[inside_subset].reset_index(drop = True)

cluster_subset = clusters[clusters.skill.isin(subset_skills)].reset_index(drop = True)

# Node id of a skill = its row position in cluster_subset. A single dict
# lookup replaces rebuilding and scanning the skill list for every edge
# endpoint; setdefault keeps the first position when a skill appears more
# than once, which is what list.index() returned.
skill_position = {}
for position, skill in enumerate(cluster_subset.skill):
    skill_position.setdefault(skill, position)

skill_net['source_id'] = skill_net.source.map(skill_position)
skill_net['target_id'] = skill_net.target.map(skill_position)

In [331]:
# Node and link records for the plot.
# The nodes are kept in cluster_subset's own row order: the link endpoints
# (source_id / target_id) are row positions in cluster_subset, so labels[k]
# and group[k] must describe row k. Sorting the nodes by 'index' here would
# attach labels and colours to the wrong points whenever cluster_subset is
# not already in that order (e.g. when several clusters are selected).
data = dict()
data['nodes'] = cluster_subset[['cluster','skill']].rename(
    columns = {'cluster': 'group', 'skill' : 'name'}
    ).to_dict(orient = 'records')

data['links'] = skill_net[['source_id', 'target_id', 'weight']].rename(
    columns = {'source_id' : 'source', 'target_id': 'target', 'weight' : 'value'}
    ).to_dict(orient = 'records')

L = len(data['links'])  # number of links

# Per-node hover label and colour group, in node-id order.
labels = [node['name'] for node in data['nodes']]
group = [node['group'] for node in data['nodes']]

N = len(data['nodes'])  # number of nodes

In [334]:
# Build the graph from the edge list; node ids are the integer positions
# stored in source_id / target_id.
# Alternative left disabled by the author: create_using = nx.MultiDiGraph()
G = nx.from_pandas_edgelist(skill_net, source = 'source_id', target = 'target_id', edge_attr = ['weight'])

# A node that has no edge in skill_net is never created by the edge list, and
# the coordinate lookups below would fail with KeyError for it. Register
# every node id explicitly; nodes that already exist are left untouched.
G.add_nodes_from(range(N))

# Reduce the graph to a spanning tree so the plot stays readable.
# NOTE(review): the weights are -log2(similarity), so a larger weight means a
# weaker match; a *maximum* spanning tree therefore keeps the weakest links.
# Confirm this is intended (a minimum spanning tree would keep the strongest).
# Alternative left disabled by the author:
#   nx.algorithms.tree.branchings.Edmonds(G).find_optimum()
G = nx.maximum_spanning_tree(G)

# 3-D force-directed layout: node id -> coordinate array.
# NOTE(review): no random seed is fixed, so positions differ between runs.
# Alternative left disabled by the author: nx.spectral_layout(G, dim = 3)
layt = nx.spring_layout(G, dim = 3)

# Node coordinates, in node-id order.
Xn = [layt[k][0] for k in range(N)]
Yn = [layt[k][1] for k in range(N)]
Zn = [layt[k][2] for k in range(N)]

# Edge coordinates: both end points of each edge followed by None, so that a
# single line trace draws every edge as a separate segment.
Xe = []
Ye = []
Ze = []
for start, end in G.edges:
    Xe += [layt[start][0], layt[end][0], None]
    Ye += [layt[start][1], layt[end][1], None]
    Ze += [layt[start][2], layt[end][2], None]

# Edge trace: thin grey line segments without hover information.
# Plain dicts replace the deprecated graph_objs helper classes (Line, Marker).
trace1 = Scatter3d(
    x=Xe,
    y=Ye,
    z=Ze,
    mode='lines',
    line=dict(color='rgb(125,125,125)', width=1),
    hoverinfo='none'
)

# Node trace: one marker per skill, coloured by cluster id, with the skill
# name shown as text and on hover.
trace2 = Scatter3d(
    x=Xn,
    y=Yn,
    z=Zn,
    mode='markers+text',
    name='actors',
    # 'dot' is not one of the marker symbols Scatter3d accepts; 'circle' is
    # the valid name for the round marker.
    marker=dict(symbol='circle',
                size=6,
                color=group,
                colorscale='Viridis',
                line=dict(color='rgb(50,50,50)', width=0.5)
                ),
    text=labels,
    hoverinfo='text'
)

# Shared settings for the three scene axes: hide every axis decoration.
axis = dict(showbackground=False,
            showline=False,
            zeroline=False,
            showgrid=False,
            showticklabels=False,
            title=''
            )

# Figure layout. Plain dicts / lists replace the deprecated graph_objs helper
# classes (Scene, XAxis, YAxis, ZAxis, Margin, Annotations, Annotation, Font).
layout = Layout(
    # Skills are joined with ', ' (the separator used to be ' ,', which put
    # the space on the wrong side of the comma).
    title="Network Cluster of Skill: " + ', '.join(observe_skill),
    width=1200,
    height=900,
    showlegend=False,
    scene=dict(
        xaxis=dict(axis),
        yaxis=dict(axis),
        zaxis=dict(axis),
    ),
    margin=dict(
        t=100
    ),
    hovermode='closest',
    # Single, currently empty, caption anchored near the bottom-left corner.
    annotations=[
        dict(
            showarrow=False,
            text="",
            xref='paper',
            yref='paper',
            x=0,
            y=0.1,
            xanchor='left',
            yanchor='bottom',
            font=dict(
                size=14
            )
        )
    ],
)

In [335]:
# Assemble the figure from the edge and node traces. A plain list replaces
# the deprecated graph_objs.Data wrapper.
# Note: this rebinds `data`, which held the node/link dict up to this point.
data = [trace1, trace2]
fig = Figure(data=data, layout=layout)

# Write the figure to a standalone HTML file.
py.offline.plot(fig, filename='Network Cluster Graph.html')

# Inline alternative left disabled by the author:
# py.offline.iplot(fig, filename='Network Cluster Graph')


'file://C:\\Users\\mtinternet\\Jobs Bank Skill Net\\Network Cluster Graph.html'