In [2]:
"""
step1: filter out related-tag pairs with a small weight
step2: build the weighted graph
step3: adjust weights with adjust_weight_file
step4: find small components and put them all in the 'others' class
step5: check ignore_tags_file
step6: compute the Laplacian matrix and its eigenvectors
step7-pre1: run PageRank on the graph
step7-pre2: get the top 100 tags
step7-pre3: get the connected components of the subgraph of these 100 tags
step7: cluster with k-means, moving small groups to the 'others' class
step8: match group names, draw the graph, save files
step9: check the clustering result with the classify method
"""

import functools
from scipy.sparse.linalg import eigs
import numpy as np

from analysis.classify_tag_v1 import TagClassifier
from analysis.common import *
import logging

from data.cdn.sof_cdn import TagsCDN, TagRelatedCDN, CoreTagClfCache, CoreTagRankCache

# Configure the root logger at DEBUG level. This also lets DEBUG records from
# imported packages through (this cell's output shows records emitted by
# data.config.config and matplotlib.backends).
# NOTE(review): those records include connection settings with a password
# field - consider a higher level before sharing the notebook.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger used by the helper functions defined below.
logger = logging.getLogger(__name__)

def related_weight_filter(related_weight, min_weight=0.1, save=True, msg=None):
    """Keep only the relations whose weight is strictly above ``min_weight``.

    Each relation is indexed positionally: items 0 and 1 are the two ends of
    the relation and item 2 is its weight. The surviving relations are handed
    to ``save_step_data`` (formatted as ``"<item0> <item1> <item2>"``) together
    with ``save`` and ``msg``, and their count is logged.

    :param related_weight: iterable of indexable relations
    :param min_weight: exclusive lower bound for item 2 of a relation
    :param save: forwarded to ``save_step_data``
    :param msg: forwarded to ``save_step_data``
    :return: list of the relations that passed the threshold, in input order
    """
    def _format_relation(item):
        return f'{item[0]} {item[1]} {item[2]}'

    kept = [relation for relation in related_weight if relation[2] > min_weight]
    save_step_data(kept, step='related_weight_filted', transfer_item=_format_relation,
                   save=save, msg=msg)
    logger.info(f'related weight filter finished, related: {len(kept)}')
    return kept

def make_graph(related_weight_filted, save=True, msg=None):
    """Build an undirected weighted graph from positional relations.

    Items 0 and 1 of every relation become the end points of an edge and
    item 2, converted with ``float``, becomes its ``weight`` attribute. The
    finished graph is passed to ``save_graph`` under the name
    ``'tag-link.png'`` and the node count is logged.

    :param related_weight_filted: iterable of indexable relations
    :param save: forwarded to ``save_graph``
    :param msg: forwarded to ``save_graph``
    :return: the new ``nx.Graph``
    """
    graph = nx.Graph()
    weighted_edges = (
        (relation[0], relation[1], float(relation[2]))
        for relation in related_weight_filted
    )
    graph.add_weighted_edges_from(weighted_edges)
    save_graph(graph, 'tag-link.png', save=save, msg=msg)
    logger.info(f'create graph finish, tags: {len(graph.nodes)}')
    return graph


def weight_adjust(graph, adjust_file_path, save=True, msg=None):
    """Override or add edge weights of ``graph`` from an adjustment file.

    When ``adjust_file_path`` is falsy the graph is returned untouched.
    Otherwise the records returned by ``load_file(adjust_file_path)`` are
    applied in order: items 0 and 1 of a record name the edge and item 2 is
    the new weight. Existing edges get their ``weight`` replaced, missing
    edges are created. The graph is modified in place.

    Weights are converted with ``float`` before they are stored, the same way
    ``make_graph`` does, so the graph never mixes numeric weights with raw
    (for example textual) values coming from the file.

    :param graph: graph exposing ``has_edge``, ``add_edge`` and item access
    :param adjust_file_path: path given to ``load_file``; falsy to skip
    :param save: forwarded to ``save_graph`` and ``save_step_data``
    :param msg: forwarded to ``save_graph`` and ``save_step_data``
    :return: the same ``graph`` object
    :raises ValueError: if a weight cannot be converted to ``float``
    """
    if not adjust_file_path:
        return graph

    # NOTE(review): presumably load_file yields (tag, tag, weight) records - confirm.
    adjust_weight = load_file(adjust_file_path)
    for related in adjust_weight:
        source, target = related[0], related[1]
        weight = float(related[2])
        if graph.has_edge(source, target):
            graph[source][target]['weight'] = weight
            logger.debug(f"adjust weight: {source} {target} {weight}")
        else:
            graph.add_edge(source, target, weight=weight)
            logger.debug(f"add weight: {source} {target} {weight}")
    save_graph(graph, 'tag-adjusted.png', save=save, msg=msg)
    save_step_data(adjust_weight, 'adjust_weight.txt', save=save, msg=msg)
    return graph


def get_center_tags(graph, k):
    """Return the ``k`` nodes of ``graph`` with the highest PageRank.

    :param graph: graph passed to ``nx.pagerank``
    :param k: number of nodes to keep (used as a slice bound)
    :return: list of nodes ordered from highest to lowest rank
    """
    tagranks = nx.pagerank(graph)
    # Sorting the keys by their rank is stable, so equally ranked nodes keep
    # the order in which pagerank reported them.
    ranked_tags = sorted(tagranks, key=tagranks.get, reverse=True)
    center_tags = ranked_tags[:k]
    logger.info(f'center tags : {center_tags}')
    return center_tags


def center_tags_group(graph: nx.Graph, center_tags: list):
    """Pick one representative tag per connected group of centre tags.

    The subgraph of ``graph`` induced by ``center_tags`` is split into
    connected components; for each component the member that appears earliest
    in ``center_tags`` is selected.

    :param graph: graph containing the centre tags
    :param center_tags: list of tags; an earlier position wins inside a group
    :return: list with one tag per component, in component iteration order
    """
    induced = graph.subgraph(center_tags)
    return [
        center_tags[min(map(center_tags.index, component))]
        for component in nx.connected_components(induced)
    ]


def laplacian_eigs(G, k, nodelist=None, save=True, msg=None):
    """Return Laplacian eigenvectors belonging to the ``k`` smallest computed eigenvalues.

    The Laplacian of ``G`` (rows/columns ordered by ``nodelist`` when given)
    is decomposed with ``scipy.sparse.linalg.eigs``, asking for
    ``node_num - 2`` eigenpairs. The computed eigenvalues are sorted in
    ascending order and the eigenvectors of the first ``k`` are returned as
    columns.

    Columns are selected by position (``argsort``) rather than through a
    dictionary keyed by eigenvalue: with repeated eigenvalues a dictionary
    keeps a single index per value and the same eigenvector would be returned
    several times. Imaginary parts that are numerically zero are dropped, so
    the result is a real array whenever the spectrum is real.

    :param G: graph accepted by ``nx.laplacian_matrix``
    :param k: number of eigenvectors (columns) to keep
    :param nodelist: optional node order for the Laplacian
    :param save: when true, write the result with ``np.savetxt`` to the path
        given by ``create_step_file_path('eigvecs.txt')``
    :param msg: accepted but not used by this function
    :return: array with one row per node and at most ``k`` columns
    """
    L = nx.laplacian_matrix(G, nodelist=nodelist)
    node_num = len(nodelist) if nodelist else len(G.nodes)
    # NOTE(review): eigs() defaults to which='LM' (largest magnitude), so
    # requesting node_num - 2 pairs may leave out the two smallest-magnitude
    # eigenvalues - confirm this is intended; for a symmetric Laplacian
    # scipy.sparse.linalg.eigsh is the usual choice.
    eigval, eigvec = eigs(L, node_num - 2)
    # Stable argsort keeps equal eigenvalues in their computed order and gives
    # every one of them its own column index.
    order = np.argsort(eigval, kind='stable')
    logger.debug(eigval[order])
    eigvecs = np.real_if_close(eigvec[:, order[:k]], tol=1000)
    if save:
        file_path = create_step_file_path('eigvecs.txt')
        np.savetxt(file_path, eigvecs)
    logger.info('gen eigvecs finished')
    return eigvecs


def graph_clustering(graph, k=8, individual_tags=None, ignore_tags=None, min_component=3, save=True, msg=None):
    """Cluster the nodes of ``graph`` with k-means on Laplacian eigenvectors.

    Steps:

    1. Nodes of connected components smaller than ``min_component`` go to the
       ``'others'`` class; so do all ``ignore_tags``.
    2. The remaining nodes are embedded with ``laplacian_eigs`` and clustered
       with ``KMeans``. When ``individual_tags`` is given, the embedded rows
       of those tags are used as initial centres and ``k`` becomes the number
       of usable seeds.
    3. Clusters smaller than ``min_component`` are moved to ``'others'``;
       every other cluster is sorted by PageRank (highest first) and keyed by
       its top-ranked node.

    Differences from the previous implementation:

    * ``ignore_tags`` may be any iterable, not only a ``set``.
    * ``save`` and ``msg`` are forwarded to ``laplacian_eigs``, so
      ``save=False`` no longer writes the eigenvector file.
    * Seed tags that are not among the clustered nodes (moved to ``'others'``
      or absent from the graph) are skipped and ``k`` follows the number of
      seeds actually used, keeping ``k`` and the initial centres consistent.
      If no seed is usable, unseeded k-means with the caller's ``k`` is run.
    * The renamed clusters are collected in a fresh dictionary, so a
      representative node can never collide with a k-means label key.

    :param graph: graph to cluster
    :param k: number of clusters when no usable seed tags are supplied
    :param individual_tags: optional sequence of seed tags for k-means
    :param ignore_tags: optional iterable of tags forced into ``'others'``
    :param min_component: minimum size of a component / cluster to be kept
    :param save: forwarded to the save helpers and to ``laplacian_eigs``
    :param msg: forwarded to the save helpers and to ``laplacian_eigs``
    :return: ``(result, tagranks)`` where ``result`` maps a representative
        node (or ``'others'``) to a list of nodes and ``tagranks`` is the
        PageRank mapping of ``graph``
    """
    # Split nodes into those worth clustering and those sent to 'others'.
    small_node_sets = set()
    nodelist = set()
    for c in nx.connected_components(graph):
        if len(c) < min_component:
            small_node_sets |= c
            logger.debug('small component ' + str(c))
        else:
            nodelist |= c
    # move ignore tags to others
    if ignore_tags:
        ignore_tags = set(ignore_tags)
        small_node_sets |= ignore_tags
        nodelist -= ignore_tags
        save_step_data(ignore_tags, 'ignore_tags.txt', save=save, msg=msg)
    nodelist = list(nodelist)

    # Resolve seed tags to embedding rows before choosing k, so that the
    # number of clusters always equals the number of initial centres.
    node_index = {node: row for row, node in enumerate(nodelist)}
    seed_rows = []
    if individual_tags:
        seed_rows = [node_index[tag] for tag in individual_tags if tag in node_index]
        skipped = len(individual_tags) - len(seed_rows)
        if skipped:
            logger.warning(f'{skipped} individual tags are not among the clustered nodes and are not used as seeds')
        if seed_rows:
            k = len(seed_rows)

    # get laplacian matrix and do clustering
    eigvecs = laplacian_eigs(graph, k, nodelist=nodelist, save=save, msg=msg)
    if seed_rows:
        labels = KMeans(k, init=eigvecs[seed_rows]).fit(eigvecs).labels_
    else:
        labels = KMeans(k).fit(eigvecs).labels_

    # k-means labels follow the row order of the embedding, i.e. nodelist order
    clusters = {}
    for node, label in zip(nodelist, labels):
        clusters.setdefault(label, []).append(node)

    # mark every class with the most important tag of the class
    tagranks = nx.pagerank(graph)
    result = {}
    for members in clusters.values():
        if len(members) < min_component:
            # move small class to others
            small_node_sets.update(members)
            continue
        members.sort(key=lambda tag: tagranks[tag], reverse=True)
        result[members[0]] = members
    # add small components to final clf result
    result['others'] = list(small_node_sets)
    save_graph(graph, 'tag-clf.png', clf=result, save=save, msg=msg)
    save_step_data(result, 'tag-clf.json', save=save, msg=msg)
    save_step_data(tagranks, 'tag-rank.json', save=save, msg=msg)
    return result, tagranks


def save_to_cdn(tag_clf, tag_rank):
    """Store the clustering result and the tag ranks in their caches.

    ``tag_clf`` is written through ``CoreTagClfCache.set`` first, then
    ``tag_rank`` through ``CoreTagRankCache.set``.
    """
    for cache, payload in ((CoreTagClfCache, tag_clf), (CoreTagRankCache, tag_rank)):
        cache.set(payload)


# Notebook-level settings for this run.
# NOTE(review): min_weight / min_component share their names with parameters of
# related_weight_filter and graph_clustering - presumably they are meant to be
# passed to those functions; confirm against the pipeline call.
min_weight = 0.3
min_component = 5
# ClusterMethod is not imported explicitly in this cell; presumably it comes
# from the star import of analysis.common - TODO confirm.
clustering_method = ClusterMethod.kmeans
# NOTE(review): absolute, machine-specific Windows path; the notebook will not
# run unchanged on another machine.
output_path = r'E:\SOF\file\tag_clustering_v4'

2018-05-09 14:18:11,350 data.config.config DEBUG: db setting: {'host': 'localhost', 'port': 3306, 'user': 'root', 'passwd': '<redacted>', 'database': 'sof_basic', 'max_connections': 10, 'charset': 'utf8'}


2018-05-09 14:18:11,355 data.config.config DEBUG: redis setting: {'host': 'localhost', 'port': 6379, 'password': None, 'db': 0, 'maxsize': 10}


2018-05-09 14:18:11,374 data.config.config DEBUG: db setting: {'host': 'localhost', 'port': 3306, 'user': 'root', 'passwd': '<redacted>', 'database': 'sof_analysis', 'max_connections': 10, 'charset': 'utf8'}


2018-05-09 14:18:11,378 data.config.config DEBUG: redis setting: {'host': 'localhost', 'port': 6379, 'password': None, 'db': 0, 'maxsize': 10}


2018-05-09 14:18:12,050 matplotlib.backends DEBUG: backend module://ipykernel.pylab.backend_inline version unknown


In [23]:
# Build the tag graph from the pre-filtered relations and list one
# representative per connected group of the 50 highest-ranked tags.
# The threshold comes from the notebook-level min_weight (0.3) instead of a
# repeated literal, so the setting only has to be changed in one place.
related_weight = TagRelatedCDN.get_tag_related_filtered(min_weight)
graph = make_graph(related_weight, save=False)
center_tag = get_center_tags(graph, 50)
cores = center_tags_group(graph, center_tag)
print(cores)

2018-05-09 14:56:51,106 __main__ INFO: create graph finish, tags: 1399


2018-05-09 14:56:51,628 __main__ INFO: center tags : ['android', 'java', 'c#', 'javascript', 'python', 'ios', 'c++', 'php', 'css', 'jquery', 'sql', 'ruby-on-rails', 'r', 'html', 'c', 'angular', 'asp.net', 'sql-server', 'git', 'objective-c', 'swift', 'iphone', 'node.js', 'algorithm', 'django', 'angularjs', 'mysql', 'swing', 'amazon-web-services', 'asp.net-mvc', 'azure', '.net', 'wpf', 'spring', 'xml', 'apache-spark', 'ruby', 'laravel', 'linux', 'hadoop', 'xcode', 'excel', 'bash', 'firebase', 'unit-testing', 'machine-learning', 'entity-framework', 'oracle', 'qt', 'facebook']


['c#', 'java', 'linux', 'mysql', 'javascript', 'sql', 'ios', 'facebook', 'apache-spark', 'git', 'xml', 'ruby-on-rails', 'azure', 'php', 'python', 'excel', 'bash', 'c++', 'amazon-web-services', 'algorithm', 'angular', 'android', 'unit-testing', 'hadoop', 'r', 'c']


In [24]:
# Collect the nodes of every connected component that has at least 10 members,
# then decompose the Laplacian restricted to (and ordered by) those nodes.
nodelist = [
    node
    for component in nx.connected_components(graph)
    if len(component) >= 10
    for node in component
]
L = nx.laplacian_matrix(graph, nodelist=nodelist)
eigval, eigvec = eigs(L, len(nodelist) - 2)

In [28]:
# Look for the widest gap between consecutive sorted eigenvalues.
eigval_sorted = sorted(eigval)
# Only the first 1000 gaps are inspected; the bound is capped by the number of
# available gaps so that a shorter spectrum no longer raises IndexError.
gap_count = min(1000, len(eigval_sorted) - 1)
temp = [abs(eigval_sorted[i] - eigval_sorted[i + 1]) for i in range(gap_count)]
# k is the 0-based index of the eigenvalue just below the widest gap.
# NOTE(review): if k is meant as a cluster count (eigengap heuristic), the
# number of eigenvalues below the gap is k + 1 - confirm before using it.
k = temp.index(max(temp))
print(k)
print(eigval_sorted[:50])

16
[(-3.1825266467135454e-15+0j), (0.0051477361342167635+0j), (0.0066049747327374805+0j), (0.006817871030448737+0j), (0.010118338890117798+0j), (0.012688235165480443+0j), (0.014416873666944937+0j), (0.018556120425836285+0j), (0.02004568982394964+0j), (0.02265716142254052+0j), (0.024556714694509298+0j), (0.033322977189318106+0j), (0.034632156374678716+0j), (0.03588821570194735+0j), (0.044936076346399886+0j), (0.04717312433130673+0j), (0.047867668034975996+0j), (0.0681514622906278+0j), (0.06987999582298814+0j), (0.07597293139839012+0j), (0.07959181403439013+0j), (0.08139612170329752+0j), (0.09227315167488148+0j), (0.09575882682834454+0j), (0.10129616781899947+0j), (0.10266596117257776+0j), (0.11828053505906513+0j), (0.12052993595224656+0j), (0.1220191010299754+0j), (0.12736977837080585+0j), (0.13314729810598985+0j), (0.13812862455891806+0j), (0.13937720189669356+0j), (0.1408567485735605+0j), (0.14455478498506694+0j), (0.14789177399882847+0j), (0.1545455484667187+0j), (0.15879122856715389

In [18]:
# Scratch cell: calls the built-in sorted() on a literal list; it reads no
# notebook state.
sorted([1,2,3])

[1, 2, 3]