In [1]:
import time
import pickle

import functions_reading as reading
import functions_iterative_clustering as iterative_clustering
import functions_metrics as metrics
import functions_select_cluster as select_cluster

def c_Cs_D(year, refferences_d, path_network, initial_resolution, clusters_per_level, max_depth, resolution_factor, beta_l, encoding=None, errors=None):
    """Create the base of the clustering solution dictionary.

    The network file is read and parsed, an igraph network is built from it,
    and everything is stored together with the clustering parameters.

    Parameters
    ----------
    year : int
        Publication year of the systematic reviews. Stored under 'YEAR'.

    refferences_d : dict
        Dictionary with the references of each topic. Stored under
        't_references_d'.

    path_network : str
        Path of the file of the network.

    initial_resolution : float
        Resolution of the clustering algorithm in the first level.

    clusters_per_level : int
        Maximum number of clusters per branch per level (including the initial level).

    max_depth : int
        Lowest level of the clustering.

    resolution_factor : float
        Factor by which the value of the resolution increases at each level.

    beta_l : list
        Beta values, stored unchanged under 'beta_l'.
        NOTE(review): presumably the beta weights of the F-scores computed
        later in the pipeline -- confirm against functions_metrics.

    encoding : str, optional
        Parameter of p_Tab_Delimited(). Forwarded only when it is not None.

    errors : str, optional
        Parameter of p_Tab_Delimited(). Forwarded only when it is not None.

    Returns
    -------
    cs : dict
        Dictionary with the data of the clustering solution. Keys: 'YEAR',
        'INITIAL_RESOLUTION', 'CLUSTERS_PER_LEVEL', 'PATH_NETWORK',
        't_references_d', 'parsed_network', 'igraph_network', 'max_depth',
        'resolution_factor' and 'beta_l'.

    Notes
    -------
    t_references_d = dict of set (int -> set -> int). It only contains the references of the systematic reviews published on the year 'year'
    """
    # Forward only the options the caller actually supplied, so the default
    # call stays exactly p_Tab_Delimited(path_network) as before.
    read_options = {}
    if encoding is not None:
        read_options['encoding'] = encoding
    if errors is not None:
        read_options['errors'] = errors

    cs = {}
    cs['YEAR'] = year
    cs['INITIAL_RESOLUTION'] = initial_resolution
    cs['CLUSTERS_PER_LEVEL'] = clusters_per_level
    cs['PATH_NETWORK'] = path_network
    cs['t_references_d'] = refferences_d
    tab_del_net = reading.p_Tab_Delimited(path_network, **read_options)
    cs['parsed_network'] = reading.parse_Network(tab_del_net)
    cs['igraph_network'] = reading.create_Igraph_Network(cs['parsed_network'])
    cs['max_depth'] = max_depth
    cs['resolution_factor'] = resolution_factor
    cs['beta_l'] = beta_l
    return cs

# TODO: document these functions

def pipeline_Clustering(year, refferences_d, path_network, initial_resolution=0.000002, clusters_per_level=10, max_depth=13, resolution_factor=3.0, beta_l=[0.125, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0], encoding=None, errors=None):
    """Run the whole clustering pipeline and return the clustering solution.

    Steps, in order:
      1. build the base dictionary with c_Cs_D();
      2. run iterative_clustering.c_Clus_Recursion() on the igraph network;
      3. add the metrics with metrics.c_Metric_Recursion();
      4. compute select_cluster.c_T_Greedy_D() and
         select_cluster.c_T_Universal_Fscore_D().

    Parameters
    ----------
    year : int
        Publication year of the systematic reviews.

    refferences_d : dict
        Dictionary with the references of each topic.

    path_network : str
        Path of the file of the network.

    initial_resolution : float, optional
        Resolution of the clustering algorithm in the first level.

    clusters_per_level : int, optional
        Maximum number of clusters per branch per level.

    max_depth : int, optional
        Lowest level of the clustering.

    resolution_factor : float, optional
        Factor by which the value of the resolution increases at each level.

    beta_l : list, optional
        Beta values passed to the metric and cluster-selection steps. The
        list is copied, so neither the caller's list nor the shared default
        is ever reachable through the returned dictionary.

    encoding : str, optional
        Passed through to c_Cs_D().

    errors : str, optional
        Passed through to c_Cs_D().

    Returns
    -------
    cs : dict
        The dictionary created by c_Cs_D() extended with the keys
        'level_data' (which also holds 'ITERATIONS_COUNT'), 't_greedy_data'
        and 't_universal_fscore'.
    """
    # The default list is a single object shared by every call; copy it so a
    # caller mutating cs['beta_l'] cannot alter the default of later calls.
    beta_l = list(beta_l)

    cs = c_Cs_D(year, refferences_d, path_network, initial_resolution, clusters_per_level, max_depth, resolution_factor, beta_l, encoding=encoding, errors=errors)

    cs['level_data'], iterations_count = iterative_clustering.c_Clus_Recursion(
        cs['igraph_network'],
        cs['INITIAL_RESOLUTION'],
        0,
        cs['max_depth'],
        cs['CLUSTERS_PER_LEVEL'],
        cs['t_references_d'],
        cs['resolution_factor'],
        0,
    )
    cs['level_data']['ITERATIONS_COUNT'] = iterations_count

    cs['level_data'] = metrics.c_Metric_Recursion(cs['level_data'], cs['t_references_d'], cs['beta_l'])
    cs['t_greedy_data'] = select_cluster.c_T_Greedy_D(cs['level_data'], cs['t_references_d'], cs['beta_l'])
    cs['t_universal_fscore'] = select_cluster.c_T_Universal_Fscore_D(cs['t_references_d'], cs['beta_l'], cs['level_data'])
    return cs

In [2]:
# Read the tab-delimited test file and cast the first three fields of each row to int.
f = reading.p_Tab_Delimited('PAPER2_topic_year_netid_TEST.txt')
f_clean = [[int(row[column]) for column in range(3)] for row in f]
test_ref = reading.parse_2_Level_D(f_clean)
# Replace every innermost value of the two-level dictionary with a set.
for inner_d in test_ref.values():
    for inner_key in inner_d:
        inner_d[inner_key] = set(inner_d[inner_key])

In [3]:
# Time the pipeline on the small test network.
started_at = time.time()
cs_test = pipeline_Clustering(1, test_ref[1], 'PAPER2_nid1_nid2_TEST.txt')
elapsed_seconds = time.time() - started_at
print(elapsed_seconds)

28.295661211013794


In [4]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# 'clean_ref.pickle' when it comes from a trusted source.
# The context manager closes the file even if unpickling fails.
with open('clean_ref.pickle', 'rb') as f:
    year_topic_retrieved_d = pickle.load(f)

# For every year and topic keep only the 'positives_retrieved_in_net' entry, converted to a set.
year_references_d = {
    year: {
        topic: set(topic_data['positives_retrieved_in_net'])
        for topic, topic_data in topics_d.items()
    }
    for year, topics_d in year_topic_retrieved_d.items()
}

In [5]:
# Time the full pipeline for the reviews of 2014.
started_at = time.time()
cs_2014 = pipeline_Clustering(2014, year_references_d[2014], 'PAPER2_nid1_nid2_YEAR_2003_2013.txt')
elapsed_seconds = time.time() - started_at
print(elapsed_seconds)

3574.3791427612305


OSError: [Errno 22] Invalid argument

In [9]:
# Persist the 2014 clustering solution and report how long the dump took.
mtime = time.time()
# The context manager closes the file even if pickling fails part-way.
with open('cs_2014.pickle', 'wb') as f:
    pickle.dump(cs_2014, f)
print(time.time() - mtime)

379.0683023929596


In [10]:
# Drop the reference to the 2014 solution now that it has been pickled.
del cs_2014

In [11]:
# Time the full pipeline for the reviews of 2015.
started_at = time.time()
cs_2015 = pipeline_Clustering(2015, year_references_d[2015], 'PAPER2_nid1_nid2_YEAR_2004_2014.txt')
elapsed_seconds = time.time() - started_at
print(elapsed_seconds)


3878.562724351883


In [12]:
# Persist the 2015 clustering solution and report how long the dump took.
mtime = time.time()
# The context manager closes the file even if pickling fails part-way.
with open('cs_2015.pickle', 'wb') as f:
    pickle.dump(cs_2015, f)
print(time.time() - mtime)

353.41613149642944


In [13]:
# Drop the reference to the 2015 solution now that it has been pickled.
del cs_2015

In [14]:
# Time the full pipeline for the reviews of 2016.
started_at = time.time()
cs_2016 = pipeline_Clustering(2016, year_references_d[2016], 'PAPER2_nid1_nid2_YEAR_2005_2015.txt')
elapsed_seconds = time.time() - started_at
print(elapsed_seconds)


3987.9391782283783


In [15]:
# Persist the 2016 clustering solution and report how long the dump took.
mtime = time.time()
# The context manager closes the file even if pickling fails part-way.
with open('cs_2016.pickle', 'wb') as f:
    pickle.dump(cs_2016, f)
print(time.time() - mtime)

356.18973112106323


In [16]:
# Drop the reference to the 2016 solution now that it has been pickled.
del cs_2016