In [46]:
from new_main import *
import time
import numpy as np

In [105]:
# return a list with every endpoint that has over 0% of a certain topic (new_datasets[i], where i == topic)
def filter_sub_topics_over_0(mod, dataset):
    dataset = create_table_ids_topics(mod)
    d_0 = dataset_topics_over_zero(dataset)
    endps = [filter_endpoints_by_topic(d_0, i) for i in range(mod.cur_model_topic_quantity())]
    new_datasets = [new_dataset_filtered_endpoint(data_lemmatized, endp) for endp in endps]
    return endps, new_datasets

# Return a dictionary with the best endpoints for each topic [id of endpoint, topic it belongs, % of topic]
def endpoints_for_topic(mod, tpc):
    dataset = create_table_ids_topics(mod)
    d_best = dataset_topics_over_zero(dataset)
    dict_best = {i: (d_best[i][j][0], d_best[i][j][1]) for i in range(len(d_best)) for j in range(len(d_best[i])) if d_best[i][j][0] == tpc}
    ordered_d = sorted(dict_best.items(), key=lambda x:x[1][1], reverse=True)
    return ordered_d

# Return a dictionary with the best endpoints for each topic [id of endpoint, topic it belongs, % of topic]
def best_endpoints_for_topic(mod, tpc):
    dataset = create_table_ids_topics(mod)
    d_oz = dataset_topics_over_zero(dataset)
    dict_best = {}
    for i in range(len(d_oz)):
        best_score = -1
        best_tpc = None
        for j in range(len(d_oz[i])):
            if(d_oz[i][j][1] > best_score):
                best_score = d_oz[i][j][1]
                best_tpc = d_oz[i][j][0]
        if(best_tpc == tpc):       
            dict_best[i] = (best_tpc, best_score) 
    ordered_d = sorted(dict_best.items(), key=lambda x:x[1][1], reverse=True)
    return ordered_d

def gen_csv_to_topics(mod, bests, id_info):
    csv_vals = []
    for i in range(mod.cur_model_topic_quantity()):
        for j in range(5):
            csv_vals.append(i)
            csv_vals.append(', '.join([elem[0] for elem in mod.cur_model_topic_words(i, 5)]))
            endp_id = bests[i][j][0]
            endp_score = bests[i][j][1][1]
            info = id_info[endp_id]
            csv_vals.append(info[0] + info[1])
            csv_vals.append(endp_score)
            
    return csv_vals

def gen_csv_to_statistics(mod, bests, id_info):
    csv_vals = []
    m = mod._Gensim_Model__current_model
    c = mod._Gensim_Model__corpus
    idw2 = mod._Gensim_Model__id2words
    d_l = mod._Gensim_Model__data_lemmatized
    tpcs = {}
    top_topics = m.top_topics(c, d_l, idw2, coherence='c_v', topn=5)
    top_topics = {', '.join([word[1] for word in top_topics[i][0]]): top_topics[i][1] for i in range(len(top_topics))}
    
    print(top_topics)
    
    for i in range(mod.cur_model_topic_quantity()):
        words_tpc = [elem[0] for elem in mod.cur_model_topic_words(i, 5)]
        weights_w_tpc = [elem[1] for elem in mod.cur_model_topic_words(i, 5)]
        tpc = ', '.join(words_tpc)
        tpcs[i] = top_topics[tpc]
        best_i = [bests[i][j][1][1] for j in range(len(bests[i]))]
        avg_score, std_score = np.average(best_i), np.std(best_i)
        avg_weights, std_weights = np.average(weights_w_tpc), np.std(weights_w_tpc)
        qtt_endpoints = len(bests[i])
        csv_vals.append(i)
        csv_vals.append(tpcs[i]) 
        csv_vals.append(avg_score) 
        csv_vals.append(std_score) 
        csv_vals.append(avg_weights) 
        csv_vals.append(std_weights)
        csv_vals.append(qtt_endpoints)
        
    return csv_vals
        

In [3]:
# Loading the dataset into variables, with the endpoints' description lemmatized and a dictionary of endpoint id -> information 
t0 = time.time()
data_lemmatized, id_info = prepare_dataset()
print('Time taken: {} seconds'.format(time.time() - t0))

Time taken: 34.8973867893219 seconds


In [4]:
# Create the model with the lemmatized data, and try to model the topics 2-20, with LDA, LSA and NMF
t0 = time.time()
mod = evaluate_and_set_best_model(data_lemmatized)
print('Time taken: {} seconds'.format(time.time() - t0))

Latent Dirichlet Allocation (LDA), c_v, 2, 5, 0.48600392227043915
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 2, 5, 0.581088045279271
Non Negative Matrix Factorization (NMF), c_v, 2, 5, 0.542969009996337
Latent Dirichlet Allocation (LDA), c_v, 3, 5, 0.5170370624460126
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 3, 5, 0.5205065979855491
Non Negative Matrix Factorization (NMF), c_v, 3, 5, 0.633244834642606
Latent Dirichlet Allocation (LDA), c_v, 4, 5, 0.4672813264062485
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 4, 5, 0.553342069010452
Non Negative Matrix Factorization (NMF), c_v, 4, 5, 0.6044404515680454
Latent Dirichlet Allocation (LDA), c_v, 5, 5, 0.48101850010476543
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 5, 5, 0.5916308490507415
Non Negative Matrix Factorization (NMF), c_v, 5, 5, 0.5947677115212902
Latent Dirichlet Allocation (LDA), c_v, 6, 5, 0.49866673502108094
Latent Seman

In [106]:
# For each topic generate a list with the endpoint that is best related to the topic
t0 = time.time()
bests = [best_endpoints_for_topic(mod, i) for i in range(mod.cur_model_topic_quantity())]
print('Time taken: {} seconds'.format(time.time() - t0))

Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Time taken: 93.8808343410492 seconds


In [6]:
#len(best)

In [7]:
for i in range(mod.cur_model_topic_quantity()):
    print('Topic {} has the following words as its highest weight:'.format(i))
    print(mod.cur_model_topic_words(i, 5))
    print('The endpoints that are best related to the topic are the following (endpoint_id, value):')
    for j in range(5):
        print((bests[i][j][0], bests[i][j][1][1]))
    print()

Topic 0 has the following words as its highest weight:
[('get', 0.0669169989218362), ('list', 0.046666187614072915), ('filter', 0.031010050778435817), ('give', 0.029935052711474647), ('service', 0.02639571359849857)]
The endpoints that are best related to the topic are the following (endpoint_id, value):
(0, 1.0)
(445, 1.0)
(618, 1.0)
(628, 1.0)
(633, 1.0)

Topic 1 has the following words as its highest weight:
[('api', 0.06921266623079911), ('user', 0.04092281531664043), ('call', 0.018439558511459238), ('see', 0.015073819874472434), ('endpoint', 0.014902130720897374)]
The endpoints that are best related to the topic are the following (endpoint_id, value):
(119, 1.0)
(4308, 1.0)
(5790, 1.0)
(5892, 1.0)
(6579, 1.0)

Topic 2 has the following words as its highest weight:
[('would', 0.043758343512318734), ('-PRON-', 0.04230971448287902), ('list', 0.041454561157330996), ('endpoint', 0.03688487809161816), ('request', 0.023967186942227506)]
The endpoints that are best related to the topic ar

In [25]:
id = 445
info = id_info[id]
print('API: {}, \nEndpoint: {} \nDescription: {}'.format(info[0], info[1], info[3]))
print(data_lemmatized[id])

API: ./APIs/azure.com\azsadmin-Quotas\2018-02-09, 
Endpoint: /subscriptions/{subscriptionId}/providers/Microsoft.Compute.Admin/locations/{location}/quotas/{quotaName}/get 
Description: Get an existing Compute Quota.
['get', 'exist', 'compute', 'quota']


In [9]:
#Create new lemmatized datasets, for each topic (new_datasets[0] == lemmatized version of endpoints in topic 0)
t0 = time.time()
endps, new_datasets = filter_sub_topics_over_0(mod, data_lemmatized)
print('Time taken: {} seconds'.format(time.time() - t0))

Topics for each endpoints modeled in a table
Time taken: 13.454716444015503 seconds


In [10]:
# Model sub-topics for topic 0
t0 = time.time()
mod_sub0 = evaluate_and_set_best_model(new_datasets[0])
print('Time taken: {} seconds'.format(time.time() - t0))

Latent Dirichlet Allocation (LDA), c_v, 2, 5, 0.3135504251850122
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 2, 5, 0.5373231286813
Non Negative Matrix Factorization (NMF), c_v, 2, 5, 0.518405437224078
Latent Dirichlet Allocation (LDA), c_v, 3, 5, 0.42035548787351384
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 3, 5, 0.6481521188327088
Non Negative Matrix Factorization (NMF), c_v, 3, 5, 0.5901837908453264
Latent Dirichlet Allocation (LDA), c_v, 4, 5, 0.40859793722867066
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 4, 5, 0.5252570351771952
Non Negative Matrix Factorization (NMF), c_v, 4, 5, 0.5931478422122805
Latent Dirichlet Allocation (LDA), c_v, 5, 5, 0.45808237469718804
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 5, 5, 0.41813270742244263
Non Negative Matrix Factorization (NMF), c_v, 5, 5, 0.59020360888369
Latent Dirichlet Allocation (LDA), c_v, 6, 5, 0.41139058986661103
Latent Seman

In [11]:
t0 = time.time()
bests_sub0 = [best_endpoints_for_topic(mod_sub0, i) for i in range(mod_sub0.cur_model_topic_quantity())]
print('Time taken: {} seconds'.format(time.time() - t0))

Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Time taken: 10.824284791946411 seconds


In [12]:
for i in range(mod_sub0.cur_model_topic_quantity()):
    print('Topic {} has the following words as its highest weight:'.format(i))
    print(mod_sub0.cur_model_topic_words(i, 5))
    print('The endpoints that are best related to the topic are the following (endpoint_id, %):')
    for j in range(5):
        print((bests_sub0[i][j][0], bests_sub0[i][j][1][1]))
    print()

Topic 0 has the following words as its highest weight:
[('user', 0.4052311057718351), ('filter', 0.2690587721107555), ('value', 0.257955595706499), ('return', 0.2523012158970444), ('use', 0.24676139836062372)]
The endpoints that are best related to the topic are the following (endpoint_id, %):
(2207, 31.884173975306354)
(2272, 30.309827725638247)
(4981, 21.768608491744782)
(2266, 20.433678167635417)
(1983, 18.898143348001344)

Topic 1 has the following words as its highest weight:
[('user', -0.6162135928235281), ('return', 0.32362251111400364), ('filter', 0.16951044851893604), ('key', 0.16098813307168452), ('contain', 0.1556922994013812)]
The endpoints that are best related to the topic are the following (endpoint_id, %):
(4079, 8.829900270248672)
(4081, 8.665594238327103)
(4049, 8.609206426403192)
(4046, 8.234915132372631)
(4041, 7.743532986734398)

Topic 2 has the following words as its highest weight:
[('route', 0.43868103433510935), ('value', -0.4319522099862365), ('filter', -0.374

In [24]:
id = 2207
info = id_info[id]
print('API: {}, \nEndpoint: {} \nDescription: {}'.format(info[0], info[1], info[3]))
print(data_lemmatized[id])

API: ./APIs/bikewise.org\v2, 
Endpoint: /v2/incidents/get 
Description: 
<p>If you’d like more detailed information about bike incidents, use this endpoint. For mapping, <code>locations</code> is probably a better bet.</p>

<p><strong>Notes on location searching</strong>: <br />
- <code>proximity</code> accepts an ip address, an address, zipcode, city, or latitude,longitude - i.e. <code>70.210.133.87</code>, <code>210 NW 11th Ave, Portland, OR</code>, <code>60647</code>, <code>Chicago, IL</code>, and <code>45.521728,-122.67326</code> are all acceptable<br />
- <code>proximity_square</code> sets the length of the sides of the square to find matches inside of. The square is centered on the location specified by <code>proximity</code>. It defaults to 100.</p>

['like', 'detailed', 'information', 'bike', 'incident', 'use', 'endpoint', 'mapping', 'location', 'probably', 'better', 'bet', 'note', 'location', 'search', 'proximity', 'accept', 'ip', 'address', 'address', 'zipcode', 'city', 'lati

In [14]:
t0 = time.time()
mod_sub1 = evaluate_and_set_best_model(new_datasets[1])
print('Time taken: {} seconds'.format(time.time() - t0))

Latent Dirichlet Allocation (LDA), c_v, 2, 5, 0.5743401741052129
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 2, 5, 0.4555369722303932
Non Negative Matrix Factorization (NMF), c_v, 2, 5, 0.49362869959612177
Latent Dirichlet Allocation (LDA), c_v, 3, 5, 0.6038356887848554
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 3, 5, 0.5493553266062928
Non Negative Matrix Factorization (NMF), c_v, 3, 5, 0.5864257234341214
Latent Dirichlet Allocation (LDA), c_v, 4, 5, 0.5210194868895611
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 4, 5, 0.5459162544057682
Non Negative Matrix Factorization (NMF), c_v, 4, 5, 0.5493953582923019
Latent Dirichlet Allocation (LDA), c_v, 5, 5, 0.4398328926475209
Latent Semantic Analysis (LSA), aka Latent Semantic Index (LSI), c_v, 5, 5, 0.5279497639513836
Non Negative Matrix Factorization (NMF), c_v, 5, 5, 0.6311487829880905
Latent Dirichlet Allocation (LDA), c_v, 6, 5, 0.46355165813873933
Latent Se

In [15]:
t0 = time.time()
bests_sub1 = [best_endpoints_for_topic(mod_sub1, i) for i in range(mod_sub1.cur_model_topic_quantity())]
print('Time taken: {} seconds'.format(time.time() - t0))

Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Time taken: 24.207664728164673 seconds


In [16]:
for i in range(mod_sub1.cur_model_topic_quantity()):
    print('Topic {} has the following words as its highest weight:'.format(i))
    print(mod_sub1.cur_model_topic_words(i, 5))
    print('The endpoints that are best related to the topic are the following (endpoint_id, %):')
    for j in range(5):
        print((bests_sub1[i][j][0], bests_sub1[i][j][1][1]))
    print()

Topic 0 has the following words as its highest weight:
[('user', 0.06717222532150371), ('email', 0.045696338533723964), ('allow', 0.017116733242016874), ('send', 0.016886120454048813), ('create', 0.016591596554293042)]
The endpoints that are best related to the topic are the following (endpoint_id, %):
(188, 1.0)
(424, 1.0)
(425, 1.0)
(448, 1.0)
(476, 1.0)

Topic 1 has the following words as its highest weight:
[('value', 0.02859597997360834), ('specify', 0.02002124466638937), ('create', 0.018111399537097582), ('token', 0.0173376236458634), ('must', 0.014805978449973355)]
The endpoints that are best related to the topic are the following (endpoint_id, %):
(479, 1.0)
(2344, 1.0)
(2650, 1.0)
(2656, 1.0)
(2981, 1.0)

Topic 2 has the following words as its highest weight:
[('field', 0.02540788540171225), ('object', 0.022886181857317448), ('key', 0.02105923408355683), ('request', 0.019106963074859505), ('return', 0.015082693662815144)]
The endpoints that are best related to the topic are th

In [21]:
id = 188
info = id_info[id]
print('API: {}, \nEndpoint: {} \nDescription: {}'.format(info[0], info[1], info[3]))
print(data_lemmatized[id])

API: ./APIs/akeneo.com\1.0.0, 
Endpoint: /api/rest/v1/assets/{asset_code}/variation-files/{channel_code}/{locale_code}/post 
Description: This endpoint allows you to upload a new variation file for a given asset, channel and locale.
['endpoint', 'allow', 'upload', 'new', 'variation', 'file', 'give', 'asset', 'channel', 'locale']


In [109]:
csv_topics = gen_csv_to_topics(mod, bests, id_info)
csv_vals[:10]
csv_statistics = 

[0,
 'get, list, filter, give, service',
 './APIs/1forge.com\\0.0.1/quotes/get',
 1.0,
 0,
 'get, list, filter, give, service',
 './APIs/azure.com\\alertsmanagement-AlertsManagement\\2018-11-02-privatepreview/subscriptions/{subscriptionId}/resourceGroups/{resourceGroup}/providers/Microsoft.AlertsManagement/actionRules/{actionRuleName}/get',
 1.0,
 0,
 'get, list, filter, give, service']

In [5]:
from python_to_html import *

In [26]:
csv_vals = gen_csv_to_topics(mod, bests, id_info)
create_table_with_csv(csv_vals)

[0, 'get, list, filter, give, service', './APIs/1forge.com\\0.0.1/quotes/get', 1.0, 0, 'get, list, filter, give, service', './APIs/azure.com\\azsadmin-Quotas\\2018-02-09/subscriptions/{subscriptionId}/providers/Microsoft.Compute.Admin/locations/{location}/quotas/{quotaName}/get', 1.0, 0, 'get, list, filter, give, service', './APIs/azure.com\\datashare-DataShare\\2018-11-01-preview/providers/Microsoft.DataShare/locations/{location}/consumerInvitations/{invitationId}/get', 1.0, 0, 'get, list, filter, give, service', './APIs/azure.com\\datashare-DataShare\\2018-11-01-preview/subscriptions/{subscriptionId}/resourceGroups/{resourceGroupName}/providers/Microsoft.DataShare/accounts/{accountName}/shareSubscriptions/{shareSubscriptionName}/ConsumerSourceDataSets/get', 1.0, 0, 'get, list, filter, give, service', './APIs/azure.com\\datashare-DataShare\\2018-11-01-preview/subscriptions/{subscriptionId}/resourceGroups/{resourceGroupName}/providers/Microsoft.DataShare/accounts/{accountName}/shareSub

In [18]:
m = mod._Gensim_Model__current_model
c = mod._Gensim_Model__corpus
idw2 = mod._Gensim_Model__id2words
d_l = mod._Gensim_Model__data_lemmatized

In [12]:
m.num_topics

12

In [13]:
dir(m)

['A',
 'B',
 '_W',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_apply',
 '_dense_dot_csc',
 '_h',
 '_h_max_iter',
 '_h_stop_condition',
 '_kappa',
 '_load_specials',
 '_save_specials',
 '_setup',
 '_smart_save',
 '_solve_w',
 '_solveproj',
 '_transform',
 '_w_error',
 '_w_max_iter',
 '_w_stop_condition',
 'chunksize',
 'eval_every',
 'get_document_topics',
 'get_term_topics',
 'get_topic_terms',
 'get_topics',
 'id2word',
 'l2_norm',
 'load',
 'minimum_probability',
 'normalize',
 'num_tokens',
 'num_topics',
 'passes',
 'print_topic',
 'print_topics',
 'random_state',
 'save',
 'show_topic',
 'show_topics',
 'top_topics',
 'up

In [58]:
#m.top_topics(c, d_l, idw2, coherence='c_v', topn=5)
#cohs = m.top_topics(c, d_l, idw2, coherence='c_v', topn=5)
print(bests[0])

[(0, (0, 1.0)), (445, (0, 1.0)), (618, (0, 1.0)), (628, (0, 1.0)), (633, (0, 1.0)), (640, (0, 1.0)), (644, (0, 1.0)), (648, (0, 1.0)), (652, (0, 1.0)), (716, (0, 1.0)), (754, (0, 1.0)), (761, (0, 1.0)), (771, (0, 1.0)), (980, (0, 1.0)), (1013, (0, 1.0)), (1020, (0, 1.0)), (1048, (0, 1.0)), (1109, (0, 1.0)), (1176, (0, 1.0)), (1690, (0, 1.0)), (1770, (0, 1.0)), (1790, (0, 1.0)), (1791, (0, 1.0)), (1793, (0, 1.0)), (1796, (0, 1.0)), (1801, (0, 1.0)), (1805, (0, 1.0)), (1823, (0, 1.0)), (1840, (0, 1.0)), (1868, (0, 1.0)), (1889, (0, 1.0)), (1891, (0, 1.0)), (1896, (0, 1.0)), (1930, (0, 1.0)), (1964, (0, 1.0)), (1983, (0, 1.0)), (1996, (0, 1.0)), (2000, (0, 1.0)), (2010, (0, 1.0)), (2015, (0, 1.0)), (2112, (0, 1.0)), (2113, (0, 1.0)), (2114, (0, 1.0)), (2117, (0, 1.0)), (2118, (0, 1.0)), (2124, (0, 1.0)), (2128, (0, 1.0)), (2131, (0, 1.0)), (2132, (0, 1.0)), (2133, (0, 1.0)), (2135, (0, 1.0)), (2136, (0, 1.0)), (2137, (0, 1.0)), (2139, (0, 1.0)), (2140, (0, 1.0)), (2141, (0, 1.0)), (2143, 

In [107]:
print(bests[0])

[(0, (0, 1.0)), (422, (0, 1.0)), (441, (0, 1.0)), (618, (0, 1.0)), (628, (0, 1.0)), (633, (0, 1.0)), (640, (0, 1.0)), (644, (0, 1.0)), (648, (0, 1.0)), (652, (0, 1.0)), (716, (0, 1.0)), (754, (0, 1.0)), (761, (0, 1.0)), (771, (0, 1.0)), (855, (0, 1.0)), (980, (0, 1.0)), (1009, (0, 1.0)), (1013, (0, 1.0)), (1020, (0, 1.0)), (1048, (0, 1.0)), (1121, (0, 1.0)), (1176, (0, 1.0)), (1690, (0, 1.0)), (1770, (0, 1.0)), (1790, (0, 1.0)), (1791, (0, 1.0)), (1793, (0, 1.0)), (1801, (0, 1.0)), (1805, (0, 1.0)), (1840, (0, 1.0)), (1868, (0, 1.0)), (1889, (0, 1.0)), (1891, (0, 1.0)), (1896, (0, 1.0)), (1920, (0, 1.0)), (1930, (0, 1.0)), (1943, (0, 1.0)), (1962, (0, 1.0)), (1964, (0, 1.0)), (1983, (0, 1.0)), (2000, (0, 1.0)), (2015, (0, 1.0)), (2112, (0, 1.0)), (2113, (0, 1.0)), (2117, (0, 1.0)), (2118, (0, 1.0)), (2124, (0, 1.0)), (2131, (0, 1.0)), (2132, (0, 1.0)), (2133, (0, 1.0)), (2135, (0, 1.0)), (2136, (0, 1.0)), (2137, (0, 1.0)), (2139, (0, 1.0)), (2140, (0, 1.0)), (2141, (0, 1.0)), (2143, (0

In [108]:
gen_csv_to_statistics(mod, bests, id_info)
'''
print(tpcs[i]) 
print(avg_score) 
print(std_score) 
print(avg_weights) 
print(std_weights)
print(qtt_endpoints)
'''

{'account, true, boolean_admin, envelope, docusign': 0.9106093859435193, 'node, file, add, link, folder': 0.7532945520699723, 'key, return, object, contain, link': 0.7461294483530392, 'value, name, filter, user, search': 0.7322689471938924, 'use, true, account, create, boolean_admin': 0.6779054518085654, 'field, information, group, provide, include': 0.6510879143988152, 'application, service, type, health, specify': 0.6240891118997645, 'api, user, call, see, endpoint': 0.6155617605555058, 'name, parameter, file, return, use': 0.5828733803625671, 'request, status, delete, return, create': 0.5488193134332955, 'would, -PRON-, list, endpoint, request': 0.5218229340013093, 'get, list, filter, give, service': 0.49424471932309544}
0.49424471932309544
0.6595997225910635
0.22826810550720722
0.04018480072486363
0.015073095840906556
2116
0.6155617605555058
0.5017236631584698
0.15353254664977484
0.031710198130853726
0.02110306857522929
1461
0.5218229340013093
0.5288151796452925
0.17160661404726768

'\nprint(tpcs[i]) \nprint(avg_score) \nprint(std_score) \nprint(avg_weights) \nprint(std_weights)\nprint(qtt_endpoints)\n'

In [70]:
dataset = create_table_ids_topics(mod)
d_oz = dataset_topics_over_zero(dataset)
print(d_oz)

Topics for each endpoints modeled in a table
{0: [[0, 1.0]], 1: [[0, 0.49109968307189106], [2, 0.508900316928109]], 2: [[3, 1.0]], 3: [[0, 0.9572044679298476], [4, 0.030261552809901143], [6, 0.012533979260251217]], 4: [[0, 1.0]], 5: [[1, 0.026372209589300817], [3, 0.5126803009446055], [6, 0.08932201885692477], [7, 0.27283099688699797], [9, 0.08931391926945319]], 6: [[0, 0.9882750568229517], [6, 0.011724943177048234]], 7: [[1, 0.14251977287415632], [3, 0.1612047137693608], [5, 0.2922966976168651], [10, 0.06456669904286404], [11, 0.3394121166967536]], 8: [[4, 0.09430776948295898], [9, 0.6938413962595578], [10, 0.20475120870548674]], 9: [[0, 0.7327147386034016], [2, 0.2633403267413753]], 10: [[2, 0.8185852038054067], [6, 0.08971869212382703], [7, 0.07982719532269214], [11, 0.011868908748074029]], 11: [[5, 0.7139233446580915], [7, 0.09141704697226319], [9, 0.16009178796735427], [10, 0.028294906048501597]], 12: [[2, 0.11852681280132772], [6, 0.0226984243052903], [7, 0.045967675880885205], [

In [71]:
print(len(d_oz))

9896
