In [1]:
from new_main import *
import time
import numpy as np
from python_to_html import create_html

In [2]:
# Return two parallel lists indexed by topic: the endpoints that have over 0% of topic i (endps[i]) and the lemmatized dataset filtered to those endpoints (new_datasets[i])
def filter_sub_topics_over_0(mod, dataset):
    """Collect, per topic, the matching endpoints and their filtered dataset.

    Parameters
    ----------
    mod
        Model wrapper; it is passed to ``create_table_ids_topics`` and must
        expose ``cur_model_topic_quantity()``.
    dataset
        Ignored. The topic table is always rebuilt from ``mod``; the
        parameter is kept only so existing callers keep working.

    Returns
    -------
    tuple
        ``(endps, new_datasets)`` -- two lists with one entry per topic.
        ``endps[i]`` is the result of ``filter_endpoints_by_topic`` for
        topic ``i`` and ``new_datasets[i]`` is the result of
        ``new_dataset_filtered_endpoint`` for ``endps[i]``.

    Notes
    -----
    Reads the notebook-level global ``data_lemmatized``, so the cell that
    loads the dataset must have run first.
    """
    topic_table = create_table_ids_topics(mod)
    over_zero = dataset_topics_over_zero(topic_table)

    # First pass: endpoints per topic.
    endps = []
    for topic in range(mod.cur_model_topic_quantity()):
        endps.append(filter_endpoints_by_topic(over_zero, topic))

    # Second pass: restrict the lemmatized data to each topic's endpoints.
    new_datasets = []
    for endp in endps:
        new_datasets.append(new_dataset_filtered_endpoint(data_lemmatized, endp))

    return endps, new_datasets

# Return a list of (endpoint index, (topic, % of topic)) pairs for every endpoint that has over 0% of topic `tpc`, sorted by % of topic in descending order
def endpoints_for_topic(mod, tpc):
    """List every endpoint that contains topic ``tpc``, strongest first.

    Parameters
    ----------
    mod
        Model wrapper handed to ``create_table_ids_topics``.
    tpc
        Topic identifier to look for.

    Returns
    -------
    list
        ``(endpoint index, (topic, score))`` tuples, sorted by ``score`` in
        descending order. The endpoint index is the position of the endpoint
        in the table returned by ``dataset_topics_over_zero``.
    """
    def score_of(pair):
        # pair is (endpoint index, (topic, score))
        return pair[1][1]

    topic_table = create_table_ids_topics(mod)
    rows = dataset_topics_over_zero(topic_table)

    matches = {}
    for endpoint_idx in range(len(rows)):
        row = rows[endpoint_idx]
        for pos in range(len(row)):
            entry = row[pos]
            if entry[0] == tpc:
                # A later matching entry for the same endpoint overwrites an earlier one.
                matches[endpoint_idx] = (entry[0], entry[1])

    ranked = list(matches.items())
    ranked.sort(key=score_of, reverse=True)
    return ranked

# Return a list of (endpoint index, (topic, % of topic)) pairs for the endpoints whose highest-scoring topic is `tpc`, sorted by % of topic in descending order
def best_endpoints_for_topic(mod, tpc):
    """List the endpoints whose highest-scoring topic is ``tpc``.

    Parameters
    ----------
    mod
        Model wrapper handed to ``create_table_ids_topics``.
    tpc
        Topic identifier to select.

    Returns
    -------
    list
        ``(endpoint index, (topic, score))`` tuples, sorted by ``score`` in
        descending order. Only endpoints whose top-scoring topic equals
        ``tpc`` are included.

    Notes
    -----
    The topic table is rebuilt from ``mod`` on every call.
    """
    def score_of(pair):
        # pair is (endpoint index, (topic, score))
        return pair[1][1]

    topic_table = create_table_ids_topics(mod)
    rows = dataset_topics_over_zero(topic_table)

    winners = {}
    for endpoint_idx in range(len(rows)):
        candidates = rows[endpoint_idx]
        # Start below any expected score; on ties the first entry wins (strict >).
        top_topic, top_score = None, -1
        for pos in range(len(candidates)):
            candidate = candidates[pos]
            if candidate[1] > top_score:
                top_score = candidate[1]
                top_topic = candidate[0]
        if top_topic == tpc:
            winners[endpoint_idx] = (top_topic, top_score)

    ranked = list(winners.items())
    ranked.sort(key=score_of, reverse=True)
    return ranked

def gen_csv_to_topics(mod, bests, id_info, top_n=5, n_words=5):
    """Flatten the top endpoints of every topic into a list of CSV values.

    For each topic and each of its first ``top_n`` ranked endpoints, four
    values are appended in order: topic index, comma-joined topic words,
    endpoint label (``info[0] + info[1]``) and endpoint score.

    Parameters
    ----------
    mod
        Model wrapper exposing ``cur_model_topic_quantity()`` and
        ``cur_model_topic_words(topic, n)``.
    bests
        ``bests[i]`` is the ranked sequence of
        ``(endpoint id, (topic, score))`` entries for topic ``i``.
    id_info
        Mapping from endpoint id to a sequence whose first two items are
        concatenated to form the endpoint label.
    top_n : int, optional
        Maximum number of endpoints emitted per topic (default 5). Topics
        with fewer ranked endpoints simply contribute fewer rows instead of
        raising ``IndexError``.
    n_words : int, optional
        Number of topic words used in the label (default 5).

    Returns
    -------
    list
        Flat list of values, four per emitted endpoint.
    """
    csv_vals = []
    for topic in range(mod.cur_model_topic_quantity()):
        # The label depends only on the topic, so build it once per topic.
        topic_label = ', '.join(
            [elem[0] for elem in mod.cur_model_topic_words(topic, n_words)]
        )
        # Slicing caps the row count without assuming top_n entries exist.
        for entry in bests[topic][:top_n]:
            endp_id = entry[0]
            endp_score = entry[1][1]
            info = id_info[endp_id]
            csv_vals.append(topic)
            csv_vals.append(topic_label)
            csv_vals.append(info[0] + info[1])
            csv_vals.append(endp_score)

    return csv_vals

def gen_csv_to_statistics(mod, bests, id_info):
    """Flatten per-topic statistics into a list of CSV values.

    For each topic, seven values are appended in order: topic index,
    coherence, mean and standard deviation of the endpoint scores, mean and
    standard deviation of the top-5 word weights, and the number of
    endpoints ranked under the topic.

    Parameters
    ----------
    mod
        ``Gensim_Model`` instance. Its private attributes are read through
        their name-mangled form, and it must expose
        ``cur_model_topic_quantity()`` and ``cur_model_topic_words(topic, n)``.
    bests
        ``bests[i]`` is the sequence of ``(endpoint id, (topic, score))``
        entries for topic ``i``.
    id_info
        Unused; kept so the signature mirrors ``gen_csv_to_topics``.

    Returns
    -------
    list
        Flat list of values, seven per topic.

    Raises
    ------
    KeyError
        If the top-5 words of a topic do not match any entry returned by the
        underlying model's ``top_topics``.
    """
    # Private attributes of Gensim_Model, accessed via their mangled names.
    model = mod._Gensim_Model__current_model
    corpus = mod._Gensim_Model__corpus
    id2words = mod._Gensim_Model__id2words
    lemmatized = mod._Gensim_Model__data_lemmatized

    # Map "w1, w2, ..." -> coherence so topics can be matched by their words.
    # Each top_topics entry is (word entries, coherence); the word sits at
    # index 1 of every word entry.
    coherence_by_label = {}
    for entry in model.top_topics(corpus, lemmatized, id2words, coherence='c_v', topn=5):
        coherence_by_label[', '.join([word[1] for word in entry[0]])] = entry[1]

    csv_vals = []
    for topic in range(mod.cur_model_topic_quantity()):
        # Query the topic words once and reuse them for label and weights.
        topic_words = list(mod.cur_model_topic_words(topic, 5))
        label = ', '.join([elem[0] for elem in topic_words])
        weights = [elem[1] for elem in topic_words]
        coherence = coherence_by_label[label]

        scores = [ranked[1][1] for ranked in bests[topic]]
        if scores:
            avg_score, std_score = np.average(scores), np.std(scores)
        else:
            # No endpoints for this topic: report NaN explicitly rather than
            # letting NumPy emit empty-slice RuntimeWarnings.
            avg_score = std_score = float('nan')
        avg_weights, std_weights = np.average(weights), np.std(weights)

        csv_vals.append(topic)
        csv_vals.append(coherence)
        csv_vals.append(avg_score)
        csv_vals.append(std_score)
        csv_vals.append(avg_weights)
        csv_vals.append(std_weights)
        csv_vals.append(len(bests[topic]))

    return csv_vals

In [3]:
# Load the dataset: `data_lemmatized` holds the endpoints' lemmatized descriptions and
# `id_info` is a dictionary of endpoint id -> information (both returned by prepare_dataset()).
# `data_lemmatized` is also read as a global by filter_sub_topics_over_0.
t0 = time.time()  # wall-clock start, used only for the timing message below
data_lemmatized, id_info = prepare_dataset()
print('Time taken: {} seconds'.format(time.time() - t0))

Time taken: 39.67620277404785 seconds


In [5]:
# Create the model wrapper from the lemmatized data and select NMF with 12 topics.
# The automatic search (described by the original author as trying 2-20 topics with
# LDA, LSA and NMF) is kept below but disabled.
t0 = time.time()  # wall-clock start, used only for the timing message below
#mod = evaluate_and_set_best_model(data_lemmatized)
mod = Gensim_Model(data_lemmatized)
mod.set_model(TopicModelingAlgorithm.NMF, 12)

print('Time taken: {} seconds'.format(time.time() - t0))

Time taken: 1.5490031242370605 seconds


In [6]:
# For each topic, build the ranked list of endpoints whose strongest topic is that topic;
# bests[i] holds (endpoint index, (topic, score)) tuples for topic i.
# NOTE(review): best_endpoints_for_topic rebuilds the topic table on every call, i.e. once
# per topic here -- presumably what dominates this cell's runtime; confirm before optimizing.
t0 = time.time()  # wall-clock start, used only for the timing message below
bests = [best_endpoints_for_topic(mod, i) for i in range(mod.cur_model_topic_quantity())]
print('Time taken: {} seconds'.format(time.time() - t0))

Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Topics for each endpoints modeled in a table
Time taken: 94.28344655036926 seconds


In [7]:
# Flatten the per-topic endpoint rows (4 values per row) and the per-topic statistics
# (7 values per topic) into plain lists, printing the first 10 values of each as a sanity check.
csv_topics = gen_csv_to_topics(mod, bests, id_info)
print(csv_topics[:10])
csv_statistics = gen_csv_to_statistics(mod, bests, id_info)
print(csv_statistics[:10])

[0, 'get, list, filter, give, service', './APIs/1forge.com\\0.0.1/quotes/get', 1.0, 0, 'get, list, filter, give, service', './APIs/azure.com\\azsadmin-Quotas\\2018-02-09/subscriptions/{subscriptionId}/providers/Microsoft.Compute.Admin/locations/{location}/quotas/{quotaName}/get', 1.0, 0, 'get, list, filter, give, service']
[0, 0.49424471932309544, 0.6611925133320253, 0.22339082963739318, 0.04018480072486363, 0.015073095840906556, 2485, 1, 0.6155617605555058, 0.5183797046204622]


In [8]:
# Render the report from the two flattened value lists built in the previous cell.
# The two commented-out lines are an earlier table-based variant, left disabled.
#csv_vals = gen_csv_to_topics(mod, bests, id_info)
#create_table_with_csv(csv_vals)
create_html(csv_topics, csv_statistics)

[0, 'get, list, filter, give, service', './APIs/1forge.com\\0.0.1/quotes/get', 1.0, 0, 'get, list, filter, give, service', './APIs/azure.com\\azsadmin-Quotas\\2018-02-09/subscriptions/{subscriptionId}/providers/Microsoft.Compute.Admin/locations/{location}/quotas/{quotaName}/get', 1.0, 0, 'get, list, filter, give, service', './APIs/azure.com\\datashare-DataShare\\2018-11-01-preview/providers/Microsoft.DataShare/locations/{location}/consumerInvitations/{invitationId}/get', 1.0, 0, 'get, list, filter, give, service', './APIs/azure.com\\datashare-DataShare\\2018-11-01-preview/subscriptions/{subscriptionId}/resourceGroups/{resourceGroupName}/providers/Microsoft.DataShare/accounts/{accountName}/shareSubscriptions/{shareSubscriptionName}/ConsumerSourceDataSets/get', 1.0, 0, 'get, list, filter, give, service', './APIs/azure.com\\datashare-DataShare\\2018-11-01-preview/subscriptions/{subscriptionId}/resourceGroups/{resourceGroupName}/providers/Microsoft.DataShare/accounts/{accountName}/shareSub