In [5]:
import sys
sys.path.insert(0,'/home/leandror/cluwords')

In [58]:
import cluwords
from cluwords import Cluwords, CluwordsTFIDF
import embedding
from embedding import CreateEmbeddingModels
from metrics import Evaluation
from sklearn.decomposition import NMF
from scipy.sparse import csr_matrix


In [77]:
def create_embedding_models(dataset, embedding_file_path, embedding_type,
                            datasets_path, path_to_save_model):
    """
    Description
    -----------
    Create the word2vec models for each dataset
    """
    word2vec_models = CreateEmbeddingModels(embedding_file_path=embedding_file_path,
                                            embedding_type=embedding_type,
                                            document_path=datasets_path,
                                            path_to_save_model=path_to_save_model)
    n_words = create_embedding_models(dataset)

    return n_words
def top_words(model, feature_names, n_top_words):
    topico = []
    for topic_idx, topic in enumerate(model.components_):
        top = ''
        top2 = ''
        top += ' '.join([feature_names[i]
                         for i in topic.argsort()[:-n_top_words - 1:-1]])
        top2 += ''.join(str(sorted(topic)[:-n_top_words - 1:-1]))

        topico.append(str(top))

    return topico


In [57]:
def print_results(model, tfidf_feature_names, cluwords_freq, cluwords_docs,
                  dataset, path_to_save_results, path_to_save_model):
    print(path_to_save_results)
    for t in [5, 10, 20]:
        with open('{}/result_topic_{}.txt'.format(path_to_save_results, t), 'w') as f_res:
            f_res.write('Topics {}\n'.format(t))
            topics = top_words(model, tfidf_feature_names, t)
            f_res.write('{}\n'.format(topics))

            coherence = Evaluation.coherence(topics, cluwords_freq, cluwords_docs)
            f_res.write('Coherence: {} ({})\n'.format(np.round(np.mean(coherence), 4), np.round(np.std(coherence), 4)))
            f_res.write('{}\n'.format(coherence))

            pmi, npmi = Evaluation.pmi(topics, cluwords_freq, cluwords_docs,
                                       sum([freq for word, freq in cluwords_freq.items()]), t)
            f_res.write('PMI: {} ({})\n'.format(np.round(np.mean(pmi), 4), np.round(np.std(pmi), 4)))
            f_res.write('{}\n'.format(pmi))
            f_res.write('NPMI: {} ({})\n'.format(np.round(np.mean(npmi), 4), np.round(np.std(npmi), 4)))
            f_res.write('{}\n'.format(npmi))

            w2v_l1 = Evaluation.w2v_metric(topics, t, path_to_save_model, 'l1_dist', dataset)
            f_res.write('W2V-L1: {} ({})\n'.format(np.round(np.mean(w2v_l1), 4), np.round(np.std(w2v_l1), 4)))
            f_res.write('{}\n'.format(w2v_l1))

            f_res.close()

In [81]:
# Variaveis que voce deve alterar:
DATASETS_PATH = """datasets"""
PATH_TO_SAVE_RESULTS = """results"""
PATH_TO_SAVE_MODEL = """word_emb_models/dataset_artigos_models"""
EMBEDDINGS_FILE_PATH = """word_emb_models/l2v.vec"""
DATASET = "artigos"
N_THREADS = 4
N_COMPONENTS = 10

# Nao precisa alterar essas:
HAS_CLASS = False
CLASS_PATH = """"""
EMBEDDINGS_BIN_TYPE = False

In [82]:
# RUN ONE TIME
n_words = create_embedding_models(dataset=DATASET,
                                  embedding_file_path=EMBEDDINGS_FILE_PATH,
                                  embedding_type=EMBEDDINGS_BIN_TYPE,
                                  datasets_path=DATASETS_PATH,
                                  path_to_save_model=PATH_TO_SAVE_MODEL)

Embedding model read in 1.354s.


TypeError: create_embedding_models() missing 4 required positional arguments: 'embedding_file_path', 'embedding_type', 'datasets_path', and 'path_to_save_model'

In [None]:
threshold = 0.4
cossine_filter = 0.8
word_count = n_words
k = n_words
algorithm_type = "knn_cosine"
embedding_file_path = """{}/{}.txt""".format(PATH_TO_SAVE_MODEL, DATASET)
dataset_file_path = """{}/{}Pre.txt""".format(DATASETS_PATH, DATASET)
path_to_save_results = '{}/{}'.format(PATH_TO_SAVE_RESULTS, DATASET)

try:
    os.mkdir('{}'.format(path_to_save_results))
except FileExistsError:
    pass

In [None]:
# Codigo das Cluwords
Cluwords(algorithm=algorithm_type,
         embedding_file_path=embedding_file_path,
         n_words=word_count,
         k_neighbors=k,
         threshold=threshold,
         n_jobs=N_THREADS
         )

In [None]:
cluwords = CluwordsTFIDF(dataset_file_path=dataset_file_path,
                         n_words=word_count,
                         cossine_filter=cossine_filter,
                         path_to_save_cluwords=path_to_save_results,
                         class_file_path=CLASS_PATH,
                         has_class=HAS_CLASS)
print('Computing TFIDF...')
cluwords_tfidf = cluwords.fit_transform()
# Convert the cluwords_tfidf array matrix to a sparse cluwords
cluwords_tfidf = csr_matrix(cluwords_tfidf)

In [None]:
# Fit the NMF model
print("\nFitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..." % (cluwords.n_documents, cluwords.n_cluwords))

nmf = NMF(n_components=N_COMPONENTS,
          random_state=1,
          alpha=.1,
          l1_ratio=.5).fit(cluwords_tfidf)


with open('{}/matrix_w.txt'.format(path_to_save_results), 'w') as f:
    w = nmf.fit_transform(cluwords_tfidf)  # matrix W = m x k
    h = nmf.components_.transpose()  # matrix H = n x k
    print('W: {} H:{}'.format(w.shape, h.shape))
    for x in range(w.shape[0]):
        for y in range(w.shape[1]):
            f.write('{} '.format(w[x][y]))
        f.write('\n')
    f.close()

tfidf_feature_names = list(cluwords.vocab_cluwords)

# -----------------------------
# Avaliação dos resultados
# Os resultados serao salvos na pasta results (nesse caso)
n_cluwords, cluwords_vocab, cluwords_freq, cluwords_docs = Evaluation.count_tf_idf_repr(
    cluwords.vocab_cluwords, cluwords_tfidf)

# print('n_terms: {}'.format(n_cluwords))
# print('words1: {}'.format(cluwords_vocab))
# print('word_frequency: {}'.format(cluwords_freq))
# print('term_docs: {}'.format(cluwords_docs))

print_results(model=nmf,
              tfidf_feature_names=tfidf_feature_names,
              cluwords_freq=cluwords_freq,
              cluwords_docs=cluwords_docs,
              dataset=DATASET,
              path_to_save_results=path_to_save_results,
              path_to_save_model=PATH_TO_SAVE_MODEL
              )
