In [1]:
import sys
import os
# Add an extra directory to the module search path (inserted at position 1).
# NOTE(review): this is an absolute, machine-specific path (presumably the
# project's code directory) — it will not resolve on another machine.
sys.path.insert(1, '/home/peps/Documents/tesis_codigo')

In [2]:
import pickle
from gensim.models import AuthorTopicModel
from gensim.models import LdaModel

In [3]:
# Load the preprocessing artifacts, stored together as one pickled 3-item sequence.
with open("pickles/preprocesamiento.pickle", "rb") as pickle_file:
    corpus, dictionary, author2doc = pickle.load(pickle_file)

In [4]:
# Sanity check: report the size of each loaded artifact.
print('# de autores: {:d}'.format(len(author2doc)))
print('# tokens unicos: {:d}'.format(len(dictionary)))
print('# de documentos: {:d}'.format(len(corpus)))

# de autores: 114
# tokens unicos: 19900
# de documentos: 41340


# Escogiendo el número de tópicos e hiperparámetros

## Hiperparámetros

In [5]:
from ray import tune
from ray.tune import track
from ray.tune.schedulers import ASHAScheduler
import numpy as np

In [6]:
# Search space for the hyperparameter sweep; the number of topics is held fixed.
# np.random is not seeded in this notebook, so the sampled values differ between runs.
search_space = dict(
    num_topics=100,
    # np.random.randint excludes the upper bound: passes is drawn from 1..9.
    passes=tune.sample_from(lambda spec: np.random.randint(1, 10)),
    # iterations is drawn from 50..9999.
    iterations=tune.sample_from(lambda spec: np.random.randint(50, 10000)),
    gamma_threshold=tune.uniform(1e-11, 0.001),
)

In [11]:
# class Trainable(tune.Trainable):
#     def _setup(self, config):
#         self.corpus = corpus
#         self.corpus_test = corpus
#         self.dictionary = dictionary
#         self.author2doc = author2doc
#         for key, value in config.items():
#             setattr(self, key, value)
    
#     def _train(self):
#         model = AuthorTopicModel(
#             corpus=self.corpus, 
#             num_topics=self.num_topics,
#             id2word=self.dictionary.id2token,
#             author2doc=self.author2doc, 
#             chunksize=2000, 
#             passes=self.passes, 
#             eval_every=2, 
#             iterations=self.iterations,
#             gamma_threshold=self.gamma_threshold
#         )
#         self.model = model
#         top_topics = model.top_topics(corpus)
#         tc = sum([t[1] for t in top_topics])
#         return {"topic_coherence": tc}
    
#     def _save(self, tmp_checkpoint_dir):
#         import os
#         checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.pth")
#         self.model.save(checkpoint_path)
#         return tmp_checkpoint_dir
 

In [20]:
def search_best_AuthorTopicModel(config):
    """Train one AuthorTopicModel for a Ray Tune trial and log its topic coherence.

    The model is trained on the notebook-level ``corpus``, ``dictionary`` and
    ``author2doc`` objects (they must be loaded before ``tune.run`` is called),
    scored with ``model.top_topics(corpus)`` and saved as ``model.save`` inside
    the trial directory.

    Parameters
    ----------
    config : dict
        Trial configuration. Must contain the keys ``"num_topics"``,
        ``"passes"``, ``"iterations"`` and ``"gamma_threshold"``.

    Logged metrics
    --------------
    topic_coherence
        Sum of the per-topic coherence scores (same value as before).
    mean_topic_coherence
        That sum divided by the number of topics returned. The sum scales
        with ``num_topics``, so prefer the mean when comparing trials that
        use different numbers of topics.
    """
    model = AuthorTopicModel(
        corpus=corpus,
        num_topics=config["num_topics"],
        id2word=dictionary.id2token,
        author2doc=author2doc,
        chunksize=2000,
        passes=config["passes"],
        # NOTE(review): presumably 0 turns off periodic evaluation during
        # training — confirm against the gensim documentation.
        eval_every=0,
        iterations=config["iterations"],
        gamma_threshold=config["gamma_threshold"],
    )

    # Element 1 of every top_topics entry is used as that topic's coherence score.
    top_topics = model.top_topics(corpus)
    tc = sum(t[1] for t in top_topics)
    # Size-independent score; NaN rather than ZeroDivisionError if no topics come back.
    mean_tc = tc / len(top_topics) if top_topics else float("nan")

    # Persist the trained model next to the trial's other outputs.
    model_path = os.path.join(track.trial_dir(), "model.save")
    model.save(model_path)

    track.log(topic_coherence=tc, mean_topic_coherence=mean_tc)

In [18]:
# Progress reporter passed to tune.run; the keyword arguments cap the number of
# progress/error rows shown and set the report frequency.
# NOTE(review): the positional True is presumably the `overwrite` flag — confirm
# against the installed Ray version.
reporter = tune.JupyterNotebookReporter(True, max_progress_rows=20, max_error_rows=20, max_report_frequency=5)

In [22]:
# Small run of the hyperparameter search: two sampled configurations.
analysis = tune.run(
    search_best_AuthorTopicModel,
    name="Hyper_search_test",
    config=search_space,
    num_samples=2,
    progress_reporter=reporter,
)

Trial name,status,loc,gamma_threshold,iterations,passes,iter,total time (s)
search_best_AuthorTopicModel_00000,TERMINATED,,0.000894836,4663,4,0,78.3047
search_best_AuthorTopicModel_00001,TERMINATED,,0.000543645,2953,2,0,52.3524


In [None]:
# df_analysis_hyper = analysis.dataframe()

In [None]:
# with open("pickles/hyper_search.pickle", "wb") as f:
#     pickle.dump(df_analysis_hyper , f)

In [None]:
# Load the cached hyperparameter-search results.
# pickle.load can execute arbitrary code from the file: only load pickles
# produced by this project.
with open("pickles/hyper_search.pickle", "rb") as f:
    df_analysis_hiper = pickle.load(f)
# The cells below read `df_analysis_hiper`; the original `df_analysis_hyper`
# spelling is kept bound to the same object so either name works.
df_analysis_hyper = df_analysis_hiper

In [None]:
# Show the hyperparameter-search results table.
# NOTE(review): presumably the object loaded from pickles/hyper_search.pickle —
# confirm this name matches the one assigned in the loading cell.
df_analysis_hiper

### gamma_threshold

In [None]:
import matplotlib.pyplot as plt
# Close every open figure, then start a fresh one.
plt.close('all')
plt.figure()

In [None]:
# Topic coherence against gamma_threshold, with the x-axis limited to [0, 0.001].
ax = df_analysis_hiper.plot.scatter(x="config/gamma_threshold", y="topic_coherence")
ax.set_xlim(0, 0.001)

In [None]:
from pandas.plotting import lag_plot

# Lag plot of the config/gamma_threshold column on a fresh figure,
# with both axes limited to [0, 0.001].
plt.close('all')
plt.figure()
lag_ax = lag_plot(df_analysis_hiper["config/gamma_threshold"])
lag_ax.set_xlim(0, 0.001)
lag_ax.set_ylim(0, 0.001)

In [None]:
from pandas.plotting import scatter_matrix
# plt.figure(figsize=(18*3, 16*3), dpi= 80*3)
# Use a 15x15 canvas for the pairwise plots (this changes the global default
# figure size for every later figure as well).
plt.rcParams["figure.figsize"] = (15, 15)
coherence_and_config_columns = [
    "topic_coherence",
    "config/gamma_threshold",
    "config/iterations",
    "config/passes",
]
scatter_matrix(df_analysis_hiper[coherence_and_config_columns], diagonal="kde")

In [None]:
# Configuration of the trial in `analysis` with the highest topic_coherence.
# NOTE(review): `analysis` is the in-memory result of tune.run, while the tables
# plotted in this notebook are loaded from pickles/hyper_search.pickle — confirm
# both come from the same search.
best_hiper = analysis.get_best_config(metric = "topic_coherence", mode='max')

In [None]:
# best_hiper_trial = analysis.get_best_trial(metric = "topic_coherence", mode='max')

In [None]:
# Display the selected best configuration.
best_hiper

## Tópicos

In [None]:
# Keep the best hyperparameters and sweep only the number of topics.
# np.random.randint excludes the upper bound: num_topics is drawn from 10..149.
search_space_topics = dict(
    best_hiper,
    num_topics=tune.sample_from(lambda spec: np.random.randint(10, 150)),
)

In [None]:
# Re-create the progress reporter for the topic search, with the same arguments
# as the reporter built for the hyperparameter search.
reporter = tune.JupyterNotebookReporter(True, max_progress_rows=20, max_error_rows=20, max_report_frequency=5)

In [None]:
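# %%time
# analysis = tune.run(search_best_AuthorTopicModel, config=search_space, num_samples=50, progress_reporter=reporter)
# NOTE(review): the call above passes `search_space` (num_topics fixed at 100), not the
# `search_space_topics` defined earlier in this section — confirm which configuration
# produced pickles/topic_search.pickle before re-running.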

In [None]:
# df_analysis_topic = analysis.dataframe()

In [None]:
# with open("pickles/topic_search.pickle", "wb") as f:
#     pickle.dump(df_analysis_topic , f)

In [None]:
# Load the cached results of the number-of-topics search.
with open("pickles/topic_search.pickle", "rb") as pickle_file:
    df_analysis_topic = pickle.load(pickle_file)

In [None]:
# List the columns available in the topic-search results.
df_analysis_topic.columns

In [None]:
# Row(s) of the topic search that reach the maximum topic_coherence.
topic_coherence_scores = df_analysis_topic["topic_coherence"]
df_analysis_topic.loc[topic_coherence_scores == topic_coherence_scores.max()]

In [None]:
# Topic coherence plotted against the number of topics of each trial.
df_analysis_topic.plot.scatter(x="config/num_topics", y = "topic_coherence")