In [1]:
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.porter import PorterStemmer
import re
from nltk.tokenize import RegexpTokenizer
import tomotopy as tp
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
import itertools
import random

In [2]:
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r"\w+")


def preprocess_text(text: str):
    return " ".join(
        [
            stemmer.stem(word)
            if len(word) > 2
            and word
            not in [
                "uni",
                "uni uni",
                "uni uni uni",
                "ieee",
                "doi",
                "vextendsingl",
                "http",
                "https",
                "vextenddoubl",
                "parenrightbig",
                "parenleftbig",
            ]
            else ""
            for word in tokenizer.tokenize(
                " ".join(
                    [
                        re.sub(
                            r"(\${1,2})(?:(?!\1)[\s\S])*\1",
                            " ",
                            re.sub(r"[\s\d]+", " ", word),
                        )
                        for word in text.split()
                    ]
                )
            )
        ]
    )


In [3]:
data_path = Path.cwd() /  "findal_df.json"


In [4]:
df = pd.read_json(data_path).dropna(subset=["text"])

In [5]:
df['text_pp'] = df['text'].apply(lambda x: preprocess_text(x))

In [6]:
import numpy as np
from typing import Any
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error


def _get_params_combinations(hyper_params: dict[str, list]) -> list[dict]:
    keys, values = zip(*hyper_params.items())
    return [dict(zip(keys, v)) for v in itertools.product(*values)]


def one_model_train(
    X_train: pd.DataFrame, y_train: pd.Series, params: dict[str, Any]
):
    slda = tp.SLDAModel(**params)
    for i in range(0, len(X_train)):
        slda.add_doc(
            X_train.iloc[i].strip().split(), y=[float(np.array(y_train)[i])]
        )
    for i in range(0, 1020, 20):
        slda.train(20, workers=0)

    return slda


def one_model_eval(
    model: tp.SLDAModel, X_test: pd.DataFrame, y_test: pd.Series
):
    test_preds = []
    for i in range(0, len(X_test)):
        slda_test_doc = model.make_doc(list(X_test)[i])
        model.infer(slda_test_doc, workers=0)
        test_preds.append(float(model.estimate(slda_test_doc)))

    return mean_absolute_percentage_error(y_true=y_test, y_pred=test_preds)


def model_cross_validate(
    X: pd.DataFrame, y: pd.Series, params: dict[str, Any]
):
    results = []
    kf = KFold(n_splits=3)

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        print(f"Fold {i}")
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        model = one_model_train(X_train=X_test, y_train=y_train, params=params)
        result = one_model_eval(model=model, X_test=X_test, y_test=y_test)
        results.append(result)

    return {**params, "mape": np.mean(results)}


def hypreropt(
    X: pd.DataFrame,
    y: pd.Series,
    hyper_params: dict[str, list],
    n_iter: int = 20,
) -> pd.DataFrame:
    params_combinations = _get_params_combinations(hyper_params)
    random.shuffle(params_combinations)

    results = []
    for i, params in enumerate(params_combinations[:n_iter]):
        print(f"Params {i}")
        result = model_cross_validate(X=X, y=y, params=params)
        results.append(result)
        
    return results


In [19]:
hyper_params = dict(
    k=[5, 10, 15, 20, 25],  # number of topics
    min_df=[0],  # DF of tokens to be removed "from the bottom"
    rm_top=[0, 1, 2, 5],  # how many tokens should be removed "from the top"
    vars=["l"],  # indicate binary response variable
    alpha=[0.1, 0.2, 0.3],
    eta=[0.01, 0.05, 0.1, 0.2],
    mu=[0, 0.1, 0.2],
    nu_sq=[1],
    glm_param=[1],
    seed=[123],
)


In [20]:
X = df["text_pp"]
y = df["citations"]

In [26]:
results = hypreropt(X, y, hyper_params, 30)

Params 0
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 1
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 2
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 3
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 4
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 5
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 6
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 7
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 8
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 9
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 10
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 11
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 12
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 13
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 14
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 15
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 16
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 17
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 18
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 19
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 20
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 21
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 22
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 23
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 24
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 25
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 26
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 27
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 28
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 29
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


In [27]:
result_df = pd.DataFrame(results)

In [28]:
result_df

Unnamed: 0,k,min_df,rm_top,vars,alpha,eta,mu,nu_sq,glm_param,seed,mape
0,20,0,0,l,0.3,0.2,0.2,1,1,123,401.554314
1,20,0,5,l,0.1,0.1,0.1,1,1,123,411.168225
2,10,0,5,l,0.3,0.05,0.1,1,1,123,410.345526
3,20,0,5,l,0.3,0.01,0.2,1,1,123,74.194412
4,20,0,5,l,0.1,0.01,0.0,1,1,123,25.392506
5,25,0,1,l,0.3,0.2,0.2,1,1,123,380.574457
6,20,0,1,l,0.2,0.01,0.0,1,1,123,112.171611
7,5,0,0,l,0.2,0.2,0.0,1,1,123,415.735967
8,15,0,1,l,0.3,0.01,0.2,1,1,123,103.041735
9,25,0,2,l,0.3,0.05,0.2,1,1,123,291.991895


In [29]:
result_df.to_csv("slda_results_2.csv")

In [60]:
best_model_params = {
    k: v
    for k, v in result_df.sort_values(by="mape", ascending=True)
    .iloc[0]
    .to_dict()
    .items()
    if k != "mape"
}


In [8]:
best_model_params = {'k': 25,
 'min_df': 0,
 'rm_top': 0,
 'vars': 'l',
 'alpha': 0.1,
 'eta': 0.01,
 'mu': 0.2,
 'nu_sq': 1,
 'glm_param': 1,
 'seed': 123}

In [9]:
best_model_cv_results = model_cross_validate(X, y, best_model_params)

Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


In [10]:
best_model_cv_results

{'k': 25,
 'min_df': 0,
 'rm_top': 0,
 'vars': 'l',
 'alpha': 0.1,
 'eta': 0.01,
 'mu': 0.2,
 'nu_sq': 1,
 'glm_param': 1,
 'seed': 123,
 'mape': 1.0728589518032574}

In [11]:
best_model = one_model_train(X, y, best_model_params)

  slda.train(20, workers=0)


In [15]:
best_model.save("best_model.bin")

In [12]:
# mdl = tp.SLDAModel.load('best_model.bin')
mdl = best_model

In [16]:
best_model.perplexity

390420.1534422796

In [18]:
from pprint import pprint

In [13]:
topic_list = []
for i in range(mdl.k):
    topic_list.append(mdl.get_topic_words(i))

In [14]:
topic_list

[[('network', 0.05360081046819687),
  ('layer', 0.05084865912795067),
  ('neural', 0.03615159913897514),
  ('and', 0.029881231486797333),
  ('deep', 0.02864701859652996),
  ('with', 0.027653975412249565),
  ('input', 0.016721026971936226),
  ('train', 0.015108511783182621),
  ('arxiv', 0.014210043475031853),
  ('learn', 0.012039531022310257)],
 [('and', 0.04313143342733383),
  ('learn', 0.03918875753879547),
  ('polici', 0.02442975528538227),
  ('the', 0.023782560601830482),
  ('agent', 0.019304275512695312),
  ('reward', 0.01599763333797455),
  ('state', 0.015926962718367577),
  ('reinforc', 0.015521536581218243),
  ('action', 0.01398910116404295),
  ('with', 0.013803125359117985)],
 [('the', 0.05114930495619774),
  ('and', 0.04657886549830437),
  ('for', 0.01724468544125557),
  ('network', 0.016319412738084793),
  ('model', 0.013253813609480858),
  ('data', 0.011184660717844963),
  ('system', 0.010020444169640541),
  ('time', 0.009517136961221695),
  ('physic', 0.009201934561133385),