In [10]:
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.porter import PorterStemmer
import re
from nltk.tokenize import RegexpTokenizer
import tomotopy as tp
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
import itertools
import random

In [4]:
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r"\w+")


def preprocess_text(text: str):
    return " ".join(
        [
            stemmer.stem(word)
            for word in tokenizer.tokenize(
                " ".join([re.sub(r"\s+", " ", word) for word in text.split()])
            )
        ]
    )


In [5]:
data_path = Path.cwd() /  "findal_df.json"


In [6]:
df = pd.read_json(data_path).dropna(subset="text")

In [7]:
df['text_pp'] = df['text'].apply(lambda x: preprocess_text(x))

In [38]:
import numpy as np
from typing import Any
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error


def _get_params_combinations(hyper_params: dict[str, list]) -> list[dict]:
    keys, values = zip(*hyper_params.items())
    return [dict(zip(keys, v)) for v in itertools.product(*values)]


def one_model_train(
    X_train: pd.DataFrame, y_train: pd.Series, params: dict[str, Any]
):
    slda = tp.SLDAModel(**params)
    for i in range(0, len(X_train)):
        slda.add_doc(
            X_train.iloc[i].strip().split(), y=[float(np.array(y_train)[i])]
        )
    for i in range(0, 1020, 20):
        slda.train(20, workers=0)

    return slda


def one_model_eval(
    model: tp.SLDAModel, X_test: pd.DataFrame, y_test: pd.Series
):
    test_preds = []
    for i in range(0, len(X_test)):
        slda_test_doc = model.make_doc(list(X_test)[i])
        model.infer(slda_test_doc, workers=0)
        test_preds.append(float(model.estimate(slda_test_doc)))

    return mean_absolute_percentage_error(y_true=y_test, y_pred=test_preds)


def model_cross_validate(
    X: pd.DataFrame, y: pd.Series, params: dict[str, Any]
):
    results = []
    kf = KFold(n_splits=3)

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        print(f"Fold {i}")
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        model = one_model_train(X_train=X_test, y_train=y_train, params=params)
        result = one_model_eval(model=model, X_test=X_test, y_test=y_test)
        results.append(result)

    return {**params, "mape": np.mean(results)}


def hypreropt(
    X: pd.DataFrame,
    y: pd.Series,
    hyper_params: dict[str, list],
    n_iter: int = 20,
) -> pd.DataFrame:
    params_combinations = _get_params_combinations(hyper_params)
    random.shuffle(params_combinations)

    results = []
    for i, params in enumerate(params_combinations[:n_iter]):
        print(f"Params {i}")
        result = model_cross_validate(X=X, y=y, params=params)
        results.append(result)
        
    return results


In [39]:
hyper_params = dict(
    k=[5, 10, 15, 20, 25],  # number of topics
    min_df=[0],  # DF of tokens to be removed "from the bottom"
    rm_top=[0],  # how many tokens should be removed "from the top"
    vars=["l"],  # indicate binary response variable
    alpha=[0.1],
    eta=[0.01, 0.05, 0.1, 0.2],
    mu=[0, 0.1, 0.2],
    nu_sq=[1],
    glm_param=[1],
    seed=[123],
)


In [43]:
X = df["text_pp"]
y = df["citations"]

In [44]:
results = hypreropt(X, y, hyper_params, 5)

Params 0
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 1
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 2
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 3
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Params 4
Fold 0


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 1


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


Fold 2


  slda.train(20, workers=0)
  slda_test_doc = model.make_doc(list(X_test)[i])


In [47]:
result_df = pd.DataFrame(results)

In [49]:
result_df.to_csv("slda_results.csv")

In [60]:
best_model_params = {
    k: v
    for k, v in result_df.sort_values(by="mape", ascending=True)
    .iloc[0]
    .to_dict()
    .items()
    if k != "mape"
}


In [61]:
best_model_params

{'k': 25,
 'min_df': 0,
 'rm_top': 0,
 'vars': 'l',
 'alpha': 0.1,
 'eta': 0.01,
 'mu': 0.2,
 'nu_sq': 1,
 'glm_param': 1,
 'seed': 123}

In [63]:
best_model = one_model_train(X, y, best_model_params)

  slda.train(20, workers=0)


In [71]:
best_model.save("best_model.bin")

In [80]:
mdl = tp.SLDAModel.load('best_model.bin')
