In [10]:
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.porter import PorterStemmer
import re
from nltk.tokenize import RegexpTokenizer
import tomotopy as tp
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
import itertools
import random

In [4]:
# Shared NLTK helpers, created once and reused by preprocess_text below.
# The stemmer reduces each token to its Porter stem.
stemmer = PorterStemmer()
# With the pattern r"\w+" every run of word characters becomes one token, so
# punctuation and whitespace never reach the stemmer.
tokenizer = RegexpTokenizer(r"\w+")


def preprocess_text(text: str) -> str:
    """Tokenise ``text``, stem every token and join the stems with spaces.

    Tokens come from the module-level ``tokenizer`` and each one is passed
    through the module-level ``stemmer``.

    Args:
        text: Raw document text.

    Returns:
        The stemmed tokens joined by a single space; an empty string when
        ``text`` contains no tokens.
    """
    # No whitespace normalisation is needed first: the tokenizer only emits
    # runs of word characters, so blanks, tabs and newlines can never end up
    # inside (or change) a token.
    return " ".join(stemmer.stem(token) for token in tokenizer.tokenize(text))


In [5]:
# The dataset is looked up in the directory the notebook is run from.
data_path = Path.cwd().joinpath("findal_df.json")


In [6]:
# Load the dataset and drop rows without text, because the preprocessing
# step applies string operations to every value of the "text" column.
# ``subset`` is passed as a list: a bare string is only accepted by
# pandas >= 1.5, while the list form works on every version.
df = pd.read_json(data_path).dropna(subset=["text"])

In [7]:
# Add a column holding the tokenised and stemmed form of every document.
df["text_pp"] = df["text"].apply(preprocess_text)

In [9]:
import numpy as np
from typing import Any
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error


def _get_params_combinations(
    self, hyper_params: dict[str, list]
) -> list[dict]:
    keys, values = zip(*hyper_params.items())
    return [dict(zip(keys, v)) for v in itertools.product(*values)]


def one_model_train(
    X_train: pd.Series,
    y_train: pd.Series,
    params: dict[str, Any],
    *,
    n_iterations: int = 1020,
    chunk_size: int = 20,
):
    """Build and train a tomotopy sLDA model on the given documents.

    Args:
        X_train: Documents, one whitespace-separated token string per row
            (accessed positionally through ``.iloc``).
        y_train: Response value of each document, positionally aligned with
            ``X_train``.
        params: Keyword arguments forwarded unchanged to ``tp.SLDAModel``.
        n_iterations: Upper bound of the training schedule; ``train`` is
            called once per step of ``range(0, n_iterations, chunk_size)``.
        chunk_size: Number of iterations requested per ``train`` call.

    Returns:
        The trained ``tp.SLDAModel``.

    Raises:
        ValueError: If ``X_train`` and ``y_train`` differ in length.
    """
    if len(X_train) != len(y_train):
        raise ValueError(
            f"X_train and y_train must have the same length, "
            f"got {len(X_train)} and {len(y_train)}"
        )

    slda = tp.SLDAModel(**params)

    # Convert the responses once, not once per document.
    responses = np.asarray(y_train)
    for position, response in enumerate(responses):
        slda.add_doc(
            X_train.iloc[position].strip().split(), y=[float(response)]
        )

    # Train in fixed-size chunks; the defaults reproduce the original
    # schedule of 51 calls of 20 iterations each.
    for _ in range(0, n_iterations, chunk_size):
        slda.train(chunk_size)

    return slda


def one_model_eval(
    model: "tp.SLDAModel", X_test: pd.Series, y_test: pd.Series
):
    """Score a trained sLDA model on held-out documents.

    Args:
        model: Trained model providing ``make_doc``, ``infer`` and
            ``estimate``.
        X_test: Documents, one whitespace-separated token string per item.
        y_test: True response values, in the same order as ``X_test``.

    Returns:
        The mean absolute percentage error between ``y_test`` and the
        model's estimated responses.
    """
    test_preds = []
    for text in X_test:
        # Tokenise exactly as in training. Handing the raw string to
        # make_doc would make every single character a "word".
        slda_test_doc = model.make_doc(text.strip().split())
        # An unseen document has to be inferred before it can be estimated.
        model.infer(slda_test_doc)
        # estimate() returns an array with one entry per response variable;
        # the model is trained with a single response, so take the first.
        estimate = np.ravel(model.estimate(slda_test_doc))
        test_preds.append(float(estimate[0]))

    return mean_absolute_percentage_error(y_true=y_test, y_pred=test_preds)


def model_cross_validate(
    X: pd.Series, y: pd.Series, params: dict[str, Any], n_splits: int = 3
):
    """Cross-validate one hyper-parameter combination with K-fold splits.

    For every fold a fresh model is trained on the training part and scored
    on the held-out part.

    Args:
        X: Documents, indexed positionally through ``.iloc``.
        y: Response values, positionally aligned with ``X``.
        params: Hyper-parameters passed on to ``one_model_train``.
        n_splits: Number of folds handed to ``KFold``.

    Returns:
        A dict holding every entry of ``params`` plus ``"mape"``, the mean of
        the per-fold scores returned by ``one_model_eval``.
    """
    fold_scores = []
    kf = KFold(n_splits=n_splits)

    for train_index, test_index in kf.split(X):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Fit on the training fold only; the held-out fold must stay unseen
        # until evaluation.
        model = one_model_train(X_train=X_train, y_train=y_train, params=params)
        fold_scores.append(
            one_model_eval(model=model, X_test=X_test, y_test=y_test)
        )

    return {**params, "mape": np.mean(fold_scores)}


def hypreropt(
    X: pd.Series,
    y: pd.Series,
    hyper_params: dict[str, list],
    n_iter: int = 20,
) -> pd.DataFrame:
    """Randomised search over a hyper-parameter grid.

    All combinations of ``hyper_params`` are generated and shuffled, and the
    first ``n_iter`` of them are cross-validated.

    Args:
        X: Documents passed on to ``model_cross_validate``.
        y: Response values passed on to ``model_cross_validate``.
        hyper_params: Mapping of parameter name to list of candidate values.
        n_iter: Maximum number of combinations to evaluate.

    Returns:
        A DataFrame with one row per evaluated combination, built from the
        dicts returned by ``model_cross_validate``.
    """
    # The helper's signature is ``(self, hyper_params)``; the leading slot is
    # not needed here, so the grid is passed explicitly by keyword.
    params_combinations = _get_params_combinations(
        None, hyper_params=hyper_params
    )
    # Shuffle so that the first n_iter entries are a random sample of the grid.
    random.shuffle(params_combinations)

    results = [
        model_cross_validate(X=X, y=y, params=params)
        for params in params_combinations[:n_iter]
    ]

    return pd.DataFrame(results)


In [11]:
# Scratch check: slicing with [:2] keeps only the first two list items, the
# same kind of slice as `params_combinations[:n_iter]` in hypreropt.
[1, 3, 5, 1][:2]

[1, 3]

In [None]:
# Search grid in the shape `_get_params_combinations` expects: parameter name
# -> list of candidate values (a one-element list pins that parameter).
# NOTE(review): presumably passed to hypreropt, in which case every key must
# be a valid tp.SLDAModel keyword argument — confirm against the caller.
# Parameter meanings below follow the tomotopy SLDAModel documentation.
hyper_params = dict(
    k=[20],  # number of topics
    min_df=[0],  # minimum document frequency; rarer tokens are removed "from the bottom"
    rm_top=[0],  # how many of the most frequent tokens are removed "from the top"
    vars=["l"],  # response-variable type: "l" = linear (real-valued); "b" would be binary
    alpha=[0.1],  # Dirichlet prior on the document-topic distribution
    eta=[0.01],  # Dirichlet prior on the topic-word distribution
    mu=[0],  # prior mean of the regression coefficients
    nu_sq=[1],  # prior variance of the regression coefficients
    glm_param=[1],  # parameter of the generalised linear model
    seed=[123],  # random seed of the model
)
