In [1]:
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.porter import PorterStemmer
import re
from nltk.tokenize import RegexpTokenizer
# import tomotopy as tp
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
import itertools

In [2]:
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r"\w+")


def preprocess_text(text: str):
    return " ".join(
        [
            stemmer.stem(word)
            for word in tokenizer.tokenize(
                " ".join([re.sub(r"\s+", " ", word) for word in text.split()])
            )
        ]
    )


In [3]:
data_path = Path.cwd() / "data" / "findal_df.json"


In [4]:
df = pd.read_json(data_path).dropna(subset="text")

In [5]:
df['text_pp'] = df['text'].apply(lambda x: preprocess_text(x))

In [6]:
df["text_pp"]

0       workshop track 1st confer on lifelong learn ag...
1       arxiv 1410 3831v1 stat ml 14 oct 2014an exact ...
2       learn gener model across incompar space charlo...
3       on the gener abil of onlin learn algorithm for...
4       geometr understand of deep learn na lei zhongx...
                              ...                        
1230    music word embed bridg the gap between listen ...
1231    metric for multi class classif anoverview a w ...
1232    intellig play dice stochast is essenti for mac...
1233    can automl outperform human an evalu on popula...
1234    set valu and variat analysi manuscript no will...
Name: text_pp, Length: 1223, dtype: object

In [15]:
tf_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    max_df=0.75,
    min_df=0.01,
    tokenizer=tokenizer.tokenize,
    stop_words="english",
)


In [16]:
tf = tf_vectorizer.fit_transform(df['text_pp'])



In [19]:
import numpy as np
from pyparsing import Any
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error


def _get_params_combinations(
    self, hyper_params: dict[str, list]
) -> list[dict]:
    keys, values = zip(*hyper_params.items())
    return [dict(zip(keys, v)) for v in itertools.product(*values)]


def one_model_train(
    X_train: pd.DataFrame, y_train: pd.Series, params: dict[str, Any]
):
    slda = tp.SLDAModel(**params)
    for i in range(0, len(X_train)):
        slda.add_doc(
            X_train.iloc[i].strip().split(), y=[float(np.array(y_train)[i])]
        )
    for i in range(0, 1020, 20):
        slda.train(20)

    return slda


def one_model_eval(
    model: tp.SLDAModel, X_test: pd.DataFrame, y_test: pd.Series
):
    test_preds = []
    for i in range(0, len(X_test)):
        slda_test_doc = model.make_doc(list(X_test)[i])
        model.infer(slda_test_doc)
        test_preds.append(float(model.estimate(slda_test_doc)))

    return mean_absolute_percentage_error(y_true=y_test, y_pred=test_preds)


def model_cross_validate(
    X: pd.DataFrame, y: pd.Series, params: dict[str, Any]
):
    results = []
    kf = KFold(n_splits=3)

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        model = one_model_train(X_train=X_test, y_train=y_train, params=params)
        result = one_model_eval(model=model, X_test=X_test, y_test=y_test)
        result.append(results)

    return {**params, "mape": np.mean(results)}


def hypreropt(
    X: pd.DataFrame,
    y: pd.Series,
    hyper_params: dict[str, list],
) -> pd.DataFrame:
    params_combinations = _get_params_combinations(hyper_params)

    results = []
    for params in params_combinations:
        result = model_cross_validate(X=X, y=y, params=params)

    return results


NameError: name 'tp' is not defined