In [1]:
import pandas as pd
from pathlib import Path
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.porter import PorterStemmer
import re
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import xgboost as xgb
import dalex as dx
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
df = pd.read_json(Path.cwd() / "findal_df.json")

In [3]:
df

Unnamed: 0,title,link,citations,text
0,Continual Reinforcement Learning with TELLA,http://arxiv.org/pdf/2208.04287v1,2,Workshop Track - 1st Conference on Lifelong Le...
1,An exact mapping between the Variational Renor...,http://arxiv.org/pdf/1410.3831v1,295,arXiv:1410.3831v1 [stat.ML] 14 Oct 2014An ex...
2,Learning Generative Models across Incomparable...,http://arxiv.org/pdf/1905.05461v2,69,Learning Generative Models across Incomparable...
3,On the Generalization Ability of Online Learni...,http://arxiv.org/pdf/1305.2505v1,74,On the Generalization Ability of Online Learni...
4,Geometric Understanding of Deep Learning,http://arxiv.org/pdf/1805.10451v2,110,Geometric Understanding of Deep Learning\nNa L...
...,...,...,...,...
1230,Musical Word Embedding: Bridging the Gap betwe...,http://arxiv.org/pdf/2008.01190v1,4,Musical Word Embedding: Bridging the Gap betwe...
1231,Metrics for Multi-Class Classification: an Ove...,http://arxiv.org/pdf/2008.05756v1,409,METRICS FOR MULTI -CLASS CLASSIFICATION :ANOVE...
1232,Intelligence plays dice: Stochasticity is esse...,http://arxiv.org/pdf/2008.07496v1,5,Intelligence plays dice: Stochasticity is esse...
1233,Can AutoML outperform humans? An evaluation on...,http://arxiv.org/pdf/2009.01564v2,24,Can AutoML outperform humans? An evaluation \n...


In [4]:
df['text'].isna().sum()

12

In [5]:
df.dropna(subset=['text'], inplace=True)

In [6]:
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')
def preprocess_text(text: str):
    return ' '.join([stemmer.stem(word) if len(word) > 2 and word not in ['uni', 'uni uni', 'uni uni uni', 'ieee','doi', 'vextendsingl', 'http', 'https', 'vextenddoubl', 'parenrightbig', 'parenleftbig'] else '' for word in tokenizer.tokenize(' '.join([re.sub(r'(\${1,2})(?:(?!\1)[\s\S])*\1', ' ', re.sub(r'[\s\d]+', ' ', word)) for word in text.split()]))])


In [7]:
df['text_pp'] = df['text'].apply(lambda x: preprocess_text(x))

In [8]:
tf_vectorizer = CountVectorizer(ngram_range = (1, 4),
                                max_df = 0.8,
                                min_df = 0.01,
                                tokenizer = tokenizer.tokenize,
                                stop_words='english'
)

In [9]:
df.head()

Unnamed: 0,title,link,citations,text,text_pp
0,Continual Reinforcement Learning with TELLA,http://arxiv.org/pdf/2208.04287v1,2,Workshop Track - 1st Conference on Lifelong Le...,workshop track confer lifelong learn agent c...
1,An exact mapping between the Variational Renor...,http://arxiv.org/pdf/1410.3831v1,295,arXiv:1410.3831v1 [stat.ML] 14 Oct 2014An ex...,arxiv stat oct exact map between the variat...
2,Learning Generative Models across Incomparable...,http://arxiv.org/pdf/1905.05461v2,69,Learning Generative Models across Incomparable...,learn gener model across incompar space charlo...
3,On the Generalization Ability of Online Learni...,http://arxiv.org/pdf/1305.2505v1,74,On the Generalization Ability of Online Learni...,the gener abil onlin learn algorithm for pai...
4,Geometric Understanding of Deep Learning,http://arxiv.org/pdf/1805.10451v2,110,Geometric Understanding of Deep Learning\nNa L...,geometr understand deep learn lei zhongxuan ...


In [10]:
tf = tf_vectorizer.fit_transform(df['text_pp'])

In [11]:
#params = {
#    'n_components': list(range(5, 150, 5)),
#    'learning_decay': np.arange(0.6, 0.9, 0.1),
#    'learning_offset': np.arange(10, 150, 10),
#    'perp_tol': [0.1],
#    'max_iter': [20],
#    'mean_change_tol': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2],
#    'max_doc_update_iter': [100]
#}
best_params = {'perp_tol': 0.1,
 'n_components': 5,
 'mean_change_tol': 0.01,
 'max_iter': 150,
 'max_doc_update_iter': 100,
 'learning_offset': 30,
 'learning_decay': 0.6,
 'verbose': 1,
 'n_jobs': -1}

lda = LatentDirichletAllocation(**best_params)
#rsearch = RandomizedSearchCV(lda, params, n_iter=30, verbose=3, cv=3, error_score='raise')
#rsearch.fit(tf)
lda.fit(tf)

iteration: 1 of max_iter: 150
iteration: 2 of max_iter: 150
iteration: 3 of max_iter: 150
iteration: 4 of max_iter: 150
iteration: 5 of max_iter: 150
iteration: 6 of max_iter: 150
iteration: 7 of max_iter: 150
iteration: 8 of max_iter: 150
iteration: 9 of max_iter: 150
iteration: 10 of max_iter: 150
iteration: 11 of max_iter: 150
iteration: 12 of max_iter: 150
iteration: 13 of max_iter: 150
iteration: 14 of max_iter: 150
iteration: 15 of max_iter: 150
iteration: 16 of max_iter: 150
iteration: 17 of max_iter: 150
iteration: 18 of max_iter: 150
iteration: 19 of max_iter: 150
iteration: 20 of max_iter: 150
iteration: 21 of max_iter: 150
iteration: 22 of max_iter: 150
iteration: 23 of max_iter: 150
iteration: 24 of max_iter: 150
iteration: 25 of max_iter: 150
iteration: 26 of max_iter: 150
iteration: 27 of max_iter: 150
iteration: 28 of max_iter: 150
iteration: 29 of max_iter: 150
iteration: 30 of max_iter: 150
iteration: 31 of max_iter: 150
iteration: 32 of max_iter: 150
iteration: 33 of 

LatentDirichletAllocation(learning_decay=0.6, learning_offset=30, max_iter=150,
                          mean_change_tol=0.01, n_components=5, n_jobs=-1,
                          verbose=1)

In [11]:
#best = rsearch.best_estimator_

In [12]:
#best.perplexity(tf)
lda.perplexity(tf)

9269.797920022853

In [43]:
pickle.dump(lda, open('model_test_new_regex.pkl', 'wb'))
#lda = pickle.load(open('model.pkl', 'rb'))

In [13]:
for index, component in enumerate(lda.components_): #taking model's components, i.e. P(w|z)
    zipped = zip(tf_vectorizer.get_feature_names(), component) #taking together tokens' names with components
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10] #top 10 terms per topic
    top_terms_list=list(dict(top_terms_key).keys()) #taking only tokens, no weights

    print("Topic "+str(index)+": ",top_terms_list) #prints top 10 tokens per topic



Topic 0:  ['quantum', 'graph', 'neural network', 'imag', 'represent', 'structur', 'physic', 'deep', 'layer', 'space']
Topic 1:  ['dataset', 'select', 'figur', 'decis', 'accuraci', 'class', 'classi', 'evalu', 'variabl', 'regress']
Topic 2:  ['bound', 'theorem', 'probabl', 'gradient', 'kernel', 'loss', 'proof', 'vector', 'estim', 'vextendsingl']
Topic 3:  ['layer', 'polici', 'meta', 'deep', 'agent', 'reinforc', 'reward', 'reinforc learn', 'action', 'represent']
Topic 4:  ['attack', 'adversari', 'ieee', 'imag', 'deep', 'detect', 'differ', 'commun', 'user', 'privaci']


In [14]:
lda_output = lda.transform(tf)

In [15]:
topics = [f'Topic {i}' for i in range(lda.n_components)]
docnames = df.index.tolist()
topics_df = pd.DataFrame(lda_output, columns=topics, index=docnames)

In [16]:
topics_df['citations'] = df['citations']

In [17]:
xg_model = xgb.XGBRegressor()
kf = KFold(n_splits=7)
cross_val_score(xg_model, topics_df[[f'Topic {i}' for i in range(lda.n_components)]], topics_df['citations'], scoring='neg_mean_absolute_percentage_error')

array([-14.78618517, -25.10303218, -22.18794748, -20.6434463 ,
       -16.85243759])

In [18]:
xg_model.fit(topics_df[[f'Topic {i}' for i in range(lda.n_components)]], topics_df['citations'])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [21]:
exp = dx.Explainer(xg_model, topics_df[['Topic 0', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4']], topics_df['citations'])

Preparation of a new explainer is initiated

  -> data              : 1223 rows 5 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 1223 values
  -> model_class       : xgboost.sklearn.XGBRegressor (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x0000018356AAE0D0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = -31.6, mean = 84.5, max = 1.16e+04
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -89.8, mean = 0.129, max = 1.24e+02
  -> model_info        : package xgboost

A new explainer has been created!


In [23]:
agg_profiles = exp.model_profile().plot(show=False)
agg_profiles.write_html('agg_profiles.html')

Calculating ceteris paribus: 100%|██████████| 5/5 [00:00<00:00, 19.23it/s]


In [24]:
agg_profiles

In [25]:
var_importance = exp.model_parts().plot(show=False)
var_importance.write_html('var_importance.html')

In [26]:
var_importance

In [27]:
for index, component in enumerate(lda.components_): #taking model's components, i.e. P(w|z)
    zipped = zip(tf_vectorizer.get_feature_names(), component) #taking together tokens' names with components
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10] #top 10 terms per topic
    top_terms_list=list(dict(top_terms_key).keys()) #taking only tokens, no weights

    print("Topic "+str(index)+": ",top_terms_list) #prints top 10 tokens per topic


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



Topic 0:  ['attack', 'adversari', 'human', 'knowledg', 'decis', 'user', 'detect', 'privaci', 'ieee', 'differ']
Topic 1:  ['bound', 'theorem', 'probabl', 'kernel', 'vector', 'estim', 'proof', 'loss', 'gradient', 'error']
Topic 2:  ['layer', 'quantum', 'imag', 'neural network', 'graph', 'deep', 'physic', 'represent', 'structur', 'vector']
Topic 3:  ['polici', 'deep', 'meta', 'agent', 'reinforc', 'represent', 'reward', 'reinforc learn', 'action', 'domain']
Topic 4:  ['dataset', 'select', 'accuraci', 'figur', 'tabl', 'regress', 'evalu', 'random', 'tree', 'error']


In [28]:
exp.model_performance()

Unnamed: 0,mse,rmse,r2,mae,mad
XGBRegressor,392.468776,19.810825,0.997745,13.057491,8.738222


In [21]:
variables = topics_df[['Topic 0', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4']]
target = topics_df[['citations']]

In [23]:
mean_absolute_percentage_error(xg_model.predict(variables), target)

1.162015251684972