# Initial configurations

In [1]:
# Params
# TRANSACTON: when False, the rows of each paper are merged into a single
# text per (year, id) right after the CSV is loaded.
TRANSACTON = False
# SUBSET_SIZE: fraction multiplied by the largest paper id in the
# "Select subset" cell.
SUBSET_SIZE = 0.4

# Default document-frequency bounds used by find_stop_words.
MAX_DF = 0.6
MIN_DF = 2

In [2]:
import datetime as dt
import pandas as pd
import warnings
import os

# Silence DeprecationWarning messages for the rest of the notebook
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Resolve the absolute location of the papers file
base_path = "../assets/data/nips"
filename = os.path.abspath(os.path.join(base_path, "transaction/papers.csv"))

# Load the papers and replace missing texts with empty strings
df = pd.read_csv(filename)
df["text"] = df["text"].fillna("")

# Unless working at transaction level, merge the rows of each paper
# (same year and id) into one space-separated text
if not TRANSACTON:
    df = df.groupby(["year", "id"], as_index=False).agg({"text": " ".join})

# Preview the resulting frame
df.head()

Unnamed: 0,year,id,text
0,1987,1,self organization associative database applica...
1,1987,2,capacity kanerva associative memory exponentia...
2,1987,3,supervise learning probability distribution ne...
3,1987,4,constrained differential optimization constrai...
4,1987,5,towards organize principle layered perceptual ...


## Data properties

In [3]:
# Distinct paper ids versus number of rows in the frame
n_documents = df["id"].nunique()
n_transactions = len(df)

print("No. of documents: {:,}".format(n_documents))
print("No. of transactions: {:,}".format(n_transactions))

No. of documents: 7,241
No. of transactions: 7,241


## Select subset

In [4]:
# Select subset
# NOTE(review): subset_id (largest id scaled by SUBSET_SIZE) is computed here
# but the filter below uses a fixed year cut-off instead — confirm which
# criterion is intended.
max_id = df["id"].max()
subset_id = max_id * SUBSET_SIZE

# Keep only the papers published up to and including 2005
published_until_2005 = df["year"] <= 2005
subset_df = df.loc[published_until_2005].reset_index(drop=True)

n_subset_documents = subset_df["id"].nunique()
n_subset_transactions = len(subset_df)

print("No. of documents: {:,}".format(n_subset_documents))
print("No. of transactions: {:,}".format(n_subset_transactions))

No. of documents: 2,920
No. of transactions: 2,920


## Frequency based stop words

In [5]:
def _df_threshold_to_proportion(threshold, n_docs):
    """Convert a document-frequency threshold to a proportion of documents.

    Integers (and any value above 1) are absolute document counts and are
    divided by ``n_docs``; other values are already proportions and are
    returned unchanged. With no documents a count converts to 0.0 instead
    of raising ZeroDivisionError.
    """
    is_int = isinstance(threshold, int) and not isinstance(threshold, bool)
    if is_int or threshold > 1:
        return threshold / n_docs if n_docs else 0.0
    return threshold


def find_stop_words(data: pd.Series, max_df=MAX_DF, min_df=MIN_DF, quiet=False):
    """
    Split the vocabulary of a corpus into frequency-based stop words and
    kept words, using each word's document frequency (share of documents
    that contain the word at least once).

    Parameters
    ----------
    data : pd.Series
        One whitespace-separated text per document.
    max_df, min_df : int or float
        Upper and lower document-frequency bounds:
            * float in [0, 1]: proportion of documents
            * int (or any value > 1): absolute number of documents
        Words with a document frequency strictly above ``max_df`` or
        strictly below ``min_df`` are reported as stop words.
    quiet : bool
        When False, print a summary of the split and the elapsed time.

    Returns
    -------
    (freq_stop_words, vocabulary) : tuple of lists
        The words outside the bounds and the words inside them.
    """

    t0 = dt.datetime.now()

    n_docs = len(data)

    # Get unique words for each document
    unique = data.apply(lambda x: list(set(x.split())))

    # Calculate the words document frequency
    # (documents without words explode to NaN, which value_counts ignores)
    words_document_frequency = unique.explode().value_counts() / n_docs

    # Express both bounds as proportions. The integer check makes a count of
    # exactly 1 (e.g. min_df=1) a count rather than "100% of the documents".
    top = _df_threshold_to_proportion(max_df, n_docs)
    bottom = _df_threshold_to_proportion(min_df, n_docs)

    # Find words
    mask_top = words_document_frequency > top
    mask_bottom = words_document_frequency < bottom
    mask_wdf = mask_top | mask_bottom
    freq_stop_words = list(words_document_frequency[mask_wdf].index)
    vocabulary = list(words_document_frequency[~mask_wdf].index)

    if not quiet:
        # Right-align the counters to the width of the largest one
        length = len(str(len(words_document_frequency))) + 1
        print(f"{len(words_document_frequency)}".rjust(length), "- raw vocabulary length")
        print(f"{len(vocabulary)}".rjust(length), "- new vocabulary length\n")
        print(f"{mask_wdf.sum()}".rjust(length), "- new stop words founded")
        print(f"{mask_top.sum()}".rjust(length), f"- df above  {top:.8f}")
        print(f"{mask_bottom.sum()}".rjust(length), f"- df bellow {bottom:.8f}\n")
        print(f"Max df: {words_document_frequency.max():.8f}")
        print(f"Min df: {words_document_frequency.min():.8f}\n")
        print(f"Execution in {dt.datetime.now() - t0}")

    return freq_stop_words, vocabulary

In [6]:
# Split the subset vocabulary into frequency-based stop words and kept words
# (default MAX_DF / MIN_DF bounds, summary printed)
stop_words_and_vocabulary = find_stop_words(subset_df["text"])
freq_stop_words, vocabulary = stop_words_and_vocabulary

 87471 - raw vocabulary length
 29269 - new vocabulary length

 58202 - new stop words founded
   111 - df above  0.60000000
 58091 - df bellow 0.00068493

Max df: 0.98321918
Min df: 0.00034247

Execution in 0:00:01.462875


# Data transformation: Corpus and Dictionary

In [7]:
import gensim.corpora as corpora

# Create Dictionary (each kept vocabulary word is fed as a one-token document)
id2word = corpora.Dictionary([[word] for word in vocabulary])

# Create Corpus with Term Document Frequency
# (allow_update=False keeps words outside the dictionary out of the corpus)
subset_df["corpus"] = subset_df.text.str.split().apply(id2word.doc2bow, allow_update=False)

# Removing empty documents. The explicit copy makes filterred_df an
# independent frame, so the assignment below is not a write into a slice of
# subset_df (which triggers SettingWithCopyWarning and may not take effect).
filterred_df = subset_df[subset_df["corpus"].apply(len) > 0].copy()

# Re-adjust text (indirect stop-word filter): rebuild each text from its
# bag-of-words, repeating every kept word as many times as it was counted
filterred_df["text"] = filterred_df["corpus"].apply(
    lambda bow: " ".join(id2word[token_id] for token_id, count in bow for _ in range(count))
)

# Latent Dirichlet Allocation (LDA)

## Hyperparameter Tuning

In [8]:
# Bag-of-words corpus and tokenised texts of the non-empty documents
corpus = filterred_df["corpus"].tolist()
texts = filterred_df["text"].str.split().tolist()

# Absolute path of the CSV that stores the tuning results
models_dir = "../assets/models"
models_path = os.path.abspath(os.path.join(models_dir, "nips-tuning-20200928.csv"))

In [9]:
from gensim.models import CoherenceModel

# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, texts,
                             random_state=100, chunksize=100, passes=10):
    """
    Train one LDA model and score it.

    Parameters
    ----------
    corpus : bag-of-words corpus passed to LdaMulticore and to log_perplexity
    dictionary : id-to-word mapping used by the model and the coherence score
    k : number of topics
    a : alpha value passed to the model
    b : eta value passed to the model
    texts : tokenised documents handed to CoherenceModel
    random_state, chunksize, passes : forwarded to LdaMulticore; the defaults
        are the values previously hard-coded here

    Returns
    -------
    (coherence, perplexity) : the 'c_v' coherence from CoherenceModel and the
        value returned by ``lda_model.log_perplexity(corpus)``
    """
    # Imported here so the function does not depend on a bare `import gensim`
    # having been executed in some other cell before it is called.
    from gensim.models import LdaMulticore

    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=random_state,
        chunksize=chunksize,
        passes=passes,
        alpha=a,
        eta=b,
        per_word_topics=True
    )

    perplexity_lda = lda_model.log_perplexity(corpus)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence(), perplexity_lda

In [10]:
from tqdm import tqdm 
import datetime as dt
import numpy as np
import pandas as pd
import gensim

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 10
max_topics = 161
step_size = 5
topics_range = list(range(min_topics, max_topics, step_size))

# Alpha parameter
alpha = [0.1]

# Beta parameter
beta = [0.1]

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [
    corpus
]

corpus_title = [
    '100% Corpus'
]

# One list per output column; every trained model appends one value to each
model_results = {
    'Validation_Set': [],
    'Topics': [],
    'Alpha': [],
    'Beta': [],
    'Coherence': [],
    'Perplexity': [],
    'Train_Time': []
}

# Can take a long time to run: set to False to skip the grid search
RUN_TUNING = True

if RUN_TUNING:
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

    # The context manager closes the bar once, after ALL validation sets are
    # done (closing it inside the outer loop would end it after the first set)
    with tqdm(total=total_runs, position=0, leave=True) as pbar:
        # iterate through validation corpuses
        for title, corpus_set in zip(corpus_title, corpus_sets):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        start = dt.datetime.now()
                        cv, pl = compute_coherence_values(
                            corpus=corpus_set,
                            dictionary=id2word,
                            k=k, a=a, b=b,
                            texts=texts
                        )
                        elapsed = dt.datetime.now() - start

                        # Save the model results
                        model_results['Validation_Set'].append(title)
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)
                        model_results['Perplexity'].append(pl)
                        # total_seconds() keeps whole days, which .seconds discards
                        model_results['Train_Time'].append(int(elapsed.total_seconds()))

                        pbar.update(1)

            # Checkpoint everything gathered so far after each validation set
            pd.DataFrame(model_results).to_csv(models_path, index=False)

100%|██████████| 31/31 [3:44:23<00:00, 434.30s/it]  


### Investigate results

In [11]:
import pandas as pd

# Load the tuning results, highest coherence first, with a fresh 0..n-1 index
df = (
    pd.read_csv(models_path)
    .sort_values('Coherence', ascending=False)
    .reset_index(drop=True)
)

# After the sort, the first row holds the best-scoring parameters
best_row = df.loc[0]

# Every run that shares the best row's validation set, alpha and beta,
# ordered by number of topics
same_config = (
    df['Validation_Set'].eq(best_row['Validation_Set'])
    & df['Alpha'].eq(best_row['Alpha'])
    & df['Beta'].eq(best_row['Beta'])
)
best_df = df.loc[same_config].sort_values('Topics')

# Display tuning results
df.head()

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence,Perplexity,Train_Time
0,100% Corpus,130,0.1,0.1,0.502424,-7.719064,555
1,100% Corpus,35,0.1,0.1,0.502084,-7.734922,309
2,100% Corpus,50,0.1,0.1,0.499184,-7.715846,340
3,100% Corpus,150,0.1,0.1,0.497913,-7.738688,607
4,100% Corpus,30,0.1,0.1,0.49733,-7.757215,281


In [12]:
# Ten best-scoring runs (df is already sorted by coherence, best first)
df.iloc[:10]

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence,Perplexity,Train_Time
0,100% Corpus,130,0.1,0.1,0.502424,-7.719064,555
1,100% Corpus,35,0.1,0.1,0.502084,-7.734922,309
2,100% Corpus,50,0.1,0.1,0.499184,-7.715846,340
3,100% Corpus,150,0.1,0.1,0.497913,-7.738688,607
4,100% Corpus,30,0.1,0.1,0.49733,-7.757215,281
5,100% Corpus,125,0.1,0.1,0.495746,-7.719057,538
6,100% Corpus,25,0.1,0.1,0.495724,-7.764011,267
7,100% Corpus,40,0.1,0.1,0.494474,-7.726188,313
8,100% Corpus,60,0.1,0.1,0.493659,-7.712125,407
9,100% Corpus,155,0.1,0.1,0.493393,-7.739873,610


In [15]:
import plotly.graph_objects as go

# Coherence of the best parameter combination as a function of topic count
fig = go.Figure([
    go.Scatter(
        x=best_df['Topics'],
        y=best_df['Coherence'],
        line=dict(color='rgb(0,100,80)'),
        mode='lines'
    )
])

fig.update_layout(
    title_text="Coherence by number of topics",
    xaxis_title="Number of topics", # xaxis label
    yaxis_title="Coherence (c_v)", # yaxis label
    showlegend=False,
    width=600, height=400,
)

# The former extra trace was removed: it read `df.year` (at this point `df`
# is the tuning table, which has no `year` column) and the undefined
# `subset_length`, so the cell failed on a fresh kernel.

fig.show()