In [31]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation as LDAmodel
from sklearn.model_selection import KFold, ParameterGrid
import seaborn as sns
import pickle as pkl

In [2]:
# Read the cleaned symptom table (the first CSV column becomes the index), then
# keep only the columns whose names begin with 'Symptom_'.
df = pd.read_csv("../data/cleaned_data_SYMPTOMS_9_13_23.csv", index_col=0)
is_symptom_column = df.columns.str.startswith('Symptom_')
sympdf = df.loc[:, is_symptom_column]

In [44]:
# Hyperparameter search space for the LDA model; each key maps to the list of
# candidate values for that constructor argument.
param_grid = dict(
    n_components=list(range(3, 8)),  # candidate topic counts 3..7
    learning_method=['batch'],
    random_state=[42],               # single fixed seed
    max_iter=[5, 10, 20],
)


In [45]:
# K-fold cross-validated grid search over the LDA hyperparameters in `param_grid`.
# For every hyperparameter combination a model is fit on the training rows of each
# fold and scored by perplexity on the held-out rows; the combination with the
# lowest average held-out perplexity is reported as the best one.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
results = {}  # str(hyperparameters) -> {"average_perplexity", "perplexities"}

for hyperparameters in ParameterGrid(param_grid):
    print(hyperparameters)

    perplexities = []  # held-out perplexity for each fold, in fold order

    for train_index, val_index in kf.split(sympdf):
        train_data = sympdf.iloc[train_index]
        val_data = sympdf.iloc[val_index]

        # Build a fresh estimator for every fold so each fit starts from an
        # unfitted model rather than re-using one instance across folds.
        lda = LDAmodel(**hyperparameters)
        lda.fit(train_data)
        perplexity = lda.perplexity(val_data)
        perplexities.append(perplexity)

    average_perplexity = np.mean(perplexities)

    # Keyed by the string form of the hyperparameter dict so the results can be
    # turned into a DataFrame index / pickled without further conversion.
    results[str(hyperparameters)] = {
        "average_perplexity": average_perplexity,
        "perplexities": perplexities,
    }

# Find the best hyperparameters based on average perplexity (lowest wins).
best_hyperparameters = min(results, key=lambda x: results[x]["average_perplexity"])
best_average_perplexity = results[best_hyperparameters]["average_perplexity"]

print("Best hyperparameters:", best_hyperparameters)
print("Best average perplexity:", best_average_perplexity)

{'learning_method': 'batch', 'max_iter': 5, 'n_components': 3, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 5, 'n_components': 4, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 5, 'n_components': 5, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 5, 'n_components': 6, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 5, 'n_components': 7, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 10, 'n_components': 3, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 10, 'n_components': 4, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 10, 'n_components': 5, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 10, 'n_components': 6, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 10, 'n_components': 7, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 20, 'n_components': 3, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 20, 'n_components': 4, 'random_state': 4

In [46]:
# Persist the `results` dict to disk as a pickle file.
with open('output/lda_results-4.pkl', 'wb') as results_file:
    pkl.dump(results, results_file)

In [29]:
# performed well with a default learning decay,
# NOTE(review): the configuration below is not part of the current param_grid
# (n_components 3-7, no learning_decay entry) — presumably recorded from an
# earlier search; confirm before relying on it.

"{'n_components': 10, 'learning_decay': 0.9}"

In [47]:
# Turn the results dict into a DataFrame: one row per hyperparameter string,
# with columns 'average_perplexity' and 'perplexities'.
# Note: this rebinds `df`, which previously held the table read from the CSV.
df = pd.DataFrame.from_dict(data=results, orient='index')

In [48]:
# Order the configurations from lowest to highest average perplexity.
df = df.sort_values(by='average_perplexity')

In [49]:
df
# takeaway: n_components of 4 is across the board the best performing 
# NOTE(review): in the table rendered below this cell, the n_components=3 rows
# have the lowest average perplexity, so the takeaway above may be stale — confirm.
# What does it mean that I'm getting the same values? 

Unnamed: 0,average_perplexity,perplexities
"{'learning_method': 'batch', 'max_iter': 20, 'n_components': 3, 'random_state': 42}",133.027585,"[132.6451185286299, 132.86677982910615, 133.07..."
"{'learning_method': 'batch', 'max_iter': 10, 'n_components': 3, 'random_state': 42}",133.559102,"[133.20260966265667, 133.45574274326054, 133.7..."
"{'learning_method': 'batch', 'max_iter': 5, 'n_components': 3, 'random_state': 42}",135.459295,"[135.107787124454, 135.47688330452698, 135.886..."
"{'learning_method': 'batch', 'max_iter': 20, 'n_components': 4, 'random_state': 42}",136.031815,"[135.82686061987795, 135.72110708919075, 136.2..."
"{'learning_method': 'batch', 'max_iter': 10, 'n_components': 4, 'random_state': 42}",136.41775,"[136.1859146159476, 136.12856546788757, 136.71..."
"{'learning_method': 'batch', 'max_iter': 5, 'n_components': 4, 'random_state': 42}",137.414989,"[137.1714754909619, 137.19849219990073, 137.70..."
"{'learning_method': 'batch', 'max_iter': 20, 'n_components': 5, 'random_state': 42}",138.621205,"[138.59161110991235, 138.4524391366244, 138.74..."
"{'learning_method': 'batch', 'max_iter': 10, 'n_components': 5, 'random_state': 42}",139.206151,"[139.2100476223345, 138.95417047699462, 139.33..."
"{'learning_method': 'batch', 'max_iter': 5, 'n_components': 5, 'random_state': 42}",140.419156,"[140.3827892418617, 140.11843621605814, 140.47..."
"{'learning_method': 'batch', 'max_iter': 20, 'n_components': 6, 'random_state': 42}",141.379701,"[141.29878215806875, 141.30779888897487, 141.5..."


In [None]:
# typically, you'd expect more topics to mean more model capacity 
# so why does my held-out perplexity get worse (increase) as n_components grows? 
# are some patients having really poor fit? 
# or maybe need to do symptom pruning analogous to stopwords

# symptom absence tells you something about a patient

# think about different prior? 
# 

# doc_topic_prior : float, default=None
# Prior of document topic distribution theta. If the value is None, defaults to 1 / n_components. In [1], this is called alpha.
# dirichlet prior -- flat by default
# if we have reason to believe docs are spiky, you'd want this to be a lower value 
# try running with (0.5,0.5,0.5 , ... ) and look if this is input as a number or a vector

# topic_word_prior : float, default=None
# Prior of topic word distribution beta. If the value is None, defaults to 1 / n_components. In [1], this is called eta.
# leave this -- expectation is mixed
# should probably be 1/n_words (/library size) but look at the implementation bc this seems weird


# possible issues: model is a poor fit for the data (which we know) so perplexity might not be the best readout
# priors might be off
# need to look at the topics themselves and the topic dists for patients and see if they make sense