In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation as LDAmodel
from sklearn.model_selection import KFold, ParameterGrid, GridSearchCV
import seaborn as sns
import pickle as pkl

In [4]:
# Load the cleaned symptom export (the first CSV column becomes the index) and
# keep only the columns whose names begin with 'Symptom_'.
df = pd.read_csv("../data/cleaned_data_SYMPTOMS_9_13_23.csv", index_col=0)
is_symptom_col = df.columns.str.startswith('Symptom_')
sympdf = df.loc[:, is_symptom_col]

In [44]:
# Hyperparameter grid for the manual cross-validation sweep: the number of topics
# and the iteration budget vary; learning method and seed are held fixed.
param_grid = dict(
    n_components=list(range(3, 8)),
    learning_method=['batch'],
    random_state=[42],
    max_iter=[5, 10, 20],
)


In [45]:
# Manual K-fold sweep over `param_grid`: for every hyperparameter combination,
# fit LDA on each training split and record the perplexity on the held-out split
# (lower perplexity is better).
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)

# Maps str(hyperparameters) -> {"average_perplexity": float, "perplexities": [per-fold values]}.
# The dict of hyperparameters is stringified because a dict cannot be used as a key.
results = {}

for hyperparameters in ParameterGrid(param_grid):
    print(hyperparameters)
    # One estimator per combination, refit on every fold.
    # NOTE(review): this relies on fit() starting from scratch on each call
    # (as opposed to partial_fit) — confirm against the sklearn docs.
    lda = LDAmodel(**hyperparameters)

    perplexities = []  # held-out perplexity for each fold, in split order

    for train_index, val_index in kf.split(sympdf):
        train_data = sympdf.iloc[train_index]
        val_data = sympdf.iloc[val_index]

        lda.fit(train_data)
        perplexity = lda.perplexity(val_data)
        perplexities.append(perplexity)

    average_perplexity = np.mean(perplexities)

    # Store the results
    results[str(hyperparameters)] = {
        "average_perplexity": average_perplexity,
        "perplexities": perplexities
    }

# Find the best hyperparameters based on average perplexity (minimum wins)
best_hyperparameters = min(results, key=lambda x: results[x]["average_perplexity"])
best_average_perplexity = results[best_hyperparameters]["average_perplexity"]

print("Best hyperparameters:", best_hyperparameters)
print("Best average perplexity:", best_average_perplexity)

{'learning_method': 'batch', 'max_iter': 5, 'n_components': 3, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 5, 'n_components': 4, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 5, 'n_components': 5, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 5, 'n_components': 6, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 5, 'n_components': 7, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 10, 'n_components': 3, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 10, 'n_components': 4, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 10, 'n_components': 5, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 10, 'n_components': 6, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 10, 'n_components': 7, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 20, 'n_components': 3, 'random_state': 42}
{'learning_method': 'batch', 'max_iter': 20, 'n_components': 4, 'random_state': 4

In [46]:
# Persist the cross-validation results dict as a pickle for later inspection.
results_path = 'output/lda_results-4.pkl'
with open(results_path, mode='wb') as out_file:
    pkl.dump(results, out_file)

In [29]:
# Note from an earlier run: this configuration performed well with a default learning decay.
# The bare string below is only echoed as the cell's output; it does not configure any model.
# NOTE(review): sklearn documents the default learning_decay as 0.7, so 0.9 may not be "default" — confirm.

"{'n_components': 10, 'learning_decay': 0.9}"

In [47]:
# Convert the per-configuration results dict into a dataframe: one row per
# hyperparameter string, with the columns 'average_perplexity' and 'perplexities'.
# NOTE: this rebinds `df`, which previously held the raw CSV loaded at the top of
# the notebook; re-run the load cell if the raw data is needed again.
df = pd.DataFrame.from_dict(results, orient='index')

In [48]:
# Rank configurations from lowest (best) to highest average held-out perplexity.
# Reassigning instead of sorting in place avoids mutating a frame other cells may hold.
df = df.sort_values(by='average_perplexity')

In [49]:
df
# Original takeaway: n_components of 4 is across the board the best performing.
# NOTE(review): the table displayed for this cell ranks n_components=3 lowest (best) in
# average perplexity at every max_iter, with 4 second — the takeaway above looks stale.
# Open question: what does it mean that I'm getting the same values?

Unnamed: 0,average_perplexity,perplexities
"{'learning_method': 'batch', 'max_iter': 20, 'n_components': 3, 'random_state': 42}",133.027585,"[132.6451185286299, 132.86677982910615, 133.07..."
"{'learning_method': 'batch', 'max_iter': 10, 'n_components': 3, 'random_state': 42}",133.559102,"[133.20260966265667, 133.45574274326054, 133.7..."
"{'learning_method': 'batch', 'max_iter': 5, 'n_components': 3, 'random_state': 42}",135.459295,"[135.107787124454, 135.47688330452698, 135.886..."
"{'learning_method': 'batch', 'max_iter': 20, 'n_components': 4, 'random_state': 42}",136.031815,"[135.82686061987795, 135.72110708919075, 136.2..."
"{'learning_method': 'batch', 'max_iter': 10, 'n_components': 4, 'random_state': 42}",136.41775,"[136.1859146159476, 136.12856546788757, 136.71..."
"{'learning_method': 'batch', 'max_iter': 5, 'n_components': 4, 'random_state': 42}",137.414989,"[137.1714754909619, 137.19849219990073, 137.70..."
"{'learning_method': 'batch', 'max_iter': 20, 'n_components': 5, 'random_state': 42}",138.621205,"[138.59161110991235, 138.4524391366244, 138.74..."
"{'learning_method': 'batch', 'max_iter': 10, 'n_components': 5, 'random_state': 42}",139.206151,"[139.2100476223345, 138.95417047699462, 139.33..."
"{'learning_method': 'batch', 'max_iter': 5, 'n_components': 5, 'random_state': 42}",140.419156,"[140.3827892418617, 140.11843621605814, 140.47..."
"{'learning_method': 'batch', 'max_iter': 20, 'n_components': 6, 'random_state': 42}",141.379701,"[141.29878215806875, 141.30779888897487, 141.5..."


In [None]:
# Typically you'd expect more topics to mean more model capacity,
# so why does held-out perplexity get worse (increase) as n_components grows?
# Are some patients being fit really poorly?
# Or maybe we need symptom pruning, analogous to removing stopwords.

# symptom absence tells you something about a patient

# think about different prior? 
# 

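# doc_topic_prior : float, default=None
# Prior of document topic distribution theta. If the value is None, defaults to 1 / n_components. In reference [1] of the sklearn docs, this is called alpha.
# Symmetric Dirichlet prior. Note the default 1/n_components is below 1, which already favours sparse mixtures rather than a flat (alpha = 1) prior.
# If we have reason to believe docs are spiky, you'd want this to be a lower value.
# Try running with (0.5, 0.5, 0.5, ...) and check whether this is input as a number or a vector (the documented type is float).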

# topic_word_prior : float, default=None
# Prior of topic word distribution beta. If the value is None, defaults to 1 / n_components. In reference [1] of the sklearn docs, this is called eta.
# Leave this -- expectation is mixed.
# Should probably be 1/n_words (/library size), but look at the implementation because this seems weird.


# possible issues: model is a poor fit for the data (which we know) so perplexity might not be the best readout
# priors might be off
# need to look at the topics themselves and the topic dists for patients and see if they make sense

In [37]:
# Larger sweep: topic counts 3-9 and iteration budgets 60-180 in steps of 30,
# with the learning method and seed held fixed. This rebinds `param_grid`.
param_grid = dict(
    n_components=list(range(3, 10)),
    learning_method=['batch'],
    random_state=[42],
    max_iter=list(range(60, 181, 30)),
)

# no gaussians or whatever, so n_steps can just be 1

# Unconfigured estimator; the grid supplies every hyperparameter that is varied.
lda = LDAmodel()

# Exhaustive search using GridSearchCV's defaults (no explicit scoring or cv is given here).
model = GridSearchCV(estimator=lda, param_grid=param_grid)

# Run the search on the symptom matrix.
model.fit(sympdf)

In [38]:
# Show the hyperparameter combination the grid search selected as best.
model.best_params_

{'learning_method': 'batch',
 'max_iter': 180,
 'n_components': 3,
 'random_state': 42}

In [39]:
# Tabulate the grid-search cross-validation results and save a copy to disk.
cv_results = model.cv_results_
gsres = pd.DataFrame(cv_results)
gsres.to_csv("output/lda-gridsearch-1.csv")

In [40]:
# Display the cross-validation results table (one row per hyperparameter combination).
gsres

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_method,param_max_iter,param_n_components,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,56.834458,3.426613,0.203918,0.01707,batch,60,3,42,"{'learning_method': 'batch', 'max_iter': 60, '...",-278064.688238,-263563.143617,-269825.635942,-254873.427444,-262050.717538,-265675.522556,7810.268656,5
1,46.614303,0.287657,0.188179,0.003142,batch,60,4,42,"{'learning_method': 'batch', 'max_iter': 60, '...",-279454.237307,-264621.759329,-271221.087701,-256152.413035,-263306.43718,-266951.18691,7872.512593,10
2,49.811641,0.454723,0.204991,0.007013,batch,60,5,42,"{'learning_method': 'batch', 'max_iter': 60, '...",-280345.537064,-265642.321164,-272024.938768,-257078.284026,-264089.034515,-267836.023108,7856.413377,15
3,53.484045,0.619138,0.217031,0.005289,batch,60,6,42,"{'learning_method': 'batch', 'max_iter': 60, '...",-281480.386541,-266513.915076,-273048.447951,-258020.87138,-265007.483017,-268814.220793,7932.710568,20
4,56.607311,1.419578,0.227927,0.006732,batch,60,7,42,"{'learning_method': 'batch', 'max_iter': 60, '...",-282504.818715,-267428.390129,-274116.99377,-258884.634934,-265804.871526,-269747.941815,8009.570312,25
5,52.856849,0.99055,0.217738,0.005796,batch,60,8,42,"{'learning_method': 'batch', 'max_iter': 60, '...",-282968.16561,-267969.425818,-274639.407581,-259494.89162,-266326.010214,-270279.580169,7965.994366,32
6,52.859523,0.806215,0.219785,0.004376,batch,60,9,42,"{'learning_method': 'batch', 'max_iter': 60, '...",-283250.836715,-268692.7304,-275179.575197,-260039.540028,-266539.120295,-270740.360527,7906.572068,35
7,78.401409,4.802406,0.192181,0.005294,batch,90,3,42,"{'learning_method': 'batch', 'max_iter': 90, '...",-278044.717242,-263376.803991,-269813.03396,-254863.638528,-262000.595605,-265619.757865,7820.31279,4
8,67.508,0.734093,0.189184,0.004463,batch,90,4,42,"{'learning_method': 'batch', 'max_iter': 90, '...",-279397.988889,-264575.959633,-271195.551267,-256069.596092,-263276.307746,-266903.080726,7880.119257,9
9,72.234653,0.949666,0.203363,0.011015,batch,90,5,42,"{'learning_method': 'batch', 'max_iter': 90, '...",-280273.316614,-265489.599276,-271898.68086,-256951.6033,-263999.304382,-267722.500887,7871.76894,14


In [34]:
# Keep only the swept parameters and the summary score columns of gsres.
keepcols = [
    'param_max_iter',
    'param_n_components',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]
gsres = gsres.loc[:, keepcols]

In [35]:
# Show configurations ordered from best rank (1) downwards.
gsres.sort_values(by='rank_test_score')

Unnamed: 0,param_max_iter,param_n_components,mean_test_score,std_test_score,rank_test_score
21,120,3,-265597.318859,7826.935919,1
14,90,3,-265619.757865,7820.31279,2
7,60,3,-265675.522556,7810.268656,3
0,30,3,-265755.828645,7813.489646,4
22,120,4,-266858.586837,7889.993107,5
15,90,4,-266903.080726,7880.119257,6
8,60,4,-266951.18691,7872.512593,7
1,30,4,-267015.841264,7868.249391,8
23,120,5,-267625.764313,7879.793143,9
16,90,5,-267722.500887,7871.76894,10
