In [1]:
import gzip
import os
import pandas as pd
import codecs

In [2]:
# Directory, relative to the notebook's working directory, from which the
# MALLET state file ('state.mallet.gz') is loaded by the cells below.
dataDir = './data/'

def extract_params(statefile):
    """Read the alpha and beta hyperparameters from a gzipped MALLET state file.

    The second and third lines of the file are expected to look like
    ``#alpha : a_1 a_2 ... a_K`` and ``#beta : b``. Only the first three
    lines are read, so the (potentially very large) token rows that follow
    are never loaded into memory.

    Args:
        statefile (str) : Path to the gzip-compressed statefile produced by MALLET
    Returns:
        tuple: ``(alpha, beta)``. ``alpha`` is the list of strings obtained by
        splitting the text after the ``:`` of the alpha line on single spaces;
        when that text starts with a space (as in ``#alpha : 10.0 10.0``) the
        first element is an empty string, which callers skip with
        ``alpha[1:]``. ``beta`` is a float.
    Raises:
        IndexError: if the alpha or beta line is missing or contains no ``:``.
        ValueError: if the beta value cannot be parsed as a float.
    """
    with gzip.open(statefile, 'r') as state:
        state.readline()  # header row with the column names -- not needed here
        # gzip yields bytes in 'r' mode, so each line is decoded explicitly.
        alpha_line = state.readline().decode('utf-8').strip()
        beta_line = state.readline().decode('utf-8').strip()

    alpha = alpha_line.split(':')[1].split(" ")
    beta = float(beta_line.split(':')[1])
    return (alpha, beta)

In [3]:
def state_to_df(statefile):
    """Load the token rows of a gzipped MALLET state file into a DataFrame.

    The file is parsed as space-separated text. Its first line supplies the
    column names; the two lines after it (the alpha and beta parameter
    lines) are skipped so that only the per-token rows remain.

    Args:
        statefile (str): Path to the gzip-compressed statefile produced by MALLET
    Returns:
        dataframe: one row per token, giving the topic assignment for each
        token in each document of the model
    """
    # NOTE(review): pandas' default NA handling applies, so literal values such
    # as "NA" are read as NaN.
    read_options = dict(
        compression='gzip',
        sep=' ',
        skiprows=[1, 2],  # the alpha / beta lines directly below the header
    )
    return pd.read_csv(statefile, **read_options)

In [4]:
# Pull the hyperparameters out of the state file. The first alpha entry is the
# empty string left over from splitting on spaces, so it is dropped before
# converting the rest to floats.
params = extract_params(os.path.join(dataDir, 'state.mallet.gz'))
raw_alpha, beta = params
alpha = list(map(float, raw_alpha[1:]))
print("{}, {}".format(alpha, beta))

[10.0, 10.0, 10.0, 10.0, 10.0], 0.01


In [5]:
# Columns: document id, source, word position in the document, word (type)
# index, word (type), topic assignment.
df = state_to_df(os.path.join(dataDir, 'state.mallet.gz'))
# Force the token column to plain strings so every entry can be grouped and
# sorted as text.
df['type'] = df['type'].astype(str)
df.head(10)

Unnamed: 0,#doc,source,pos,typeindex,type,topic
0,0,,0,0,baselin,4
1,0,,1,1,characterist,0
2,1,,0,2,analysi,0
3,1,,1,3,efficaci,0
4,2,,0,4,antigen,1
5,2,,1,5,carbohydr,1
6,2,,2,6,comparison,0
7,2,,3,7,level,1
8,2,,4,8,postop,4
9,2,,5,9,surviv,2


In [6]:
# Preview the last ten token rows.
df.tail(10)

Unnamed: 0,#doc,source,pos,typeindex,type,topic
357767,45350,,2,19,treatment,0
357768,45350,,3,95,patient,4
357769,45350,,4,98,relat,2
357770,45350,,5,359,observ,4
357771,45351,,0,2,analysi,2
357772,45351,,1,93,popul,0
357773,45351,,2,173,activ,0
357774,45351,,3,1534,confirm,2
357775,45351,,4,2203,recist,2
357776,45351,,5,3105,pembrolizumab,2


In [7]:
# Document length = number of tokens in the document: group the token rows by
# document id and count them.
doc_token_counts = df.groupby('#doc')['type'].count()
docs = doc_token_counts.reset_index(name='doc_length')
docs.head(10)

Unnamed: 0,#doc,doc_length
0,0,2
1,1,2
2,2,7
3,3,3
4,4,4
5,5,2
6,6,3
7,7,2
8,8,5
9,9,8


In [8]:
# Vocabulary with corpus-wide term frequencies, ordered alphabetically by term.
term_counts = df['type'].value_counts().reset_index()
term_counts.columns = ['type', 'term_freq']
vocab = term_counts.sort_values('type')
vocab.head(10)

Unnamed: 0,type,term_freq
2306,aaa,12
4866,aac,3
2090,aad,14
4698,aai,3
222,abbrevi,307
2757,abca,9
2414,abcd,11
1945,abciximab,16
1172,abdomin,35
1817,abi,18


In [9]:
# vocab.sort_values(by='term_freq', ascending=False)[:10]

In [10]:
#matrix file
#need to normalize data so that each row sums to 1
import sklearn.preprocessing

def pivot_and_smooth(df, smooth_values, rows_variable, cols_variable, values_variable):
    """
    Pivot a long-format count table into a smoothed, row-normalized matrix.

    Args:
        df (dataframe) : long-format table with one row per (row, column) pair
        smooth_values (float or list) : value(s) added to every row of the
            matrix to account for the priors; a list supplies one value per
            matrix column
        rows_variable (str) : column of ``df`` whose values become the matrix rows
        cols_variable (str) : column of ``df`` whose values become the matrix columns
        values_variable (str) : column of ``df`` holding the cell values
    Returns:
        dataframe : the smoothed matrix, L1-normalized along each row. It is
        rebuilt from a bare array, so it carries default integer row and
        column labels rather than the original pivot labels.
    """
    # (row, column) pairs that are absent from ``df`` count as zero.
    counts = df.pivot(index=rows_variable, columns=cols_variable, values=values_variable)
    smoothed = counts.fillna(value=0).values + smooth_values

    # L1 normalization over axis=1 scales each row by its sum of absolute values.
    row_normalized = sklearn.preprocessing.normalize(smoothed, norm='l1', axis=1)

    return pd.DataFrame(row_normalized)

In [11]:
# Counts behind phi, the topic-term matrix: the number of times each word was
# assigned to each topic. The rows are sorted alphabetically by word to match
# the order of the vocabulary frame.
#
# beta is the smoothing value applied when this table is pivoted into phi.
topic_term_counts = df.groupby(['topic', 'type'])['type'].count()
phi_df = topic_term_counts.reset_index(name='token_count').sort_values('type')
phi_df.head(10)

Unnamed: 0,topic,type,token_count
2930,2,aaa,12
1454,1,aac,3
4210,3,aad,14
4211,3,aai,3
4212,3,abbrevi,307
1455,1,abca,9
1456,1,abcd,3
0,0,abcd,8
1,0,abciximab,16
4213,3,abdomin,35


In [12]:
# phi: one row per topic, one column per word, smoothed with beta.
phi = pivot_and_smooth(
    phi_df,
    smooth_values=beta,
    rows_variable='topic',
    cols_variable='type',
    values_variable='token_count',
)
phi.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5745,5746,5747,5748,5749,5750,5751,5752,5753,5754
0,1.434832e-07,1.434832e-07,1.434832e-07,1.434832e-07,1.434832e-07,1.434832e-07,0.0001149301,0.0002297167,1.434832e-07,1.434832e-07,...,1.434832e-07,1.434832e-07,1.449181e-05,1.449181e-05,1.434832e-07,7.18851e-05,1.434832e-07,1.434832e-07,1.434832e-07,1.434832e-07
1,1.452885e-07,4.373185e-05,1.452885e-07,1.452885e-07,1.452885e-07,0.000130905,4.373185e-05,1.452885e-07,1.452885e-07,1.452885e-07,...,1.452885e-07,4.373185e-05,1.452885e-07,1.452885e-07,5.826071e-05,1.452885e-07,1.452885e-07,4.373185e-05,4.373185e-05,1.452885e-07
2,0.0001627955,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07,0.0002441254,...,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07,1.355499e-07
3,1.303364e-07,1.303364e-07,0.0001826013,3.923125e-05,0.004001457,1.303364e-07,1.303364e-07,1.303364e-07,0.0004563077,1.303364e-07,...,0.0001695676,1.303364e-07,2.619761e-05,1.303364e-07,1.303364e-07,1.303364e-07,1.303364e-07,1.303364e-07,1.303364e-07,0.0003781058
4,1.448361e-07,1.448361e-07,1.448361e-07,1.448361e-07,1.448361e-07,1.448361e-07,1.448361e-07,1.448361e-07,1.448361e-07,1.448361e-07,...,1.448361e-07,1.448361e-07,1.448361e-07,0.0003622351,1.448361e-07,1.448361e-07,4.359567e-05,1.448361e-07,1.448361e-07,1.448361e-07


In [13]:
# Counts behind theta, the document-topic matrix: the number of tokens in each
# document assigned to each topic. alpha is the smoothing value applied when
# this table is pivoted into theta.
doc_topic_counts = df.groupby(['#doc', 'topic'])['topic'].count()
theta_df = doc_topic_counts.reset_index(name='topic_count')
theta_df.head(10)

Unnamed: 0,#doc,topic,topic_count
0,0,0,1
1,0,4,1
2,1,0,2
3,2,0,1
4,2,1,3
5,2,2,1
6,2,4,2
7,3,0,1
8,3,3,2
9,4,0,4


In [14]:
# theta: one row per document, one column per topic, smoothed with alpha.
theta = pivot_and_smooth(
    theta_df,
    smooth_values=alpha,
    rows_variable='#doc',
    cols_variable='topic',
    values_variable='topic_count',
)
theta.head(10)

Unnamed: 0,0,1,2,3,4
0,0.211538,0.192308,0.192308,0.192308,0.211538
1,0.230769,0.192308,0.192308,0.192308,0.192308
2,0.192982,0.22807,0.192982,0.175439,0.210526
3,0.207547,0.188679,0.188679,0.226415,0.188679
4,0.259259,0.185185,0.185185,0.185185,0.185185
5,0.192308,0.211538,0.192308,0.192308,0.211538
6,0.226415,0.188679,0.188679,0.207547,0.188679
7,0.211538,0.211538,0.192308,0.192308,0.192308
8,0.236364,0.181818,0.218182,0.181818,0.181818
9,0.241379,0.224138,0.172414,0.172414,0.189655


In [15]:
import pyLDAvis

pyLDAvis.enable_notebook()

# Bundle the inputs for pyLDAvis.prepare from the frames built in the cells
# above: phi, theta, document lengths, and the vocabulary with its term
# frequencies.
data = dict(
    topic_term_dists=phi,
    doc_topic_dists=theta,
    doc_lengths=list(docs['doc_length']),
    vocab=list(vocab['type']),
    term_frequency=list(vocab['term_freq']),
)
vis_data = pyLDAvis.prepare(**data)
pyLDAvis.display(vis_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
