# Topic Modelling with MALLET Visualized with pyLDAvis

### Running MALLET on the command line
https://programminghistorian.org/en/lessons/topic-modeling-and-mallet

### Visualizing topics in Python with pyLDAvis
http://jeriwieringa.com/2018/07/17/pyLDAviz-and-Mallet/#topic=0&lambda=1&term=

## Full Corpus Topic Model

In [1]:
import gzip
import os
import pandas as pd

# Directory containing the gzipped MALLET state files loaded by the cells below.
dataDir = "data/"

def extract_params(statefile):
    """Extract the alpha and beta values from a MALLET state file.

    Only the first three lines of the gzipped file are read: the first line
    is skipped and the next two are parsed, so the remainder of the state
    file is never decompressed into memory.

    Args:
        statefile (str): Path to the gzipped state file produced by MALLET.
    Returns:
        tuple: ``(alpha, beta)``. ``alpha`` is the list of strings obtained by
            splitting the field after the first ``:`` on the second line on
            single spaces; if that field starts with a space the first
            element is an empty string, which callers must skip. ``beta`` is
            the field after the first ``:`` on the third line as a float.
    Raises:
        IndexError: If the second or third line is missing or has no ``:``.
        ValueError: If the beta field cannot be converted to a float.
    """
    with gzip.open(statefile, 'rb') as state:
        state.readline()  # first line is not a parameter line
        alpha_line = state.readline().decode('utf8').strip()
        beta_line = state.readline().decode('utf8').strip()

    alpha = alpha_line.split(":")[1].split(" ")
    beta = float(beta_line.split(":")[1])
    return (alpha, beta)


def state_to_df(statefile):
    """Load a MALLET state file into a pandas dataframe.

    The file is read as gzip-compressed text with fields separated by a
    single space. The first line supplies the column names; the two lines
    after it (the ones ``extract_params`` parses for the alpha and beta
    hyperparameters) are skipped.

    pandas' default missing-value detection is switched off so that tokens
    such as "null", "nan" or "NA" stay ordinary strings instead of being
    converted to NaN and later merged into a single "nan" entry.

    Args:
        statefile (str): Path to the gzipped state file produced by MALLET.
    Returns:
        dataframe: one row per remaining line of the state file.
    """
    return pd.read_csv(statefile,
                       compression='gzip',
                       sep=' ',
                       skiprows=[1, 2],
                       keep_default_na=False,
                       )

In [2]:
# Read the alpha and beta hyperparameters from the full-corpus state file.
params = extract_params(os.path.join(dataDir, 'clean.gz'))

In [3]:
# Skip the first element of the raw alpha list and convert the rest to floats;
# beta is already a float.
alpha = list(map(float, params[0][1:]))
beta = params[1]
print("{}, {}".format(alpha, beta))

[0.19020829008743637, 0.48544349912449664, 0.7288929692655277, 0.3057105675014934, 0.8453269903323324, 0.16933052326612533, 0.15284865377341036, 0.3544607361087922, 0.5580985096848134, 0.9863480290849803, 0.12316624542703536, 0.1254791977898903, 0.10576166660479214, 0.33090439885487255, 0.014224645719695202, 0.7312566493209688, 0.01562883414567754, 0.2849514328057798, 1.2519412013243623, 0.16494093841631616], 0.02513430469947828


In [4]:
# Load the full-corpus state file into a dataframe.
df = state_to_df(os.path.join(dataDir, 'clean.gz'))

In [5]:
# Cast the token column to str, then preview the first ten rows.
df['type'] = df['type'].astype(str)
df.head(10)

Unnamed: 0,#doc,source,pos,typeindex,type,topic
0,0,sample-data\Clean1\Berkeley_AlciphronCLEAN.txt,0,0,alciphron,10
1,0,sample-data\Clean1\Berkeley_AlciphronCLEAN.txt,1,1,minute,10
2,0,sample-data\Clean1\Berkeley_AlciphronCLEAN.txt,2,2,philosopher,10
3,0,sample-data\Clean1\Berkeley_AlciphronCLEAN.txt,3,3,defence,4
4,0,sample-data\Clean1\Berkeley_AlciphronCLEAN.txt,4,4,christian,10
5,0,sample-data\Clean1\Berkeley_AlciphronCLEAN.txt,5,5,religion,10
6,0,sample-data\Clean1\Berkeley_AlciphronCLEAN.txt,6,6,socalled,10
7,0,sample-data\Clean1\Berkeley_AlciphronCLEAN.txt,7,7,freethinkers,10
8,0,sample-data\Clean1\Berkeley_AlciphronCLEAN.txt,8,8,george,8
9,0,sample-data\Clean1\Berkeley_AlciphronCLEAN.txt,9,9,berkeley,10


In [6]:
# Document lengths: number of non-null 'type' entries per '#doc' in the state data.
docs = (
    df.groupby('#doc')['type']
    .count()
    .reset_index(name='doc_length')
)

docs.head(10)

Unnamed: 0,#doc,doc_length
0,0,32214
1,1,12267
2,2,9708
3,3,11983
4,4,94752
5,5,19164
6,6,14450
7,7,20450
8,8,94628
9,9,34682


In [7]:
# Vocabulary and term frequencies: count occurrences of each token in the
# state data, then order the table alphabetically by token.
vocab = (
    df['type']
    .value_counts()
    .reset_index()
)
vocab.columns = ['type', 'term_freq']
vocab = vocab.sort_values(by='type', ascending=True)

vocab.head(10)

Unnamed: 0,type,term_freq
17525,aall,1
19797,aan,1
12711,aand,2
6596,aaron,5
21241,aarons,1
14618,aathe,1
21125,abalienatione,1
3521,abandon,15
16241,abandond,1
3505,abandoned,15


In [8]:
# Topic-term matrix from state file
# https://ldavis.cpsievert.me/reviews/reviews.html

import sklearn.preprocessing

def pivot_and_smooth(df, smooth_value, rows_variable, cols_variable, values_variable):
    """Pivot an aggregated dataframe into a smoothed, row-normalised matrix.

    Args:
        df (dataframe): aggregated dataframe
        smooth_value (float): value added to every cell of the matrix to account for the priors
        rows_variable (str): name of the dataframe column whose values become the matrix rows
        cols_variable (str): name of the dataframe column whose values become the matrix columns
        values_variable (str): name of the dataframe column that supplies the cell values
    Returns:
        dataframe: the L1 row-normalised matrix wrapped in a new dataframe
            (the pivot's row and column labels are not carried over).
    """
    # Row/column combinations absent from ``df`` come out of the pivot as NaN;
    # treat them as zero.
    wide = df.pivot(index=rows_variable, columns=cols_variable, values=values_variable)
    counts = wide.fillna(value=0).values

    # Add the prior, then rescale each row so its absolute values sum to 1.
    smoothed = counts + smooth_value
    row_normalised = sklearn.preprocessing.normalize(smoothed, norm='l1', axis=1)

    return pd.DataFrame(row_normalised)

In [9]:
# Count the rows for every (topic, token) pair, ordered alphabetically by token.
phi_df = (
    df.groupby(['topic', 'type'])['type']
    .count()
    .reset_index(name='token_count')
    .sort_values(by='type', ascending=True)
)

phi_df.head(10)

Unnamed: 0,topic,type,token_count
10534,6,aall,1
25873,14,aan,1
0,0,aand,1
25874,14,aand,1
21724,12,aaron,5
3364,2,aarons,1
12294,7,aathe,1
21725,12,abalienatione,1
6402,4,abandon,13
15957,9,abandon,2


In [10]:
# Topic-term matrix: topics as rows, tokens as columns, smoothed with beta.
phi = pivot_and_smooth(
    phi_df,
    smooth_value=beta,
    rows_variable='topic',
    cols_variable='type',
    values_variable='token_count',
)

# phi[:10]

In [11]:
# Count the rows for every (document, topic) pair.
theta_df = (
    df.groupby(['#doc', 'topic'])['topic']
    .count()
    .reset_index(name='topic_count')
)

theta_df.head(10)

Unnamed: 0,#doc,topic,topic_count
0,0,1,951
1,0,2,4363
2,0,3,412
3,0,4,2248
4,0,5,861
5,0,6,253
6,0,7,2806
7,0,8,579
8,0,9,1209
9,0,10,10967


In [12]:
# Document-topic matrix: documents as rows, topics as columns, smoothed with alpha.
theta = pivot_and_smooth(
    theta_df,
    smooth_value=alpha,
    rows_variable='#doc',
    cols_variable='topic',
    values_variable='topic_count',
)

# theta[:10]

In [13]:
import pyLDAvis

# Collect the inputs under the keyword names they are passed to
# pyLDAvis.prepare with in the next cell.
data = dict(
    topic_term_dists=phi,
    doc_topic_dists=theta,
    doc_lengths=list(docs['doc_length']),
    vocab=list(vocab['type']),
    term_frequency=list(vocab['term_freq']),
)

In [14]:
# Build the visualisation data from the entries of `data`, passed as keyword arguments.
vis_data = pyLDAvis.prepare(**data)

In [None]:
# Turn on inline notebook rendering, then show the prepared visualisation.
# enable_notebook() does not take the prepared data: its first positional
# parameter is the `local` flag, so the data is passed to display() only.
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)

## Epistemic Projects Topic Model

In [16]:
# Read the alpha and beta hyperparameters from the epistemic-projects state file.
params = extract_params(os.path.join(dataDir, 'epistem.gz'))

In [17]:
# Skip the first element of the raw alpha list and convert the rest to floats;
# beta is already a float.
alpha = list(map(float, params[0][1:]))
beta = params[1]
print("{}, {}".format(alpha, beta))

[0.09952595329989007, 4.174668390934641, 0.2711753175000779, 0.5026666974017482, 0.3350615371626598, 2.012307267229494, 0.12149092665099705, 4.867271862616894, 0.1109018922493752, 1.6869880781263402, 1.5107834696350717, 0.4199381178529643, 3.7719970997199477, 0.09949752848038032, 1.842052656376951, 0.16489968150643655, 0.7629578962571578, 0.1155108281805849, 0.4653520264476048, 2.0159298389613434], 0.034278656769356965


In [18]:
# Load the epistemic-projects state file into a dataframe (replaces the earlier `df`).
df = state_to_df(os.path.join(dataDir, 'epistem.gz'))

In [19]:
# Cast the token column to str, then preview the first ten rows.
df['type'] = df['type'].astype(str)
df.head(10)

Unnamed: 0,#doc,source,pos,typeindex,type,topic
0,0,sample-data\epistem\Berkeley_HumanKnowledgeCLE...,0,0,treatise,19
1,0,sample-data\epistem\Berkeley_HumanKnowledgeCLE...,1,1,principles,1
2,0,sample-data\epistem\Berkeley_HumanKnowledgeCLE...,2,2,human,14
3,0,sample-data\epistem\Berkeley_HumanKnowledgeCLE...,3,3,knowledge,12
4,0,sample-data\epistem\Berkeley_HumanKnowledgeCLE...,4,4,chief,3
5,0,sample-data\epistem\Berkeley_HumanKnowledgeCLE...,5,5,error,19
6,0,sample-data\epistem\Berkeley_HumanKnowledgeCLE...,6,6,difficulty,10
7,0,sample-data\epistem\Berkeley_HumanKnowledgeCLE...,7,7,sciences,1
8,0,sample-data\epistem\Berkeley_HumanKnowledgeCLE...,8,8,grounds,7
9,0,sample-data\epistem\Berkeley_HumanKnowledgeCLE...,9,9,scepticism,10


In [20]:
# Document lengths: number of non-null 'type' entries per '#doc' in the state data.
docs = (
    df.groupby('#doc')['type']
    .count()
    .reset_index(name='doc_length')
)

docs.head(10)

Unnamed: 0,#doc,doc_length
0,0,12267
1,1,19164
2,2,94628


In [21]:
# Vocabulary and term frequencies: count occurrences of each token in the
# state data, then order the table alphabetically by token.
vocab = (
    df['type']
    .value_counts()
    .reset_index()
)
vocab.columns = ['type', 'term_freq']
vocab = vocab.sort_values(by='type', ascending=True)

vocab.head(10)

Unnamed: 0,type,term_freq
9762,abandon,1
7418,abandoned,1
4587,abate,2
7242,abated,1
5352,abatement,2
4501,abbe,2
5729,abbot,2
3554,abhorrence,4
2118,abilities,9
1224,ability,18


In [22]:
# Count the rows for every (topic, token) pair, ordered alphabetically by token.
phi_df = (
    df.groupby(['topic', 'type'])['type']
    .count()
    .reset_index(name='token_count')
    .sort_values(by='type', ascending=True)
)

phi_df.head(10)

Unnamed: 0,topic,type,token_count
754,1,abandon,1
8700,11,abandoned,1
7867,10,abate,1
11123,14,abate,1
14748,19,abated,1
0,0,abatement,2
6951,9,abbe,2
3929,5,abbot,2
6496,8,abhorrence,3
12155,15,abhorrence,1


In [23]:
# Topic-term matrix: topics as rows, tokens as columns, smoothed with beta.
phi = pivot_and_smooth(
    phi_df,
    smooth_value=beta,
    rows_variable='topic',
    cols_variable='type',
    values_variable='token_count',
)

# phi[:10]

In [24]:
# Count the rows for every (document, topic) pair.
theta_df = (
    df.groupby(['#doc', 'topic'])['topic']
    .count()
    .reset_index(name='topic_count')
)

theta_df.head(10)

Unnamed: 0,#doc,topic,topic_count
0,0,1,2536
1,0,2,1
2,0,3,2031
3,0,5,349
4,0,7,2530
5,0,8,1
6,0,9,163
7,0,10,547
8,0,11,49
9,0,12,849


In [25]:
# Document-topic matrix: documents as rows, topics as columns, smoothed with alpha.
theta = pivot_and_smooth(
    theta_df,
    smooth_value=alpha,
    rows_variable='#doc',
    cols_variable='topic',
    values_variable='topic_count',
)

# theta[:10]

In [26]:
# Collect the inputs under the keyword names they are passed to
# pyLDAvis.prepare with in the next cell.
data = dict(
    topic_term_dists=phi,
    doc_topic_dists=theta,
    doc_lengths=list(docs['doc_length']),
    vocab=list(vocab['type']),
    term_frequency=list(vocab['term_freq']),
)

In [27]:
# Build the visualisation data from the entries of `data`, passed as keyword arguments.
vis_data = pyLDAvis.prepare(**data)

In [28]:
# Show the prepared visualisation for the epistemic-projects model.
pyLDAvis.display(vis_data)