# Synopsis

Create an LDA of 20news corpus using MALLET.

# Configuration

In [1]:
src_file = '20news_01.csv'

In [25]:
num_topics = 15
num_iters = 1000
show_interval = 100

# Libraries

In [3]:
import pandas as pd
import numpy as np
import sqlite3
import re
import random
import textman as tx
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# Pragmas

In [4]:
%matplotlib inline

# Process

## Get corpus

In [5]:
docs = pd.read_csv(src_file, sep='\t')
docs = docs.set_index('doc_id')

## Convert corpus to tokens and vocab

We use a function from TextMan, a bespoke library that incorporates the text processing routines used in earlier notebooks.

In [7]:
tokens, vocab = tx.create_tokens_and_vocab(docs, src_col='doc_content')
tokens['token_num'] = tokens.groupby(['doc_id']).cumcount()
tokens = tokens.reset_index()[['doc_id','token_num','term_id']]
tokens = tokens[tokens.term_id.isin(vocab[vocab.go].index)]
tokens = tokens.set_index(['doc_id','token_num'])

### Add term strings

In [8]:
tokens['term_str'] = tokens.term_id.map(vocab.term)

In [9]:
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,term_id,term_str
doc_id,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1
76209,0,4557,people
76209,1,5848,sure
76209,2,4713,posts
76209,3,2671,forwarded
76209,4,5882,system


## Remove insignificant words

We use SKlearn's TFIDF vectorizor to quicky get a TFIDF vector space, which we use only to filter the words in our corpus.

In [10]:
vectorizer = TfidfVectorizer(use_idf=1, stop_words='english', token_pattern=r'[A-Za-z][A-Za-z][A-Za-z]+')
X = vectorizer.fit_transform(docs.doc_content.values.tolist())
v = pd.DataFrame(vectorizer.get_feature_names(), columns=['term_str'])
v['idf'] = vectorizer.idf_

In [11]:
v.sort_values('idf', ascending=False).head(10)

Unnamed: 0,term_str,idf
0,aaa,4.921973
3491,nicely,4.921973
3488,nhlpa,4.921973
3486,nga,4.921973
3484,newswriter,4.921973
3483,newsweek,4.921973
3481,newspaper,4.921973
3480,newsgroups,4.921973
3478,newsbytes,4.921973
3476,newly,4.921973


### Take only the most significant words

In [27]:
cutoff = 4.5
v = v[v.idf > cutoff].sort_values('idf', ascending=False).sample(1000)
my_v = v.term_str.tolist()

In [28]:
tokens = tokens[tokens.term_str.isin(my_v)]

In [30]:
# vocab = vocab[vocab.term.isin(my_v)]

## Export corpus for MALLET 

In [31]:
corpus = tx.gather_tokens(tokens, level=0, col='term_str')\
    .reset_index().rename(columns={'term_str':'doc_content'})

In [32]:
corpus.head()

Unnamed: 0,doc_id,doc_content
0,20567,hell hell hell hell atheists hell atheists abu...
1,20758,comments apologists theology theology approach...
2,20859,esther female lunch lunch lunch lunch sub piec...
3,20888,belief belief belief belief religion religion ...
4,20910,goer goer goer goer


In [33]:
corpus.to_csv('20news-corpus.csv', index=False)

In [34]:
!mallet

Unrecognized command: 
Mallet 2.0 commands: 

  import-dir         load the contents of a directory into mallet instances (one per file)
  import-file        load a single file into mallet instances (one per line)
  import-svmlight    load SVMLight format data files into Mallet instances
  info               get information about Mallet instances
  train-classifier   train a classifier from Mallet data files
  classify-dir       classify data from a single file with a saved classifier
  classify-file      classify the contents of a directory with a saved classifier
  classify-svmlight  classify data from a single file in SVMLight format
  train-topics       train a topic model from Mallet data files
  infer-topics       use a trained topic model to infer topics for new documents
  evaluate-topics    estimate the probability of new documents under a trained model
  prune              remove features based on frequency or information gain
  split              divide data into testing, tr

In [35]:
!mallet import-file --input 20news-corpus.csv --output 20news-corpus.mallet --keep-sequence TRUE

In [36]:
!mallet train-topics --input 20news-corpus.mallet --num-topics {num_topics} --num-iterations {num_iters} \
--output-doc-topics 20news-doc-topics.txt \
--output-topic-keys 20news-topic-keys.txt \
--word-topic-counts-file 20news-word-topic-counts-file.txt \
--topic-word-weights-file 20news-topic-word-weights-file.txt \
--xml-topic-report 20news-topic-report.xml \
--xml-topic-phrase-report 20news-topic-phrase-report.xml \
--show-topics-interval {show_interval} \
--use-symmetric-alpha false  \
--optimize-interval 100 \
--diagnostics-file 20news-diagnostics.xml
# LL = log-likelihood

Mallet LDA: 15 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 68
total tokens: 429
<10> LL/token: -4.73179
<20> LL/token: -4.49812
<30> LL/token: -4.43303
<40> LL/token: -4.36491
<50> LL/token: -4.36546
<60> LL/token: -4.31527
<70> LL/token: -4.22946
<80> LL/token: -4.26502
<90> LL/token: -4.22581

0	0.33333	ajteel barb sysadmin eclipse ida requested interesting alternative 
1	0.33333	bryn vilka den det 
2	0.33333	lunch processor runner centris conservative female survey peace 
3	0.33333	concept supernatural plurality theology dealer comments abuse 
4	0.33333	yalanci vay org reference columbia dbd hovig peace criminals 
5	0.33333	belief universe boone straight sox asks goer 
6	0.33333	bank palestinian meaningless umd disk 
7	0.33333	international molestation courier modems overwhelming azerbaijan magazine iii comments 
8	0.33333	mon teel boulder newsgroups overwhelming pouring 
9	0.33333	pray piece sub esther malaysia dealer 
10	0.33333	min det tor cgy powers floppies 


In [21]:
20

20