#  Libraries

In [1]:
import pandas as pd
import sqlite3, os
import textman as tx

# Configuration

In [2]:
# SQLite token databases to model, one file per novel (or volume), all under db/.
db_file_names = [
    'db/{}.db'.format(title)
    for title in (
        'a_tale_of_two_cities',
        'anna_karenina',
        'captains_courageous',
        'emma',
        'far_from_madding_crowd',
        'heart_of_darkness',
        'jane_eyre',
        'pride_and_prejudice',
        'portrait_of_a_lady_vol1',
        'portrait_of_a_lady_vol2',
    )
]

# Upper bound on the number of vocabulary terms kept by the token query.
max_words = 10000

# MALLET settings: topic count, training iterations, and how often topics are printed.
num_topics, num_iters, show_interval = 20, 1000, 100

# MALLET executable: the MALLET_PATH environment variable takes precedence over the local default.
mallet_path = os.getenv('MALLET_PATH', '/Users/leonardramsey/Downloads/mallet-2.0.8/bin/mallet')

# Process

## Import novel corpus from database

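We use SQL to pull only the tokens we need from each database: those whose terms are among the top `max_words` vocabulary entries ranked by summed TF-IDF, after excluding stopwords and the word "said".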

In [3]:
# Query template run against every novel database.
# It keeps only the tokens whose term_id appears in the top `max_words` rows of
# `vocab`, ranked by tfidf_sum (highest first), restricted to rows with stop = 0
# (presumably non-stopwords; confirm against the vocab schema) and excluding 'said'.
# The final "-- AND ..." line is an SQL comment, so the part-of-speech filter it
# mentions is NOT applied.
sql = """
SELECT * FROM token 
WHERE term_id IN (
    SELECT term_id FROM vocab 
    WHERE stop = 0 
    AND term_str NOT IN ('said')
    ORDER BY tfidf_sum DESC LIMIT {}
)
-- AND (pos NOT LIKE 'NNP%')
""".format(max_words)

In [4]:
# Load the filtered token table of every novel into its own DataFrame.
# tokens_list[i] holds the tokens read from db_file_names[i].
tokens_list = []

for db_file in db_file_names:
    # sqlite3's connection context manager only commits or rolls back the
    # transaction; it does not close the connection. Close it explicitly so
    # file handles are not leaked, one per novel.
    db = sqlite3.connect(db_file)
    try:
        tokens = pd.read_sql(sql, db)
    finally:
        db.close()

    # Index by chapter / paragraph / sentence so downstream steps can group on those levels.
    tokens = tokens.set_index(['chap_num','para_num','sent_num'])
    tokens_list.append(tokens)

    # Quick sanity check: a preview of the rows and the number of distinct terms kept.
    print(tokens.head())
    print(len(tokens.term_str.unique()))


                            token_num  pos token_str  punc  num term_str  \
chap_num para_num sent_num                                                 
0        0        0                 0   NN   CHAPTER     0    0  chapter   
                  1                 1   NN    Period     0    0   period   
         1        0                 3  JJS      best     0    0     best   
                  0                 5  NNS     times     0    0    times   
                  0                10  JJS     worst     0    0    worst   

                            term_id  
chap_num para_num sent_num           
0        0        0            1339  
                  1            6121  
         1        0             870  
                  0            8718  
                  0            9641  
9610
                            token_num  pos token_str  punc  num  term_str  \
chap_num para_num sent_num                                                  
0        1        0                 0   JJ

## Convert tokens to a corpus for MALLET input

In [5]:
# Build one MALLET-ready corpus per novel, keyed by the novel's database file name.
# Each corpus ends up with a 'doc_content' column (the gathered term strings) and a
# 'doc_label' column identifying the document by chapter and paragraph number.
corpora = {}
for db_file, tokens in zip(db_file_names, tokens_list):
    # gather_tokens at level=2 presumably yields one document per
    # (chap_num, para_num) pair; confirm against textman.
    corpus = (
        tx.gather_tokens(tokens, level=2, col='term_str')
        .reset_index()
        .rename(columns={'term_str': 'doc_content'})
    )
    # NOTE(review): the "doyle-" prefix is applied to every novel and looks like a
    # leftover from another notebook. It is kept unchanged because later steps may
    # parse these labels.
    corpus['doc_label'] = corpus.apply(
        lambda row: "doyle-{}-{}".format(row.chap_num, row.para_num), axis=1
    )
    corpora[db_file] = corpus

## Dump each corpus to a CSV file

In [6]:
# Write each novel's corpus to topic_models/<name>/<name>.csv, where <name> is the
# database file name without its directory and extension
# (e.g. 'db/emma.db' -> 'emma').
for db_file, corpus in corpora.items():
    # basename + splitext strips only the directory and the extension. Chained
    # str.replace calls would also delete any "db", "/" or "." occurring inside
    # the novel's own name.
    name = os.path.splitext(os.path.basename(db_file))[0]

    # exist_ok avoids the check-then-create race and makes the cell safe to re-run.
    os.makedirs('topic_models/%s/' % name, exist_ok=True)

    # Only the label and text columns are exported.
    corpus[['doc_label','doc_content']].to_csv('topic_models/%s/%s.csv' % (name, name), index=False)

## MALLET Time

### Show MALLET options

In [7]:
# Invoke the MALLET executable with no sub-command: it reports "Unrecognized command"
# and lists the commands it supports, which doubles as a check that mallet_path works.
!{mallet_path}

Unrecognized command: 
Mallet 2.0 commands: 

  import-dir         load the contents of a directory into mallet instances (one per file)
  import-file        load a single file into mallet instances (one per line)
  import-svmlight    load SVMLight format data files into Mallet instances
  info               get information about Mallet instances
  train-classifier   train a classifier from Mallet data files
  classify-dir       classify data from a single file with a saved classifier
  classify-file      classify the contents of a directory with a saved classifier
  classify-svmlight  classify data from a single file in SVMLight format
  train-topics       train a topic model from Mallet data files
  infer-topics       use a trained topic model to infer topics for new documents
  evaluate-topics    estimate the probability of new documents under a trained model
  prune              remove features based on frequency or information gain
  split              divide data into testing, tr

### Import Each Corpus and Train

In [8]:
for name in corpora:
#     print(corpora)
    name = name.replace('db', '')
    name = name.replace('/', '')
    name = name.replace('.', '')
    
    !{mallet_path} import-file --input topic_models/{name}/{name}.csv --output topic_models/{name}/{name}.mallet --keep-sequence TRUE

    !{mallet_path} train-topics --input topic_models/{name}/{name}.mallet --num-topics {num_topics} --num-iterations {num_iters} \
    --output-doc-topics topic_models/{name}/{name}-doc-topics.txt \
    --output-topic-keys topic_models/{name}/{name}-topic-keys.txt \
    --word-topic-counts-file topic_models/{name}/{name}-word-topic-counts-file.txt \
    --topic-word-weights-file topic_models/{name}/{name}-topic-word-weights-file.txt \
    --xml-topic-report topic_models/{name}/{name}-topic-report.xml \
    --xml-topic-phrase-report topic_models/{name}/{name}-topic-phrase-report.xml \
    --show-topics-interval {show_interval} \
    --use-symmetric-alpha false  \
    --optimize-interval 100 \
    --diagnostics-file topic_models/{name}/{name}-diagnostics.xml


Mallet LDA: 20 topics, 5 topic bits, 11111 topic mask
Data loaded.
max tokens: 114
total tokens: 49221
<10> LL/token: -9.77901
<20> LL/token: -9.58422
<30> LL/token: -9.48008
<40> LL/token: -9.44373
<50> LL/token: -9.40442
<60> LL/token: -9.37728
<70> LL/token: -9.35273
<80> LL/token: -9.33954
<90> LL/token: -9.33484

0	0.25	way little day going first bed among one hill man life many home done citizen beyond large living number person 
1	0.25	might old much people made gone usual often times seen cruncher let hour brother law sister strong change felt family 
2	0.25	three one two night long jacques time upon five head hours day quite ago observed twenty within hundred foot four 
3	0.25	could man day good would many never half one father poor street kind heads night live bar evening king morning 
4	0.25	lorry miss pross tellson even gentleman bank much voice asked yet little look took sir though present touch ears floor 
5	0.25	upon young hand faces prisoner arm great lady stone whose c