# Configuration

In [1]:
import os  # imported here because this cell comes before the Libraries cell

# SQLite databases to process, one per file; each one is queried for its
# `token` and `vocab` tables further down in the notebook.
db_file_names = [
    'db/2701-0.db',
    'db/a_tale_of_two_cities.db',
    'db/anna_karenina.db',
    'db/captains_courageous.db',
    'db/emma.db',
    'db/far_from_madding_crowd.db',
    'db/heart_of_darkness.db',
    'db/jane_eyre.db',
    'db/pride_and_prejudice.db',
    'db/portrait_of_a_lady_vol1.db',
    'db/portrait_of_a_lady_vol2.db',
]

# Size of the vocabulary kept per database (used as the LIMIT of the vocab
# sub-query that ranks terms by tfidf_sum).
max_words = 10000

# For MALLET
num_topics = 20       # passed as --num-topics
num_iters = 1000      # passed as --num-iterations
show_interval = 100   # passed as --show-topics-interval

# Location of the MALLET launcher. Set the MALLET_PATH environment variable to
# run this notebook on another machine; the original author's local install is
# kept as the fallback so existing setups keep working.
mallet_path = os.environ.get(
    'MALLET_PATH',
    '/Users/leonardramsey/Downloads/mallet-2.0.8/bin/mallet',
)

# Libraries

In [17]:
import pandas as pd
import sqlite3, os
import textman as tx

# Process

## Import novel corpus from database

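We use SQL to pull only the tokens we need from each database: those whose terms are among the top `max_words` non-stopword vocabulary terms ranked by `tfidf_sum`, excluding the term "said".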

In [3]:
# Query shared by every database: keep the tokens whose term is one of the
# `max_words` best-ranked (by tfidf_sum) non-stop vocabulary terms, leaving
# out the term 'said'. The final "--" line is an SQL comment, i.e. a
# proper-noun filter that is currently switched off.
sql = """
SELECT * FROM token 
WHERE term_id IN (
    SELECT term_id FROM vocab 
    WHERE stop = 0 
    AND term_str NOT IN ('said')
    ORDER BY tfidf_sum DESC LIMIT {limit}
)
-- AND (pos NOT LIKE 'NNP%')
""".format(limit=max_words)

In [4]:
# Load the filtered tokens of every database into `tokens_list`, in the same
# order as `db_file_names`, printing a preview and the number of distinct
# terms for each one.
tokens_list = []

for db_file_name in db_file_names:
    # sqlite3.connect() silently creates an empty database when the file is
    # missing, which would only surface later as a "no such table" error.
    if not os.path.exists(db_file_name):
        raise FileNotFoundError(db_file_name)

    # A sqlite3 connection used as a context manager only commits or rolls
    # back the transaction; it does not close the connection. Close it
    # explicitly so the notebook does not leak one handle per database.
    db = sqlite3.connect(db_file_name)
    try:
        tokens = pd.read_sql(sql, db)
    finally:
        db.close()

    # fix tokens dataframe: index by chapter / paragraph / sentence number
    tokens = tokens.set_index(['chap_num', 'para_num', 'sent_num'])
    tokens_list.append(tokens)

    print(tokens.head())
    print(len(tokens.term_str.unique()))


                            token_num  pos token_str  punc  num  term_str  \
chap_num para_num sent_num                                                  
0        1        0                 1  VBN  Supplied     0    0  supplied   
                  0                 4   JJ      Late     0    0      late   
                  0                10  NNP    School     0    0    school   
         2        0                 1   NN      pale     0    0      pale   
                  0                 6   NN      coat     0    0      coat   

                            term_id  
chap_num para_num sent_num           
0        1        0           14441  
                  0            8383  
                  0           12718  
         2        0           10359  
                  0            2686  
10000
                            token_num pos token_str  punc  num term_str  \
chap_num para_num sent_num                                                
0        0        0                 0 

## Convert tokens to a corpus for MALLET input

In [5]:
# Build one corpus DataFrame per database, keyed by the database file name.
# Each corpus ends up with a `doc_content` column (renamed from `term_str`)
# and a `doc_label` column derived from chap_num and para_num.
corpora = {}
for db_file_name, tokens in zip(db_file_names, tokens_list):
    # presumably gather_tokens joins the terms of each (chap_num, para_num)
    # group into one document string -- confirm against textman.
    corpus = (
        tx.gather_tokens(tokens, level=2, col='term_str')
        .reset_index()
        .rename(columns={'term_str': 'doc_content'})
    )
    # NOTE(review): the "doyle" prefix is hard-coded for every novel; it is
    # kept unchanged because it is written to the CSV files fed to MALLET.
    corpus['doc_label'] = corpus.apply(
        lambda row: "doyle-{}-{}".format(row.chap_num, row.para_num), axis=1)
    corpora[db_file_name] = corpus

## Dump corpus to CSV file

In [22]:
# Write each corpus to topic_models/<name>/<name>.csv, where <name> is the
# database file name with every 'db', '/' and '.' removed
# (e.g. 'db/emma.db' -> 'emma').
for db_file_name, corpus in corpora.items():
    name = db_file_name.replace('db', '').replace('/', '').replace('.', '')

    # The output directory may not exist yet on a fresh run; without this the
    # to_csv call below fails because pandas does not create parent folders.
    out_dir = 'topic_models/%s' % name
    os.makedirs(out_dir, exist_ok=True)

    corpus[['doc_label', 'doc_content']].to_csv('%s/%s.csv' % (out_dir, name), index=False)

## MALLET Time

### Show MALLET options

In [23]:
# Running the MALLET launcher without a sub-command makes it print the list of
# available commands (see the cell output).
!{mallet_path}

Unrecognized command: 
Mallet 2.0 commands: 

  import-dir         load the contents of a directory into mallet instances (one per file)
  import-file        load a single file into mallet instances (one per line)
  import-svmlight    load SVMLight format data files into Mallet instances
  info               get information about Mallet instances
  train-classifier   train a classifier from Mallet data files
  classify-dir       classify data from a single file with a saved classifier
  classify-file      classify the contents of a directory with a saved classifier
  classify-svmlight  classify data from a single file in SVMLight format
  train-topics       train a topic model from Mallet data files
  infer-topics       use a trained topic model to infer topics for new documents
  evaluate-topics    estimate the probability of new documents under a trained model
  prune              remove features based on frequency or information gain
  split              divide data into testing, tr

### Import Each Corpus and Train

In [25]:
for name in corpora:
#     print(corpora)
    name = name.replace('db', '')
    name = name.replace('/', '')
    name = name.replace('.', '')
    if not os.path.exists('topic_models/%s/' % name):
        os.makedirs('topic_models/%s/' % name)
    
    !{mallet_path} import-file --input topic_models/{name}/{name}.csv --output topic_models/{name}/{name}.mallet --keep-sequence TRUE

    !{mallet_path} train-topics --input topic_models/{name}/{name}.mallet --num-topics {num_topics} --num-iterations {num_iters} \
    --output-doc-topics topic_models/{name}/{name}-doc-topics.txt \
    --output-topic-keys topic_models/{name}/{name}-topic-keys.txt \
    --word-topic-counts-file topic_models/{name}/{name}-word-topic-counts-file.txt \
    --topic-word-weights-file topic_models/{name}/{name}-topic-word-weights-file.txt \
    --xml-topic-report topic_models/{name}/{name}-topic-report.xml \
    --xml-topic-phrase-report topic_models/{name}/{name}-topic-phrase-report.xml \
    --show-topics-interval {show_interval} \
    --use-symmetric-alpha false  \
    --optimize-interval 100 \
    --diagnostics-file topic_models/{name}/{name}-diagnostics.xml


Mallet LDA: 20 topics, 5 topic bits, 11111 topic mask
Data loaded.
max tokens: 214
total tokens: 82537
<10> LL/token: -9.73403
<20> LL/token: -9.51448
<30> LL/token: -9.4179
<40> LL/token: -9.36213
<50> LL/token: -9.29985
<60> LL/token: -9.2815
<70> LL/token: -9.2567
<80> LL/token: -9.23492
<90> LL/token: -9.21433

0	0.25	whales two one whale whole small found oil less taken sperm case story say full dead among human leviathan ago 
1	0.25	yet things though man white thus world even many harpooneers sort last times far might board great pequod called waters 
2	0.25	queequeg bed one poor sir must room get little put way coffin short away tell harpooneer rest harpoon live work 
3	0.25	like white air hand water upon half sun almost boy jaw pip deep beneath till straight aspect visible cry rose 
4	0.25	ship would pequod cabin way went ahab time come captain old crew night one seen though along day lay boats 
5	0.25	sea ship whale water shall always large may perhaps gold seas tell found cou