# Configuration

In [1]:
# Relative path to the SQLite corpus database (lives in the Lab07 directory).
corpus_db = '../2019-02-28_Lab07/novels.db'

# Upper bound on the number of vocabulary terms kept by the SQL query.
max_words = 10000

# MALLET settings, in order: --num-topics, --num-iterations,
# --show-topics-interval.
num_topics, num_iters, show_interval = 20, 1000, 100

# Libraries

In [2]:
import pandas as pd
import sqlite3
import textman as tx

# Process

## Import novel corpus from database

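We use SQL to pull only the tokens we need: those whose term is among the top `max_words` non-stopword vocabulary terms ranked by summed TF-IDF, excluding proper nouns.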

In [3]:
# Query template. The single `{}` placeholder receives the vocabulary size
# limit; the inner SELECT picks the highest-`tfidf_sum` non-stop terms
# (minus 'said'), and the outer filter drops tokens tagged as proper nouns.
sql_template = """
SELECT * FROM token 
WHERE term_id IN (
    SELECT term_id FROM vocab 
    WHERE stop = 0 
    AND term_str NOT IN ('said')
    ORDER BY tfidf_sum DESC LIMIT {}
)
-- AND (author = 'poe' OR author = 'austen') 
AND (pos NOT LIKE 'NNP%')
"""

# Fill in the limit from the configuration cell.
sql = sql_template.format(max_words)

In [4]:
# Load the selected tokens into a DataFrame.
# A sqlite3 connection used as a context manager only commits or rolls back
# the transaction; it does NOT close the connection. Close it explicitly so
# re-running this cell does not leak open database handles.
db = sqlite3.connect(corpus_db)
try:
    tokens = pd.read_sql(sql, db)
finally:
    db.close()

OperationalError: unable to open database file

## Fix tokens dataframe

In [None]:
# Move the author/book/chapter columns into a three-level row index.
# Note: this rebinds `tokens`, so the cell cannot be re-run without first
# re-running the load cell.
tokens = tokens.set_index(keys=['author', 'book', 'chapter'])

In [None]:
# Preview the first five rows of the re-indexed token table.
tokens.head(n=5)

In [None]:
# Number of distinct term strings in the selection (missing values, if any,
# are counted as a value, matching len(unique())).
tokens['term_str'].nunique(dropna=False)

## Convert tokens to a corpus for MALLET input

In [None]:
# Collapse the token table into one row per document and expose the joined
# text as `doc_content`.
# NOTE(review): presumably `level=2` groups on the first three index levels
# (author, book, chapter), giving one document per chapter — confirm against
# textman.gather_tokens.
corpus = tx.gather_tokens(tokens, level=2, col='term_str')\
    .reset_index().rename(columns={'term_str':'doc_content'})

# Build the document label from each row's own author rather than a
# hard-coded 'doyle': the query does not restrict authors, so a fixed prefix
# mislabels documents and can make labels collide across authors.
# Assumes reset_index() exposes `author` as a column alongside `book` and
# `chapter` (all three are index levels of `tokens`).
corpus['doc_label'] = corpus.apply(
    lambda x: "{}-{}-{}".format(x.author, x.book, x.chapter), axis=1)

In [None]:
# Preview the first five documents with their labels.
corpus.head(n=5)

## Dump corpus to CSV file

In [None]:
# Write just the label and text columns (in that order) to the file that the
# MALLET import step reads; the row index is not written.
corpus.to_csv('novels-corpus.csv', columns=['doc_label', 'doc_content'], index=False)

## MALLET Time

### Show MALLET options

In [None]:
# Run MALLET with no arguments so it prints its available commands.
# Requires the `mallet` executable to be on the PATH.
!mallet 

### Import corpus

In [None]:
# Convert the CSV dump into MALLET's binary instance format.
# --keep-sequence TRUE preserves token order, which topic training requires.
# NOTE(review): the CSV is written with a header row and no explicit
# --line-regex is given here, so MALLET will likely treat the header line as
# a document and may split name/label/data differently than intended —
# confirm against the import-file documentation.
!mallet import-file --input novels-corpus.csv --output novels-corpus.mallet --keep-sequence TRUE

### Train topics

In [None]:
!mallet train-topics --input novels-corpus.mallet --num-topics {num_topics} --num-iterations {num_iters} \
--output-doc-topics novels-doc-topics.txt \
--output-topic-keys novels-topic-keys.txt \
--word-topic-counts-file novels-word-topic-counts-file.txt \
--topic-word-weights-file novels-topic-word-weights-file.txt \
--xml-topic-report novels-topic-report.xml \
--xml-topic-phrase-report novels-topic-phrase-report.xml \
--show-topics-interval {show_interval} \
--use-symmetric-alpha false  \
--optimize-interval 100 \
--diagnostics-file novels-diagnostics.xml
