In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import sqlite3
import os
import re
%matplotlib inline

# PART 1: Import and Preprocess 

In [2]:
import nltk
# Fetch the NLTK data packages used by this notebook. The captured output
# below reports every package as "already up-to-date", so on a machine that
# has them these calls only log a status line. The value returned by the last
# call is what the cell displays.
nltk.download('punkt')  # presumably backs nltk.sent_tokenize / nltk.word_tokenize -- confirm
nltk.download('averaged_perceptron_tagger')  # presumably backs nltk.pos_tag -- confirm
nltk.download('stopwords')  # read via nltk.corpus.stopwords.words('english')
nltk.download('tagsets')  # NOTE(review): not referenced by the code visible here
nltk.download('wordnet')  # NOTE(review): not referenced by the code visible here

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/leonardramsey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/leonardramsey/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leonardramsey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/leonardramsey/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leonardramsey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Revised import function

Using NLTK for sentence and word tokenization (instead of splitting on regular expressions) improves the quality of the POS tags. We apply the revised import function to 10 different texts.

In [3]:
# Index level names of the token table, ordered from coarsest to finest.
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

# Prefixes of OHCO used as index names at chapter, paragraph and sentence depth.
CHAPS, PARAS, SENTS = OHCO[:1], OHCO[:2], OHCO[:3]

In [4]:
"""
NOTE: NLTK tokenization messes up whitespace and 
handles non-alpha characters differently.
"""

def text_to_tokens(
                   src_file,
                   body_start=0, 
                   body_end=-1, 
                   chap_pat=r'^\s*Chapter.*$', 
                   para_pat=r'\n\n+', 
                   sent_pat=r'([.;?!"“”]+)', 
                   token_pat=r'([\W_]+)'):
    """Parse a plain-text book into a token table and a vocabulary table.

    The text is cut into chapters (lines matching ``chap_pat``), paragraphs
    (split on ``para_pat``), sentences (``nltk.sent_tokenize``) and POS-tagged
    tokens (``nltk.word_tokenize`` + ``nltk.pos_tag``).

    Parameters
    ----------
    src_file : str
        Path of the UTF-8 text file to read.
    body_start : int
        1-based number of the first line to keep. ``0`` (the default) and
        ``1`` both mean "start at the first line".
    body_end : int
        Lines are kept up to list index ``body_end`` inclusive, i.e. the
        slice stops at ``body_end + 1``. ``-1`` (the default) keeps
        everything to the end of the file.
    chap_pat : str
        Regex matched against each line to detect chapter headings. The first
        kept line must match it.
    para_pat : str
        Regex used to split a chapter into paragraphs.
    sent_pat, token_pat : str
        Unused; kept so existing callers that pass them keep working
        (sentence and token splitting is delegated to NLTK).

    Returns
    -------
    tokens : pandas.DataFrame
        Indexed by ``OHCO`` with columns ``pos``, ``token_str``, ``punc``,
        ``num``, ``term_str`` and ``term_id`` (``-1`` where a token has no
        term).
    vocab : pandas.DataFrame
        Indexed by ``term_id`` with columns ``term_str``, ``n``, ``p``,
        ``port_stem`` and ``stop``.

    Raises
    ------
    ValueError
        If the selected range is empty or its first line is not a chapter
        heading (every line has to belong to a chapter).
    """

    # Text to lines. The context manager closes the file handle promptly.
    with open(src_file, 'r', encoding='utf-8') as src:
        lines = src.readlines()
    # Clamp the start so body_start=0 does not become index -1, and map a
    # stop of 0 (body_end=-1) to "end of file" instead of an empty slice.
    first = max(body_start - 1, 0)
    last = (body_end + 1) or None
    lines = lines[first:last]
    if not lines:
        raise ValueError('no lines selected from %r with body_start=%r, body_end=%r'
                         % (src_file, body_start, body_end))
    df = pd.DataFrame({'line_str': lines})
    df.index.name = 'line_id'
    del(lines)

    # FIX CHARACTERS TO IMPROVE TOKENIZATION
    # Literal replacements; regex=False makes that explicit on every pandas.
    df['line_str'] = df.line_str.str.replace('—', ' — ', regex=False)
    df['line_str'] = df.line_str.str.replace('-', ' - ', regex=False)

    # Lines to Chapters
    mask = df.line_str.str.match(chap_pat)
    if not mask.iloc[0]:
        raise ValueError('first selected line of %r does not match chap_pat %r'
                         % (src_file, chap_pat))
    # Each heading line contributes its own line id; following lines inherit it.
    df['chap_id'] = df.index.to_series().where(mask).ffill().astype('int')
    # factorize numbers chapters 0..n-1 in order of first appearance.
    df['chap_num'] = pd.factorize(df['chap_id'])[0]
    chaps = df.groupby('chap_num')['line_str']\
        .apply(''.join)\
        .to_frame('chap_str')
    del(df)

    # Chapters to Paragraphs
    # dropna() removes the padding that expand=True adds for short chapters,
    # independent of whether stack() drops missing values itself.
    paras = chaps.chap_str.str.split(para_pat, expand=True)\
        .stack()\
        .dropna()\
        .to_frame('para_str')
    paras.index.names = PARAS
    # Collapse all whitespace runs (newlines included) into single spaces.
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default.
    paras['para_str'] = paras.para_str.str.strip()\
        .str.replace(r'\s+', ' ', regex=True)
    paras = paras[~paras.para_str.str.match(r'^\s*$')]
    del(chaps)

    # Paragraphs to Sentences
    sents = paras.para_str\
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .dropna()\
        .to_frame('sent_str')
    sents.index.names = SENTS
    del(paras)

    # Sentences to Tokens: each cell holds a (token, pos) tuple.
    tokens = sents.sent_str\
        .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
        .stack()\
        .dropna()\
        .to_frame('pos_tuple')
    tokens.index.names = OHCO
    del(sents)
    
    tokens['pos'] = tokens.pos_tuple.apply(lambda x: x[1])
    tokens['token_str'] = tokens.pos_tuple.apply(lambda x: x[0])
    tokens = tokens.drop(columns='pos_tuple')

    # Tag punctuation and numbers
    tokens['punc'] = tokens.token_str.str.match(r'^[\W_]*$').astype('int')
    tokens['num'] = tokens.token_str.str.match(r'^.*\d.*$').astype('int')
    
    # Extract vocab with minimal normalization
    is_word = (tokens.punc == 0) & (tokens.num == 0)
    tokens.loc[is_word, 'term_str'] = tokens.token_str.str.lower()\
        .str.replace(r'["_*.]', '', regex=True)
    
    # rename_axis + reset_index(name=...) yields columns ['term_str', 'n'] on
    # every pandas version, regardless of how value_counts names its result.
    vocab = tokens.loc[tokens.punc == 0, 'term_str'].value_counts()\
        .rename_axis('term_str')\
        .reset_index(name='n')
    vocab = vocab.sort_values('term_str').reset_index(drop=True)
    vocab.index.name = 'term_id'
    
    # Get priors for V
    vocab['p'] = vocab.n / vocab.n.sum()
    
    # Add stems
    stemmer = nltk.stem.porter.PorterStemmer()
    vocab['port_stem'] = vocab.term_str.apply(stemmer.stem)
    
    # Define stopwords: 1 when the term is in NLTK's English list, else 0.
    stop_terms = set(nltk.corpus.stopwords.words('english'))
    vocab['stop'] = vocab.term_str.isin(stop_terms).astype('int')
            
    # Add term_ids to tokens; tokens without a term (punctuation, numbers)
    # get -1.
    tokens['term_id'] = tokens['term_str'].map(vocab.reset_index()\
        .set_index('term_str').term_id).fillna(-1).astype('int')

    return tokens, vocab

def get_docs(tokens, div_names, doc_str = 'term_id', sep='', flatten=False, 
             index_only=False):
    """Collapse a token table into one document per group.

    Parameters
    ----------
    tokens : pandas.DataFrame
        Token table; it is grouped by ``div_names``.
    div_names : list of str
        Index level / column names that define a document.
    doc_str : str
        Column whose values make up each document.
    sep : str
        Separator placed between values when they are joined into a string.
    flatten : bool
        If true, the grouping keys are dropped and a DataFrame with a plain
        integer index is returned.
    index_only : bool
        If true, each document is the list of raw values instead of a joined
        string.

    Returns
    -------
    pandas.Series, or pandas.DataFrame when ``flatten`` is true.
    """
    grouped = tokens.groupby(div_names)[doc_str]

    if not index_only:
        # Missing values are skipped (as Series.str.cat does) and values are
        # cast to str so that integer columns such as the default 'term_id'
        # can be joined too.
        docs = grouped.apply(lambda x: sep.join(x.dropna().astype(str)))
    else:
        docs = grouped.apply(lambda x: x.tolist())

    if flatten:
        # Keyword form: the positional axis argument of drop() was removed
        # in pandas 2.0.
        docs = docs.reset_index().drop(columns=div_names)
    
    return docs

def get_term_id(vocab, term_str):
    """Return the index label of the first vocab row whose term_str matches.

    Raises IndexError when no row matches.
    """
    is_match = (vocab['term_str'] == term_str).values
    matching_ids = vocab.index[is_match]
    return matching_ids[0]

def get_term_str(vocab, term_id):
    """Return the term_str stored in vocab under index label term_id."""
    terms = vocab['term_str']
    return terms.loc[term_id]

In [5]:
# Per-book import settings, kept together in one row per text:
# (file stem, body_start, body_end, chap_pat).
# The parallel lists consumed by the cells below are derived from this table.
_CORPUS = [
    ('a_tale_of_two_cities',    111, 15900, r'^\s*(?:CHAPTER).*$'),
    ('anna_karenina',            61, 39892, r'^\s*(?:Chapter|ETYMOLOGY|Epilogue).*$'),
    ('captains_courageous',      38,  6185, r'^\s*(?:CHAPTER|ETYMOLOGY|Epilogue).*$'),
    ('emma',                     43, 16262, r'^\s*(?:CHAPTER|ETYMOLOGY|Epilogue).*$'),
    ('far_from_madding_crowd',  112, 17196, r'^\s*(?:CHAPTER|PREFACE).*$'),
    ('heart_of_darkness',        38,  3343, r'^\s*(?:CHAPTER|ETYMOLOGY|Epilogue).*$'),
    ('jane_eyre',                72, 20700, r'^\s*(?:CHAPTER|PREFACE|Epilogue).*$'),
    ('pride_and_prejudice',      98, 13339, r'^\s*(?:CHAPTER|ETYMOLOGY|Epilogue).*$'),
    ('portrait_of_a_lady_vol1',  43, 12879, r'^\s*(?:CHAPTER|PREFACE|Epilogue).*$'),
    ('portrait_of_a_lady_vol2',  43, 12221, r'^\s*(?:CHAPTER|PREFACE|Epilogue).*$'),
]

# Source texts and the SQLite files written for them share the same stem.
src_file_names = ['corpus/%s.txt' % row[0] for row in _CORPUS]
db_file_names = ['db/%s.db' % row[0] for row in _CORPUS]

# Line range of each book's body and the regex that marks its chapter headings.
body_starts = [row[1] for row in _CORPUS]
body_ends = [row[2] for row in _CORPUS]
chap_pats = [row[3] for row in _CORPUS]

# Filled by the import loop: one token table and one vocab table per book.
K_list = []
V_list = []

In [6]:
# Tokenize every book and collect its token table (K) and vocab table (V).

# The per-book settings live in parallel lists; fail loudly if they drift apart.
assert len(src_file_names) == len(body_starts) == len(body_ends) == len(chap_pats), \
    'per-book configuration lists must have the same length'

# Empty the result lists in place so that re-running this cell replaces the
# earlier results instead of appending a second copy after them.
K_list.clear()
V_list.clear()

for s, (src_file_name, body_start, body_end, chap_pat) in enumerate(
        zip(src_file_names, body_starts, body_ends, chap_pats)):
    print('iteration %d' % s)
    print('src_file_name %s' % src_file_name)
    print('body_start %d' % body_start)
    print('body_end %d' % body_end)
    print('chap_pat %s' % chap_pat)
    cfg = dict(
        src_file = src_file_name,
        body_start = body_start,
        body_end = body_end,
        chap_pat = chap_pat
    )
    print(cfg)

    K, V = text_to_tokens(**cfg)
    K_list.append(K)
    V_list.append(V)

iteration 0
src_file_name corpus/a_tale_of_two_cities.txt
body_start 111
body_end 15900
chap_pat ^\s*(?:CHAPTER).*$
{'src_file': 'corpus/a_tale_of_two_cities.txt', 'body_start': 111, 'body_end': 15900, 'chap_pat': '^\\s*(?:CHAPTER).*$'}
iteration 1
src_file_name corpus/anna_karenina.txt
body_start 61
body_end 39892
chap_pat ^\s*(?:Chapter|ETYMOLOGY|Epilogue).*$
{'src_file': 'corpus/anna_karenina.txt', 'body_start': 61, 'body_end': 39892, 'chap_pat': '^\\s*(?:Chapter|ETYMOLOGY|Epilogue).*$'}
iteration 2
src_file_name corpus/captains_courageous.txt
body_start 38
body_end 6185
chap_pat ^\s*(?:CHAPTER|ETYMOLOGY|Epilogue).*$
{'src_file': 'corpus/captains_courageous.txt', 'body_start': 38, 'body_end': 6185, 'chap_pat': '^\\s*(?:CHAPTER|ETYMOLOGY|Epilogue).*$'}
iteration 3
src_file_name corpus/emma.txt
body_start 43
body_end 16262
chap_pat ^\s*(?:CHAPTER|ETYMOLOGY|Epilogue).*$
{'src_file': 'corpus/emma.txt', 'body_start': 43, 'body_end': 16262, 'chap_pat': '^\\s*(?:CHAPTER|ETYMOLOGY|Epilogue)

# PART 2: Add Term Frequencies and Weights to Tables

In [7]:
def euclidean(row):
    """Euclidean distance between the two TFIDF rows named by ``row.name``.

    ``row.name`` is a pair of labels; each label is looked up in the global
    ``TFIDF`` table.
    """
    doc_x, doc_y = row.name[0], row.name[1]
    diff = TFIDF.loc[doc_x] - TFIDF.loc[doc_y]
    return np.sqrt((diff ** 2).sum())

def cosine(row):
    """Cosine similarity between the two TFIDF rows named by ``row.name``.

    ``row.name`` is a pair of labels; each label is looked up in the global
    ``TFIDF`` table. The result is ``dot(D1, D2) / (||D1|| * ||D2||)`` where
    ``||.||`` is the Euclidean norm, i.e. the square root of the sum of
    squared components. NaN is returned when either vector has zero length.
    """
    D1 = TFIDF.loc[row.name[0]]
    D2 = TFIDF.loc[row.name[1]]
    dot = (D1 * D2).sum()
    # Square the components before summing; squaring the sum (and taking a
    # second square root afterwards) does not give the vector length.
    norm_1 = np.sqrt((D1 ** 2).sum())
    norm_2 = np.sqrt((D2 ** 2).sum())
    denom = norm_1 * norm_2
    if denom == 0:
        return np.nan
    return dot / denom

## Create DTM


In [8]:
# For every book: build a chapter-level bag of words, derive TF / TFIDF / TFTH
# weights, add per-term statistics to the vocab table, compute distances
# between all chapter pairs, and save everything to that book's SQLite file.

# Make sure the output directory exists (once, not on every iteration).
os.makedirs('db/', exist_ok=True)

alpha = .000001 # We introduce an arbitrary smoothing value

for db_file_index, db_file_name in enumerate(db_file_names):
    # Token and vocab tables of the book handled in this iteration.
    K = K_list[db_file_index]
    V = V_list[db_file_index]

    # Create word mask
    # Let's filter out stopwords -- another hyperparameter.
    WORDS = (K.punc == 0) & (K.num == 0) & K.term_id.isin(V[V.stop == 0].index)

    # Extract BOW from tokens
    # A simple groupby() gives the bag-of-words model. The grouping level
    # (OHCO[:1], i.e. chapters) and 'term_id' are the hyperparameters here and
    # can be replaced.
    BOW = K[WORDS].groupby(OHCO[:1] + ['term_id'])['term_id'].count()

    ### Convert BOW to DTM
    DTM = BOW.unstack().fillna(0)

    ## ----- Compute Term Frequencies and Weights -----

    ### Compute TF
    # The smoothing mass is scaled by the size of THIS book's vocabulary, not
    # by whichever vocab table an earlier cell left in V.
    alpha_sum = alpha * V.shape[0]
    TF = DTM.apply(lambda x: (x + alpha) / (x.sum() + alpha_sum), axis=1)

    ### Compute TFIDF
    N_docs = DTM.shape[0]
    V['df'] = DTM[DTM > 0].count()
    TFIDF = TF * np.log2(N_docs / V[V.stop == 0]['df'])

    ### Compute TFTH (Experiment)
    THM = -(TF * np.log2(TF))
    # Column-aligned broadcast of the per-term THM totals over every row;
    # the totals are computed once instead of once per row.
    TFTH = TF * THM.sum()

    ### Add stats to V
    for prefix, weights in (('tf', TF), ('tfidf', TFIDF), ('tfth', TFTH), ('th', THM)):
        V[prefix + '_sum'] = weights.sum()
        V[prefix + '_mean'] = weights.mean()
        V[prefix + '_max'] = weights.max()

    ## Create Docs table
    D = DTM.sum(axis=1).astype('int').to_frame('term_count')
    D['tf'] = D.term_count / D.term_count.sum()

    ## Get all doc pairs
    chap_ids = D.index.tolist()
    pairs = [(i, j) for i in chap_ids for j in chap_ids if j > i]
    P = pd.DataFrame(pairs).reset_index(drop=True).set_index([0, 1])
    P.index.names = ['doc_x', 'doc_y']

    ## Compute Euclidean distance
    # P has no columns after set_index(); give it a placeholder column first
    # so that the row-wise apply() below has rows to hand to the function.
    P['euclidean'] = 0
    P['euclidean'] = P.apply(euclidean, axis=1)

    ## Compute Cosine similarity
    P['cosine'] = P.apply(cosine, axis=1)

    # Save data
    # "with connection" only commits or rolls back; close the connection
    # explicitly so the loop does not leave one open handle per book.
    db = sqlite3.connect(db_file_name)
    try:
        with db:
            V.to_sql('vocab', db, if_exists='replace', index=True)
            K.to_sql('token', db, if_exists='replace', index=True)
            D.to_sql('doc', db, if_exists='replace', index=True)
            P.to_sql('docpair', db, if_exists='replace', index=True)
            TFIDF.stack().to_frame('term_weight')\
                .to_sql('dtm_tfidf', db, if_exists='replace', index=True)
    finally:
        db.close()