In [4]:
import pandas as pd
import sqlite3
import requests
import re
import nltk
from nltk.stem.porter import PorterStemmer
import numpy as np

In [5]:
# OHCO: the index levels of the TOKEN table, ordered from coarsest to finest.
OHCO = ['book_id', 'para_num', 'sent_num', 'token_num']

# Each "bag" is the OHCO prefix that ends at the level it is named after.
# The previous slices ([:4], [:3]) were one level too deep for this
# four-level OHCO: SENTS was the token level and PARAS the sentence level.
SENTS = OHCO[:3]  # book_id, para_num, sent_num
PARAS = OHCO[:2]  # book_id, para_num
# This OHCO has no chapter level. CHAPS keeps its previous value (the
# paragraph-level prefix) so that existing references to it still resolve.
CHAPS = OHCO[:2]
BOOKS = OHCO[:1]  # book_id

In [6]:
# Read the three corpus tables from CSV and index each by its key column(s)
data_dir = './'

library_path = data_dir + "LIBRARY.csv"
token_path = data_dir + 'TOKEN.csv'
vocab_path = data_dir + 'VOCAB.csv'

LIBRARY = pd.read_csv(library_path).set_index(BOOKS)    # one row per book
TOKEN = pd.read_csv(token_path).set_index(OHCO)         # one row per token
VOCAB = pd.read_csv(vocab_path).set_index('term_id')    # one row per term

In [7]:
# Enhance the vocab table

# Stop words: a lookup frame keyed by NLTK's English stop words, with a
# constant `dummy` column of 1 that is mapped onto the vocabulary.
stop_terms = pd.Index(nltk.corpus.stopwords.words('english'), name='term_str')
sw = pd.DataFrame({'dummy': 1}, index=stop_terms)

# 1 if the term string is in the stop list, otherwise 0.
# NOTE(review): the lookup is an exact, case-sensitive match on term_str, so
# capitalised forms such as "A" are not flagged -- confirm this is intended.
stop_flag = VOCAB.term_str.map(sw.dummy)
VOCAB['stop'] = stop_flag.fillna(0).astype('int')

In [8]:
from nltk.stem.porter import PorterStemmer

# Porter-stem every vocabulary term. Each value is passed through str() first
# so that a non-string term is stemmed as its string form instead of failing.
stemmer = PorterStemmer()
VOCAB['p_stem'] = [stemmer.stem(str(term)) for term in VOCAB.term_str]

In [9]:
# Preview the first rows of VOCAB, which now carries the `stop` and `p_stem` columns
VOCAB.head()

Unnamed: 0_level_0,term_str,n,p,log_p,stop,p_stem
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,A,4277,0.001217456,-9.681915,0,a
47080,AB,1,2.846519e-07,-21.744298,0,ab
39729,ABANDON,1,2.846519e-07,-21.744298,0,abandon
33967,ABBEY,2,5.693038e-07,-20.744298,0,abbey
36300,ABBOT,2,5.693038e-07,-20.744298,0,abbot


In [10]:
# Add term_id to the TOKEN table by looking up each token string in VOCAB
term_id_by_str = VOCAB.reset_index().set_index('term_str')['term_id']
token_term_ids = TOKEN['token_str'].map(term_id_by_str)
TOKEN['term_id'] = token_term_ids
TOKEN.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_id
book_id,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
21768,0,0,0,"('Produced', 'VBN')",VBN,Produced,5472
21768,0,0,1,"('by', 'IN')",IN,by,33
21768,0,0,2,"('David', 'NNP')",NNP,David,6693
21768,0,0,3,"('Widger', 'NNP')",NNP,Widger,25278
21768,1,0,0,"('A', 'DT')",DT,A,100


In [11]:
# Add each term's most frequent part-of-speech tag to VOCAB
# Non-null count of the first remaining column for every (term_id, pos) pair
pos_counts = TOKEN.groupby(['term_id', 'pos']).count().iloc[:, 0]
# Pivot to a term_id x pos table, then take the tag with the largest count per term
counts_by_tag = pos_counts.unstack()
VOCAB['pos_max'] = counts_by_tag.idxmax(axis=1)
VOCAB.head()

Unnamed: 0_level_0,term_str,n,p,log_p,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100,A,4277,0.001217456,-9.681915,0,a,DT
47080,AB,1,2.846519e-07,-21.744298,0,ab,NNP
39729,ABANDON,1,2.846519e-07,-21.744298,0,abandon,NNP
33967,ABBEY,2,5.693038e-07,-20.744298,0,abbey,NNP
36300,ABBOT,2,5.693038e-07,-20.744298,0,abbot,NNP


In [12]:
# Zipf's Law
# Rank terms by descending corpus count (rank 1 = largest n).
# The guard stops a re-run of this cell from ranking the table a second time.
if 'term_rank' not in VOCAB.columns:
    VOCAB = VOCAB.sort_values('n', ascending=False)
    # term_id stays the index; the rank becomes the first column
    VOCAB.insert(0, 'term_rank', np.arange(1, len(VOCAB) + 1, dtype='int64'))
VOCAB.head()

Unnamed: 0_level_0,term_rank,term_str,n,p,log_p,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,the,201564,0.057376,-4.12342,1,the,DT
1,2,of,104234,0.02967,-5.074832,1,of,IN
2,3,and,102976,0.029312,-5.092349,1,and,CC
3,4,to,80831,0.023009,-5.441677,1,to,TO
4,5,a,80180,0.022823,-5.453343,1,a,DT


In [13]:
# Relative frequency of each term: its count divided by the number of TOKEN rows.
# NOTE(review): `log_p` is not recomputed here -- confirm it still matches `p`.
n_tokens = len(TOKEN)
VOCAB['p'] = VOCAB['n'] / n_tokens
VOCAB.head()

Unnamed: 0_level_0,term_rank,term_str,n,p,log_p,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,the,201564,0.057376,-4.12342,1,the,DT
1,2,of,104234,0.02967,-5.074832,1,of,IN
2,3,and,102976,0.029312,-5.092349,1,and,CC
3,4,to,80831,0.023009,-5.441677,1,to,TO
4,5,a,80180,0.022823,-5.453343,1,a,DT


In [14]:
# Zipf constants: count x rank and probability x rank for every term
for source_col, target_col in (('n', 'zipf_k'), ('p', 'zipf_k_p')):
    VOCAB[target_col] = VOCAB[source_col] * VOCAB['term_rank']
VOCAB.head()

Unnamed: 0_level_0,term_rank,term_str,n,p,log_p,stop,p_stem,pos_max,zipf_k,zipf_k_p
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,the,201564,0.057376,-4.12342,1,the,DT,201564,0.057376
1,2,of,104234,0.02967,-5.074832,1,of,IN,208468,0.059341
2,3,and,102976,0.029312,-5.092349,1,and,CC,308928,0.087937
3,4,to,80831,0.023009,-5.441677,1,to,TO,323324,0.092035
4,5,a,80180,0.022823,-5.453343,1,a,DT,400900,0.114117


In [15]:
# get TFIDF
def make_TFIDF(df, OHCO, count_method, tf_method, idf_method, tf_norm_k=0.5):
    """Build a bag-by-term TFIDF matrix from a token table.

    Parameters
    ----------
    df : pandas.DataFrame
        Token table. Needs a ``term_id`` column, and every name in ``OHCO``
        must be available as a column or an index level.
    OHCO : list of str, or str
        The names that define one "bag" (document), e.g. ``['book_id']``.
        A single name may be passed as a plain string.
    count_method : {'n', 'c'}
        ``'n'``: number of times the term occurs in the bag.
        ``'c'``: 1 if the term occurs in the bag at all, else 0.
    tf_method : {'sum', 'max', 'log', 'raw', 'double_norm', 'binary'}
        How counts are turned into term frequencies:
        ``'sum'`` divides by the bag total, ``'max'`` by the bag maximum,
        ``'log'`` is ``log10(1 + count)``, ``'raw'`` keeps the count,
        ``'double_norm'`` is ``K + (1 - K) * count / max``, and
        ``'binary'`` is 1 where the count is non-zero.
    idf_method : {'standard', 'max', 'smooth'}
        ``'standard'``: ``log10(N / DF)``;
        ``'max'``: ``log10(max(DF) / DF)``;
        ``'smooth'``: ``log10((1 + N) / (1 + DF)) + 1``.
        ``N`` is the number of bags, ``DF`` the number of bags containing
        the term.
    tf_norm_k : float, default 0.5
        ``K`` for ``tf_method='double_norm'``; ignored by the other methods.

    Returns
    -------
    pandas.DataFrame
        One row per bag, one column per ``term_id``, values ``TF * IDF``.

    Raises
    ------
    ValueError
        If ``count_method``, ``tf_method`` or ``idf_method`` is not one of
        the supported names.
    """
    valid_counts = ('n', 'c')
    valid_tf = ('sum', 'max', 'log', 'raw', 'double_norm', 'binary')
    valid_idf = ('standard', 'max', 'smooth')

    # Fail early and clearly instead of with a KeyError / AttributeError /
    # UnboundLocalError after the expensive group-by has already run.
    if count_method not in valid_counts:
        raise ValueError(
            "count_method must be one of {}, got {!r}".format(valid_counts, count_method))
    if tf_method not in valid_tf:
        raise ValueError(
            "tf_method must be one of {}, got {!r}".format(valid_tf, tf_method))
    if idf_method not in valid_idf:
        raise ValueError(
            "idf_method must be one of {}, got {!r}".format(valid_idf, idf_method))

    # Set the bag; copy so the caller's list is never modified
    bag = [OHCO] if isinstance(OHCO, str) else list(OHCO)

    # Bag of words: one row per (bag, term) with its count `n` and presence flag `c`
    BOW = df.groupby(bag + ['term_id'])['term_id'].count().to_frame('n')
    BOW['c'] = BOW.n.astype('bool').astype('int')

    # Document-term count matrix: bags as rows, terms as columns, 0 where absent
    DTCM = BOW[count_method].unstack().fillna(0).astype('int')

    # Term frequency. Work on the transpose (terms x bags) so that per-bag
    # totals / maxima line up with the columns when dividing.
    counts = DTCM.T
    if tf_method == 'sum':
        TF = counts / counts.sum()
    elif tf_method == 'max':
        TF = counts / counts.max()
    elif tf_method == 'log':
        TF = np.log10(1 + counts)
    elif tf_method == 'raw':
        TF = counts
    elif tf_method == 'double_norm':
        TF = counts / counts.max()
        # Only terms present in the bag are rescaled; the boolean mask turns
        # absent terms into NaN instead of giving them the floor weight K.
        TF = tf_norm_k + (1 - tf_norm_k) * TF[TF > 0]
    else:  # 'binary'
        TF = counts.astype('bool').astype('int')
    TF = TF.T

    # Inverse document frequency
    DF = DTCM[DTCM > 0].count()  # number of bags in which each term occurs
    N = DTCM.shape[0]            # number of bags

    if idf_method == 'standard':
        IDF = np.log10(N / DF)
    elif idf_method == 'max':
        IDF = np.log10(DF.max() / DF)
    else:  # 'smooth'
        IDF = np.log10((1 + N) / (1 + DF)) + 1

    # IDF is indexed by term_id, so it is applied column-wise to TF
    TFIDF = TF * IDF

    return TFIDF

In [16]:
# Book-level TFIDF: occurrence counts, sum-normalised TF, standard IDF
TFIDF = make_TFIDF(
    df=TOKEN,
    OHCO=BOOKS,
    count_method='n',
    tf_method='sum',
    idf_method='standard',
)
TFIDF.head()

term_id,0,1,2,3,4,5,6,7,8,9,...,56810,56811,56812,56813,56814,56815,56816,56817,56818,56819
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# PCA

In [22]:
from sklearn.decomposition import PCA
from scipy.linalg import norm
import plotly_express as px
import seaborn as sns

In [18]:
# Keep the first 10 principal components. random_state is fixed so that, if
# scikit-learn selects a stochastic SVD solver for this matrix, the component
# scores and loadings come out the same on every run.
pca_engine = PCA(n_components=10, random_state=0)

In [19]:
# Fit the PCA on the TFIDF matrix and keep each book's component scores
pc_scores = pca_engine.fit_transform(TFIDF)
pc_names = ['PC{}'.format(i) for i in range(pc_scores.shape[1])]
DCM = pd.DataFrame(pc_scores, index=TFIDF.index, columns=pc_names)
# Attach the book title; the assignment aligns LIBRARY.title on the frame's index
DCM['title'] = LIBRARY.title

In [21]:
# Show the first rows of the component scores with the Styler's background colour gradient
DCM.head().style.background_gradient()

Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
108,0.000432,-0.001324,0.000382,-3.8e-05,-0.000142,-6.9e-05,-0.00038,0.000104,0.000112,-3.6e-05,THE RETURN OF SHERLOCK HOLMES
126,-0.000635,0.001358,-0.002032,0.006246,-0.003421,-0.001587,0.000199,0.001667,0.001183,0.000341,THE POISON BELT
139,-0.0005,0.000856,-0.001246,0.003748,-0.001903,-0.000873,0.000118,0.000841,0.000594,0.000164,THE LOST WORLD
244,-1.7e-05,-0.000551,7.4e-05,-1.8e-05,-1.3e-05,4.3e-05,-0.00021,-9.4e-05,-8.2e-05,-5.5e-05,A STUDY IN SCARLET
290,-0.000273,0.000165,-0.000133,5.7e-05,0.000144,0.000213,7e-06,-0.00024,-0.000271,-5.2e-05,THE STARK MUNRO LETTERS BEING SERIES OF TWELVE LETTERS WRITTEN BY J STARK MUNRO M B TO HIS FRIEND AND FORMER FELLOW STUDENT HERBERT SWANBOROUGH OF LOWELL MASSACHUSETTS DURING THE YEARS 1881 1884


In [23]:
# 3-D scatter of the books on the first three principal components;
# hovering over a point shows the book title
px.scatter_3d(
    DCM,
    x='PC0',
    y='PC1',
    z='PC2',
    hover_name='title',
    height=1000,
    width=1200,
)

In [24]:
# Loadings: each component's per-term weights, scaled by the square root of
# the variance that component explains
component_scale = np.sqrt(pca_engine.explained_variance_)
term_weights = pca_engine.components_.T  # rows: terms, columns: components
LOADINGS = pd.DataFrame(term_weights * component_scale)
LOADINGS = LOADINGS.rename(columns="PC{}".format)

In [25]:
# Label each loading row with its term_id (the TFIDF column order) and
# attach the matching term string from VOCAB
term_ids = TFIDF.columns
LOADINGS.index = term_ids.rename('term_id')
# One label-based lookup for all ids; like the per-row lookup it replaces,
# this raises KeyError if an id is missing from VOCAB
LOADINGS['term_str'] = VOCAB.loc[term_ids.astype(int), 'term_str'].to_numpy()

In [28]:
def _extreme_terms(pc, ascending):
    """Return the 10 terms at one end of component `pc`, joined by spaces."""
    ordered = LOADINGS.sort_values(pc, ascending=ascending)
    return ordered.head(10).term_str.str.cat(sep=' ')


# *_pos: terms with the largest loadings; *_neg: terms with the smallest loadings
pc0_pos, pc0_neg = _extreme_terms('PC0', False), _extreme_terms('PC0', True)
pc1_pos, pc1_neg = _extreme_terms('PC1', False), _extreme_terms('PC1', True)
pc2_pos, pc2_neg = _extreme_terms('PC2', False), _extreme_terms('PC2', True)
pc3_pos, pc3_neg = _extreme_terms('PC3', False), _extreme_terms('PC3', True)

In [31]:
# Print the extreme terms for both ends of the first four components
pc_term_report = [
    ('BOOKS PC0+', pc0_pos),
    ('BOOKS PC0-', pc0_neg),
    ('BOOKS PC1+', pc1_pos),
    ('BOOKS PC1-', pc1_neg),
    ('BOOKS PC2+', pc2_pos),
    ('BOOKS PC2-', pc2_neg),
    ('BOOKS PC3+', pc3_pos),
    ('BOOKS PC3-', pc3_neg),
]
for label, terms in pc_term_report:
    print(label, terms)

BOOKS PC0+ Bork Von Holmes Altamont Watson Steiner Martha mister dossier valise
BOOKS PC0- Montgomery Challenger Summerlee Nigel Belmont e Haw Croxley Sadie mdash
BOOKS PC1+ Montgomery Bork Croxley Challenger referee e Von Summerlee mdash Barton
BOOKS PC1- Holmes Watson “I Tregennis Baynes Gennaro — Gregson Lestrade Eccles
BOOKS PC2+ Montgomery Croxley referee Barton Holmes Craggs t thou Master Montgomerys
BOOKS PC2- Challenger Summerlee e mdash Belmont im Sadie orse Cremona Cochrane
BOOKS PC3+ Challenger Summerlee oxygen Austin Malone Professor Challengers Roxton ether McArdle
BOOKS PC3- e mdash orse Cremona im oer Pennarby Till Spider orses


Looking at the principal components, we can see a clear separation between the Sherlock Holmes books and the books about George Challenger (The Lost World, The Poison Belt, The Land of Mist).

Summerlee is a character who travels with Challenger.

In [None]:
# research question.  Can we group books by character reliably?  What methods are most reliable?

# word cloud? (where is this?)
# word2vec, include in PCA?? (add sentiment analysis too?)
# Can I use PCA to predict whether a book has sherlock holmes or not?
# clustering analysis to group books with trees

# Do some more research into the categories of books that ACD wrote