In [1]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
import numpy as np
from numpy.linalg import norm
from gensim.models import word2vec
from sklearn.manifold import TSNE
import plotly_express as px

from numpy.linalg import norm
from scipy.spatial.distance import pdist

import seaborn as sns
sns.set(style="ticks")

# tree models
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

# topic modeling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize

import scipy.fftpack as fftpack
from sklearn.neighbors import KernelDensity as KDE

%matplotlib inline

In [2]:
# OHCO: the ordered hierarchy of content objects, coarsest level first.
OHCO = ['book_id', 'para_num', 'sent_num', 'token_num']

# Each alias is the index prefix that identifies ONE unit of that level.
# This corpus has no chapter level, so the slices are one shorter than in a
# book/chapter/paragraph/sentence/token hierarchy: the previous OHCO[:4] for
# SENTS included token_num (every token became its own "sentence") and
# OHCO[:3] for PARAS was really sentence level.
SENTS = OHCO[:3]  # book_id, para_num, sent_num
PARAS = OHCO[:2]  # book_id, para_num
# NOTE(review): there is no chap_num in OHCO; CHAPS keeps its previous value
# (identical to PARAS) so existing references still resolve -- confirm whether
# anything should be using it.
CHAPS = OHCO[:2]
BOOKS = OHCO[:1]  # book_id

In [3]:
# Read the three corpus tables from data_dir and index each one by its key(s)
data_dir = './'

LIBRARY = pd.read_csv(f"{data_dir}LIBRARY.csv").set_index(BOOKS)
TOKEN = pd.read_csv(f"{data_dir}TOKEN.csv").set_index(OHCO)
VOCAB = pd.read_csv(f"{data_dir}VOCAB.csv").set_index('term_id')

In [4]:
# enhance vocab table

# Stop-word lookup: NLTK's English stop words as the index, a constant 1 as the value
sw = pd.DataFrame(
    {'dummy': 1},
    index=pd.Index(nltk.corpus.stopwords.words('english'), name='term_str'),
)

# 1 where term_str exactly matches a stop word, 0 otherwise
VOCAB['stop'] = VOCAB.term_str.map(sw.dummy).fillna(0).astype('int')

In [5]:
from nltk.stem.porter import PorterStemmer

# Porter-stem every vocabulary term; each term is cast to str before stemming
stemmer = PorterStemmer()
VOCAB['p_stem'] = VOCAB.term_str.apply(lambda term: stemmer.stem(str(term)))

In [6]:
VOCAB.head()

Unnamed: 0_level_0,term_str,n,p,log_p,stop,p_stem
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,A,4277,0.001217456,-9.681915,0,a
44791,AB,1,2.846519e-07,-21.744298,0,ab
50322,ABANDON,1,2.846519e-07,-21.744298,0,abandon
34216,ABBEY,2,5.693038e-07,-20.744298,0,abbey
33710,ABBOT,2,5.693038e-07,-20.744298,0,abbot


In [7]:
# add term_id to TOKEN table
# The lookup series maps term_str -> term_id (VOCAB's index values keyed by its term_str column)
TOKEN['term_id'] = TOKEN.token_str.map(
    pd.Series(VOCAB.index.values, index=VOCAB.term_str.values)
)
TOKEN.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_id
book_id,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
21768,0,0,0,"('Produced', 'VBN')",VBN,Produced,5501
21768,0,0,1,"('by', 'IN')",IN,by,33
21768,0,0,2,"('David', 'NNP')",NNP,David,6715
21768,0,0,3,"('Widger', 'NNP')",NNP,Widger,25909
21768,1,0,0,"('A', 'DT')",DT,A,100


In [8]:
# Add max pos to VOCAB
# Per (term_id, pos) pair, take the non-null count of the first remaining TOKEN
# column, pivot the pos tags into columns, and keep for each term the tag whose
# count is largest. The result is aligned to VOCAB on term_id.
VOCAB['pos_max'] = TOKEN.groupby(['term_id', 'pos']).count().iloc[:,0].unstack().idxmax(1)
VOCAB.head()

Unnamed: 0_level_0,term_str,n,p,log_p,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100,A,4277,0.001217456,-9.681915,0,a,DT
44791,AB,1,2.846519e-07,-21.744298,0,ab,NNP
50322,ABANDON,1,2.846519e-07,-21.744298,0,abandon,NNP
34216,ABBEY,2,5.693038e-07,-20.744298,0,abbey,NNP
33710,ABBOT,2,5.693038e-07,-20.744298,0,abbot,NNP


In [9]:
# Zipf's Law
# Rank terms by corpus count n (1 = most frequent). The guard makes re-running
# the cell a no-op once the column exists.
if 'term_rank' not in VOCAB.columns:
    VOCAB = VOCAB.sort_values('n', ascending=False)
    VOCAB.insert(0, 'term_rank', range(1, len(VOCAB) + 1))
VOCAB.head()

Unnamed: 0_level_0,term_rank,term_str,n,p,log_p,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,the,201564,0.057376,-4.12342,1,the,DT
1,2,of,104234,0.02967,-5.074832,1,of,IN
2,3,and,102976,0.029312,-5.092349,1,and,CC
3,4,to,80831,0.023009,-5.441677,1,to,TO
4,5,a,80180,0.022823,-5.453343,1,a,DT


In [10]:
VOCAB['p'] = VOCAB.n / TOKEN.shape[0]
VOCAB.head()

Unnamed: 0_level_0,term_rank,term_str,n,p,log_p,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,the,201564,0.057376,-4.12342,1,the,DT
1,2,of,104234,0.02967,-5.074832,1,of,IN
2,3,and,102976,0.029312,-5.092349,1,and,CC
3,4,to,80831,0.023009,-5.441677,1,to,TO
4,5,a,80180,0.022823,-5.453343,1,a,DT


In [11]:
VOCAB['zipf_k'] = VOCAB.n * VOCAB.term_rank
VOCAB['zipf_k_p'] = VOCAB.p * VOCAB.term_rank
VOCAB.head()

Unnamed: 0_level_0,term_rank,term_str,n,p,log_p,stop,p_stem,pos_max,zipf_k,zipf_k_p
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,the,201564,0.057376,-4.12342,1,the,DT,201564,0.057376
1,2,of,104234,0.02967,-5.074832,1,of,IN,208468,0.059341
2,3,and,102976,0.029312,-5.092349,1,and,CC,308928,0.087937
3,4,to,80831,0.023009,-5.441677,1,to,TO,323324,0.092035
4,5,a,80180,0.022823,-5.453343,1,a,DT,400900,0.114117


In [12]:
# get TFIDF
def make_TFIDF(df, OHCO, count_method, tf_method, idf_method, tf_norm_k=0.5):
    """Build a TF-IDF matrix with one row per bag and one column per term_id.

    Parameters
    ----------
    df : DataFrame
        Token table with a `term_id` column. The names in `OHCO` must be usable
        as groupby keys (columns or index levels).
    OHCO : list of str
        Keys that define the "bag" (document unit), e.g. ['book_id'].
    count_method : {'n', 'c'}
        'n' uses the number of occurrences of a term in a bag; 'c' uses a 0/1
        presence flag.
    tf_method : {'sum', 'max', 'log', 'raw', 'double_norm', 'binary'}
        Term-frequency weighting applied to the count matrix.
    idf_method : {'standard', 'max', 'smooth'}
        Inverse-document-frequency weighting.
    tf_norm_k : float, default 0.5
        Floor K used by 'double_norm': K + (1 - K) * count / max_count.

    Returns
    -------
    DataFrame
        TF * IDF, indexed by the bag keys with term_id as columns.

    Raises
    ------
    ValueError
        If `tf_method` or `idf_method` is not one of the supported names.
    """
    tf_methods = ('sum', 'max', 'log', 'raw', 'double_norm', 'binary')
    idf_methods = ('standard', 'max', 'smooth')
    # Fail fast with a clear message instead of an AttributeError / UnboundLocalError
    # after the expensive groupby has already run.
    if tf_method not in tf_methods:
        raise ValueError("tf_method must be one of {}, got {!r}".format(tf_methods, tf_method))
    if idf_method not in idf_methods:
        raise ValueError("idf_method must be one of {}, got {!r}".format(idf_methods, idf_method))

    bag = list(OHCO)

    # Bag of words: occurrences of each term per bag (n) and a presence flag (c)
    BOW = df.groupby(bag + ['term_id']).term_id.count()\
        .to_frame().rename(columns={'term_id': 'n'})
    BOW['c'] = BOW.n.astype('bool').astype('int')

    # Document-term count matrix: bags as rows, term_ids as columns, 0 where absent
    DTCM = BOW[count_method].unstack().fillna(0).astype('int')

    # TF is computed on the transpose (terms x bags) so that dividing by a
    # per-bag Series aligns on the bag columns.
    counts = DTCM.T
    if tf_method == 'sum':
        TF = counts / counts.sum()
    elif tf_method == 'max':
        TF = counts / counts.max()
    elif tf_method == 'log':
        TF = np.log10(1 + counts)
    elif tf_method == 'raw':
        TF = counts
    elif tf_method == 'double_norm':
        scaled = counts / counts.max()
        # Only terms present in the bag receive the K floor; absent terms stay
        # at 0 (masking them to NaN would propagate NaN into the result).
        TF = (tf_norm_k + (1 - tf_norm_k) * scaled).where(scaled > 0, 0.0)
    else:  # 'binary'
        TF = counts.astype('bool').astype('int')
    TF = TF.T

    # DF: number of bags in which each term occurs; N: number of bags
    DF = DTCM[DTCM > 0].count()
    N = DTCM.shape[0]

    if idf_method == 'standard':
        IDF = np.log10(N / DF)
    elif idf_method == 'max':
        IDF = np.log10(DF.max() / DF)
    else:  # 'smooth': add-one smoothing of both counts, plus 1 so no term gets weight 0
        IDF = np.log10((1 + N) / (1 + DF)) + 1

    TFIDF = TF * IDF

    return TFIDF

In [13]:
TFIDF = make_TFIDF(TOKEN,BOOKS,'n','sum','standard')
TFIDF.head()

term_id,0,1,2,3,4,5,6,7,8,9,...,56810,56811,56812,56813,56814,56815,56816,56817,56818,56819
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# PCA

In [14]:
from sklearn.decomposition import PCA
from scipy.linalg import norm
import plotly_express as px
import seaborn as sns

In [15]:
pca_engine = PCA(n_components=10)

In [16]:
# Project every TFIDF row onto the principal components, name the columns
# PC0, PC1, ... and attach each book's title (aligned on the shared index).
DCM = (
    pd.DataFrame(pca_engine.fit_transform(TFIDF), index=TFIDF.index)
    .add_prefix('PC')
    .assign(title=LIBRARY.title)
)

In [19]:
DCM.style.background_gradient()

Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
108,0.000431,-0.001309,0.000391,-2e-06,-0.000123,-0.000105,-0.00043,9e-05,7.6e-05,4e-05,THE RETURN OF SHERLOCK HOLMES
126,-0.000634,0.001411,-0.002283,0.006361,-0.003054,-0.001573,-0.000403,-0.001003,0.001607,0.000107,THE POISON BELT
139,-0.000499,0.00088,-0.00139,0.003811,-0.001686,-0.000863,-0.000171,-0.000513,0.000812,5.1e-05,THE LOST WORLD
244,-1.9e-05,-0.000548,7.2e-05,-6e-06,2e-06,3.1e-05,-0.000189,0.000169,-0.000153,8.5e-05,A STUDY IN SCARLET
290,-0.000279,0.000145,-0.000125,6.3e-05,0.000135,0.000202,0.000108,0.000139,-0.000304,-4.4e-05,THE STARK MUNRO LETTERS BEING SERIES OF TWELVE LETTERS WRITTEN BY J STARK MUNRO M B TO HIS FRIEND AND FORMER FELLOW STUDENT HERBERT SWANBOROUGH OF LOWELL MASSACHUSETTS DURING THE YEARS 1881 1884
294,-0.000209,0.00018,-0.000111,4.4e-05,0.000134,0.000132,8.8e-05,0.000133,-0.000258,9e-06,THE CAPTAIN OF THE POLESTAR AND OTHER TALES
355,-0.000334,0.000256,-4.4e-05,0.000397,9.9e-05,0.00013,0.000294,3.3e-05,-0.00037,-0.000158,THE PARASITE A STORY
356,-0.000371,0.000155,-0.000289,7e-06,0.000236,0.000462,5e-06,6.3e-05,-0.000209,-0.000293,BEYOND THE CITY
423,-0.000253,0.000116,-0.000108,5.5e-05,0.000112,0.000152,5.7e-05,0.000122,-0.000236,-2.8e-05,ROUND THE RED LAMP BEING FACTS AND FANCIES OF MEDICAL LIFE
439,-0.000302,0.000173,-0.000192,0.000135,0.000153,0.000177,0.000125,0.000235,-0.00042,-0.000226,THE VITAL MESSAGE


In [20]:
px.scatter_3d(DCM, 'PC0', 'PC1','PC2', hover_name='title', height=1000, width=1200)

In [21]:
# Loadings: component vectors (one column per component) scaled by the square
# root of that component's explained variance; columns named PC0, PC1, ...
LOADINGS = pd.DataFrame(
    pca_engine.components_.T * np.sqrt(pca_engine.explained_variance_)
).add_prefix('PC')

In [22]:
# Index the loadings by term_id (same order as the TFIDF columns) and look up
# each term's string in VOCAB with a single label-based selection.
LOADINGS.index = TFIDF.columns.rename('term_id')
LOADINGS['term_str'] = VOCAB.loc[LOADINGS.index.astype(int), 'term_str'].values

In [23]:
def _extreme_terms(component, ascending):
    """Space-joined term_str of the 10 rows at one end of LOADINGS sorted by `component`."""
    ranked = LOADINGS.sort_values(component, ascending=ascending)
    return ranked.head(10).term_str.str.cat(sep=' ')

# *_pos: highest loadings on the component; *_neg: lowest loadings
pc0_pos, pc0_neg = _extreme_terms('PC0', False), _extreme_terms('PC0', True)
pc1_pos, pc1_neg = _extreme_terms('PC1', False), _extreme_terms('PC1', True)
pc2_pos, pc2_neg = _extreme_terms('PC2', False), _extreme_terms('PC2', True)
pc3_pos, pc3_neg = _extreme_terms('PC3', False), _extreme_terms('PC3', True)

In [24]:
# Print the positive and negative extreme terms of PC0..PC3, one line each
for component, (pos_terms, neg_terms) in enumerate([
    (pc0_pos, pc0_neg),
    (pc1_pos, pc1_neg),
    (pc2_pos, pc2_neg),
    (pc3_pos, pc3_neg),
]):
    print('BOOKS PC{}+'.format(component), pos_terms)
    print('BOOKS PC{}-'.format(component), neg_terms)

BOOKS PC0+ Bork Von Holmes Altamont Watson Steiner Martha mister dossier valise
BOOKS PC0- Montgomery Challenger Summerlee Nigel Belmont e Haw Croxley mdash Sadie
BOOKS PC1+ Montgomery Bork Croxley Challenger referee e Von Summerlee mdash Barton
BOOKS PC1- Holmes Watson Tregennis “I Baynes Gennaro — Gregson Eccles Lestrade
BOOKS PC2+ Montgomery Croxley referee Barton Holmes Craggs t thou Master Montgomerys
BOOKS PC2- Challenger Summerlee e mdash Belmont Sadie im orse Cochrane dragoman
BOOKS PC3+ Challenger Summerlee oxygen Austin Malone Professor Challengers Roxton ether McArdle
BOOKS PC3- e mdash orse Cremona im oer Pennarby Till Spider orses


Looking at the principal components, we can see a clear separation between the Sherlock Holmes books and the books about George Challenger (The Lost World, The Poison Belt, The Land of the Mist).

Summerlee is a character who travels with Challenger.

## Word2Vec - Word Similarity

In [None]:
LIBRARY.shape

In [None]:
# gensim's preferred format: a list of token lists, one list per `bag` group.
# Tokens whose POS tag starts with NNP (proper nouns) are dropped first.
bag = BOOKS # book
corpus = (
    TOKEN.loc[~TOKEN.pos.str.match('NNPS?')]
    .groupby(bag)['token_str']
    .apply(list)
    .reset_index()['token_str']
    .tolist()
)

In [None]:
window = 5
model = word2vec.Word2Vec(corpus, window=window, min_count=200, workers=4) #size = 246

In [None]:
# One row per word in the model's vocabulary: its label and its embedding vector
coords = pd.DataFrame({'label': model.wv.index_to_key})
coords['vector'] = coords['label'].map(model.wv.get_vector)

In [None]:
coords.head()

In [None]:
tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
tsne_values = tsne_model.fit_transform(coords['vector'].tolist())

In [None]:
coords['x'] = tsne_values[:,0]
coords['y'] = tsne_values[:,1]

In [None]:
coords.head()

In [None]:
px.scatter(coords, 'x', 'y', text='label', height=1000).update_traces(mode='text')

In [None]:
# Same pipeline as above, but with one Word2Vec "sentence" per PARAS group
# (proper nouns, i.e. tags starting with NNP, are still excluded).
bag = PARAS
window = 5
corpus2 = (
    TOKEN.loc[~TOKEN.pos.str.match('NNPS?')]
    .groupby(bag)['token_str']
    .apply(list)
    .reset_index()['token_str']
    .tolist()
)
model2 = word2vec.Word2Vec(sentences=corpus2, window=window, min_count=200, workers=4)

# Vocabulary labels, their vectors, and a 2-D t-SNE layout of those vectors
coords2 = pd.DataFrame({'label': model2.wv.index_to_key})
coords2['vector'] = coords2['label'].map(model2.wv.get_vector)
tsne_values2 = tsne_model.fit_transform(coords2['vector'].tolist())
coords2['x'], coords2['y'] = tsne_values2[:, 0], tsne_values2[:, 1]
px.scatter(coords2, 'x', 'y', text='label', height=1000).update_traces(mode='text')

In [None]:
# include proper nouns: no POS filter this time, one token list per PARAS group
bag = PARAS
window = 5
corpus3 = (
    TOKEN
    .groupby(bag)['token_str']
    .apply(list)
    .reset_index()['token_str']
    .tolist()
)
model3 = word2vec.Word2Vec(sentences=corpus3, window=window, min_count=200, workers=4)

# Vocabulary labels, their vectors, and a 2-D t-SNE layout of those vectors
coords3 = pd.DataFrame({'label': model3.wv.index_to_key})
coords3['vector'] = coords3['label'].map(model3.wv.get_vector)
tsne_values3 = tsne_model.fit_transform(coords3['vector'].tolist())
coords3['x'], coords3['y'] = tsne_values3[:, 0], tsne_values3[:, 1]
px.scatter(coords3, 'x', 'y', text='label', height=1000).update_traces(mode='text')

ACD's corpus is very uniform in tone, even between different series/characters

## Book Similarity

In [None]:
# create DOC table: one row per TFIDF row (positional doc_id), holding the
# book_id and the first 40 characters of its title
DOC = TFIDF.reset_index()[['book_id']].rename_axis(None, axis='columns')
DOC.index.name = 'doc_id'
DOC['title'] = DOC.book_id.map(LIBRARY.title.str[:40])

DOC

In [None]:
# pd.options.display.max_rows = 30
# LIBRARY.title.str.len()

In [None]:
# normalized tables
# L0: binary presence matrix (1 wherever the TFIDF value casts to True)
L0 = TFIDF.astype('bool').astype('int')
# L1: each row divided by its own sum
L1 = TFIDF.apply(lambda x: x / x.sum(), 1)
# L2: each row divided by norm(x) (called with its default arguments)
L2 = TFIDF.apply(lambda x: x / norm(x), 1)
# Sanity check: the squared entries of every L2 row should sum to 1
((L2.T)**2).sum() # looks good

In [None]:
# create pairs
# Start from the full Cartesian product of DOC index values; reset_index turns
# the two unnamed levels into columns level_0 / level_1.
PAIRS = pd.DataFrame(index=pd.MultiIndex.from_product([DOC.index.tolist(), DOC.index.tolist()])).reset_index()
# Keep each unordered pair once (first id < second id) and index by the pair.
# NOTE(review): presumably this row order is relied on to line up with pdist's
# condensed output assigned later -- confirm.
PAIRS = PAIRS[PAIRS.level_0 < PAIRS.level_1].set_index(['level_0','level_1'])
PAIRS.index.names = ['doc_a', 'doc_b']
PAIRS.head()

In [None]:
PAIRS.shape

In [None]:
# Pairwise distances between TFIDF rows (or their normalized variants), one column per metric
PAIRS['cityblock'] = pdist(TFIDF, 'cityblock')
PAIRS['euclidean'] = pdist(TFIDF, 'euclidean')
PAIRS['cosine'] = pdist(TFIDF, 'cosine')
PAIRS['jaccard'] = pdist(L0, 'jaccard') # Fast, and similar to js
PAIRS['dice'] = pdist(L0, 'dice')
PAIRS['js'] = pdist(L1, 'jensenshannon') # Turns out to be really slow
PAIRS['euclidean2'] = pdist(L2, 'euclidean') # On unit-length rows: squared value = 2 * cosine distance, so the ranking matches cosine

PAIRS['yule'] = pdist(L0, 'yule') # Yule dissimilarity on the binary presence matrix (not equivalent to cosine)
PAIRS.head()

In [None]:
def hca(sims, linkage_method='ward', color_thresh=.3, figsize=(10, 10), labels=None):
    """Draw a left-oriented dendrogram from a condensed distance vector.

    Parameters
    ----------
    sims : 1-D array-like
        Condensed pairwise distances, passed straight to `sch.linkage`.
    linkage_method : str
        Linkage method name forwarded to `sch.linkage`.
    color_thresh : float
        `color_threshold` for the dendrogram; links above it are drawn in grey ('.75').
    figsize : tuple
        Size of the figure that is created for the plot.
    labels : sequence, optional
        Leaf labels. Defaults to the global `DOC.title.values`, the previous
        hard-wired behaviour.
    """
    if labels is None:
        labels = DOC.title.values

    tree = sch.linkage(sims, method=linkage_method)

    # Create exactly one figure. The earlier plt.figure() call opened an extra,
    # empty figure on every invocation.
    fig, axes = plt.subplots(figsize=figsize)
    sch.dendrogram(tree,
                   labels=labels,
                   orientation="left",
                   count_sort=True,
                   distance_sort=True,
                   above_threshold_color='.75',
                   color_threshold=color_thresh,
                   ax=axes
                  )
    axes.tick_params(axis='both', which='major', labelsize=14)

In [None]:
# One dendrogram per distance column of PAIRS, each preceded by its name
for metric in ['cityblock', 'euclidean', 'cosine', 'jaccard',
               'dice', 'js', 'euclidean2', 'yule']:
    print("Method: " + metric)
    hca(PAIRS[metric], color_thresh=.3)

## Topic modeling

In [None]:
n_terms = 4000
n_topics = 30
max_iter = 5

In [None]:
# Keep only tokens tagged NN or NNS, then join the tokens of each PARAS group
# into a single space-separated string (column para_str)
PARA_TOKEN = (
    TOKEN.loc[TOKEN.pos.str.match(r'^NNS?$')]
    .groupby(PARAS)['token_str']
    .apply(' '.join)
    .to_frame('para_str')
)

In [None]:
PARA_TOKEN.sample(15)

In [None]:
# Term counts per PARA_TOKEN row, limited to the n_terms most frequent
# non-stop-word terms
tfv = CountVectorizer(max_features=n_terms, stop_words='english')
tf = tfv.fit_transform(PARA_TOKEN.para_str)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when available and fall back for older versions. Kept as a list, as before.
if hasattr(tfv, 'get_feature_names_out'):
    TERMS = list(tfv.get_feature_names_out())
else:
    TERMS = tfv.get_feature_names()

In [None]:
lda = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)

In [None]:
# Document-topic matrix: one row per PARA_TOKEN row, one column per topic
THETA = pd.DataFrame(
    lda.fit_transform(tf), index=PARA_TOKEN.index
).rename_axis('topic_id', axis='columns')

In [None]:
THETA.sample(20).style.background_gradient()

In [None]:
# Topic-term matrix: one row per topic, one column per vocabulary term
PHI = pd.DataFrame(
    lda.components_, columns=pd.Index(TERMS, name='term_str')
).rename_axis('topic_id')

In [None]:
PHI.T.head().style.background_gradient()

In [None]:
# For every topic, take its 10 highest-weighted terms; the result has one row
# per topic_id and the term strings in columns 0..9 (best first).
# drop(columns=...) replaces the positional axis argument, which pandas 2.0
# no longer accepts.
TOPICS = PHI.stack().to_frame().rename(columns={0:'weight'})\
    .groupby('topic_id')\
    .apply(lambda x: 
           x.weight.sort_values(ascending=False)\
               .head(10)\
               .reset_index()\
               .drop(columns='topic_id')\
               .term_str)

In [None]:
TOPICS

In [None]:
TOPICS['label'] = TOPICS.apply(lambda x: str(x.name) + ' ' + ' '.join(x), 1)

In [None]:
TOPICS['doc_weight_sum'] = THETA.sum()

In [None]:
TOPICS.sort_values('doc_weight_sum', ascending=True).plot.barh(y='doc_weight_sum', x='label', figsize=(5,10)) 

In [None]:
def plot_tree(tree, labels):
    """Draw `tree` (a linkage matrix) as a left-oriented dendrogram with the given leaf labels."""
    # Create exactly one figure. The earlier plt.figure() call opened an extra,
    # empty figure on every invocation.
    fig, axes = plt.subplots(figsize=(5, 10))
    sch.dendrogram(tree, labels=labels, orientation="left", ax=axes)
    axes.tick_params(axis='both', which='major', labelsize=14)

In [None]:
SIMS = pdist(normalize(PHI), metric='euclidean')
TREE = sch.linkage(SIMS, method='ward')

In [None]:
# TREE is built from PHI, which has one row per topic, so the leaf labels must
# be per topic as well. PARA_TOKEN is per paragraph and has no `topterms`
# column, so the old expression raised AttributeError. TOPICS.label already
# combines the topic id with its top terms.
labels = TOPICS.label.tolist()

In [None]:
# research question.  Can we group books by character reliably?  What methods are most reliable?

# word cloud? (where is this?)
# word2vec, include in PCA?? (add sentement analysis too?) - Include in VOCAB table
# Can I use PCA to predict whether a book has sherlock holmes or not?
# clustering analysis to group books with trees - more exploring
# LDA topic modeling
# sentiment analysis to compare plots? Select books?

# Do some more research into the books categories of books that ACD wrote

In [None]:
# Sentiment lexicon keyed by token_str; strip the 'nrc_' prefix from its columns
salex = pd.read_csv('salex_nrc.csv').set_index('token_str')
salex = salex.rename(columns=lambda col: col.replace('nrc_', ''))
salex['polarity'] = salex.positive - salex.negative

emo_cols = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
            'sadness', 'surprise', 'trust', 'polarity']

# Attach the lexicon values to every token; tokens missing from the lexicon get 0
TOKEN = TOKEN.join(salex, on='token_str', how='left')
TOKEN[emo_cols] = TOKEN[emo_cols].fillna(0)

In [None]:
def get_dct_transform(raw_values, low_pass_size=5, x_reverse_len=100):
    """Low-pass filter a series with the DCT and resample it to a fixed length.

    Takes the type-2 DCT of `raw_values`, keeps the first `low_pass_size`
    coefficients, zero-pads them to `x_reverse_len` entries and returns the
    inverse DCT of that padded vector.

    Raises ValueError when `low_pass_size` exceeds the number of raw values.
    """
    n_values = len(raw_values)
    if n_values < low_pass_size:
        raise ValueError("low_pass_size must be smaller than or equal to length of raw_values")

    spectrum = fftpack.dct(raw_values, type=2)

    # Low-frequency coefficients followed by zeros for all higher frequencies
    zero_tail = np.zeros(x_reverse_len - low_pass_size)
    truncated = np.concatenate([spectrum[:low_pass_size], zero_tail])

    return fftpack.idct(truncated)

In [None]:
# Mean of every emotion column per SENTS group, joined with each book's
# title/author and re-indexed by title followed by the SENTS keys
NOVELS = (
    TOKEN.groupby(SENTS)[emo_cols].mean()
    .reset_index()
    .set_index('book_id')
    .join(LIBRARY[['title', 'author']])
    .reset_index()
    .set_index(['title'] + SENTS)
)

In [None]:
def plot_novel(title, dct=True, low_pass_size=5, x_reverse_len=100, emo='polarity', item=1, color='blue'):
    """Plot one emotion series of one book on `axes[item]`.

    Reads the globals NOVELS (emotion values indexed by title), axes (the array
    of subplots to draw on) and plot_cfg (base keyword arguments for the plot).

    Parameters
    ----------
    title : str
        Book title to select from NOVELS.
    dct : bool
        When True (default) the series is smoothed with `get_dct_transform`;
        when False the raw values are plotted.
    low_pass_size, x_reverse_len : int
        Forwarded to `get_dct_transform`.
    emo : str
        Name of the NOVELS column to plot.
    item : int
        Position in `axes` to draw on.
    color : str
        Line color.
    """
    X = NOVELS.loc[title, emo].values

    if dct:
        method = "DCT"
        X = get_dct_transform(X, low_pass_size=low_pass_size, x_reverse_len=x_reverse_len)
    else:
        method = "raw"

    # Work on a copy so the shared plot_cfg dict is not modified by each call
    cfg = dict(plot_cfg, title="{} ({})".format(title, method))

    # Use the `item` argument: the previous code indexed axes with the global
    # loop variable `i`, which only worked when the caller's loop used that name.
    pd.Series(X).plot(**cfg, ax=axes[item], color=color);

In [None]:
titles = {"THE HOUND OF THE BASKERVILLES","A STUDY IN SCARLET","THE SIGN OF THE FOUR","THE VALLEY OF FEAR", "THE LOST WORLD","THE POISON BELT"}

In [None]:
# One stacked subplot per title; the figure height grows with the number of titles
fig, axes = plt.subplots(len(titles), 1)
plot_cfg = {
    'figsize': (25, 5 * len(titles)),
    'legend': False,
    'fontsize': 14,
    'rot': 45,
}

# The two listed titles are drawn in red; all others use plot_novel's default color
for i, title in enumerate(sorted(titles)):
    plot_novel(
        title,
        item=i,
        **({"color": "red"} if title in ("THE LOST WORLD", "THE POISON BELT") else {})
    )

In [None]:
# Empty registry plus default keyword arguments for figures
PLOTS = dict()
FIG = {
    'figsize': (25, 5),
    'legend': True,
    'fontsize': 20,
    'rot': 45,
}