# Imports

In [96]:
import pickle
import glob
import re, os
import pandas as pd

from gensim import corpora
from gensim.models import TfidfModel
from gensim import similarities

import spacy
from spacy.lang.en import English

import nltk
from nltk.stem import PorterStemmer

from scipy.cluster import hierarchy

from tqdm.notebook import trange, tqdm
from tqdm import tqdm_gui
import time

import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px

In [97]:
# Download NLTK's averaged-perceptron tagger data. nltk.pos_tag is called later
# in this notebook and is presumably the consumer of this resource.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Loading in data

In [2]:
# Data location: the books are the .txt files directly inside this folder.
folder = "/kaggle/input/book-dataset/"

# Collect every matching path with glob and keep the list in sorted order,
# so that list positions are stable from run to run.
files = sorted(glob.glob(folder + '*.txt'))

In [3]:
# inspecting list of files, to ensure the dataset was properly loaded
files

['/kaggle/input/book-dataset/Autobiography.txt',
 '/kaggle/input/book-dataset/CoralReefs.txt',
 '/kaggle/input/book-dataset/DescentofMan.txt',
 '/kaggle/input/book-dataset/DifferentFormsofFlowers.txt',
 '/kaggle/input/book-dataset/EffectsCrossSelfFertilization.txt',
 '/kaggle/input/book-dataset/ExpressionofEmotionManAnimals.txt',
 '/kaggle/input/book-dataset/FormationVegetableMould.txt',
 '/kaggle/input/book-dataset/FoundationsOriginofSpecies.txt',
 '/kaggle/input/book-dataset/GeologicalObservationsSouthAmerica.txt',
 '/kaggle/input/book-dataset/InsectivorousPlants.txt',
 '/kaggle/input/book-dataset/LifeandLettersVol1.txt',
 '/kaggle/input/book-dataset/LifeandLettersVol2.txt',
 '/kaggle/input/book-dataset/MonographCirripedia.txt',
 '/kaggle/input/book-dataset/MonographCirripediaVol2.txt',
 '/kaggle/input/book-dataset/MovementClimbingPlants.txt',
 '/kaggle/input/book-dataset/OriginofSpecies.txt',
 '/kaggle/input/book-dataset/PowerMovementPlants.txt',
 '/kaggle/input/book-dataset/Variati

### Isolating text and title for each book

In [4]:
# Load every book into two parallel lists we can use later:
# `titles[i]` is the file name without its extension, `txts[i]` the cleaned text.

txts = []
titles = []

# Any run of non-alphanumeric characters (underscore included) becomes one space.
# Raw string on purpose: '\W' inside a plain literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
non_alnum = re.compile(r'[\W_]+')

for n in files:
    # `with` closes the handle as soon as the file has been read; the original
    # bare open() left one file open per book.
    with open(n, encoding='utf-8-sig') as f:
        text = non_alnum.sub(' ', f.read())
    # splitext drops only the trailing extension, whereas replace('.txt', '')
    # would also remove '.txt' from the middle of a name.
    titles.append(os.path.splitext(os.path.basename(n))[0])
    txts.append(text)

In [5]:
# Sanity check: print the first title, then characters 1-399 of the first
# book's text, to ensure we're pulling the titles and text in correctly.
# NOTE(review): the slice starts at 1, so character 0 is skipped - presumably a
# leading space left by the regex clean-up; confirm.
print(titles[0])
print(txts[0][1:400])

Autobiography
THE AUTOBIOGRAPHY OF CHARLES DARWIN From The Life and Letters of Charles Darwin By Charles Darwin Edited by his Son Francis Darwin My father s autobiographical recollections given in the present chapter were written for his children and written without any thought that they would ever be published To many this may seem an impossibility but those who knew my father will understand how it was not o


### Grab the index for the book 'Descent of Man'
We'll be seeing how closely the other books in our dataset compare to this title.

In [6]:
# Position of 'Descent of Man' in `titles` (and therefore in every list built
# in parallel with it). list.index raises ValueError right here if the title is
# missing; the manual loop it replaces left `dom` undefined in that case, so
# the failure only showed up as a NameError at the print below.
dom = titles.index('DescentofMan')
# Print the stored index
print(dom)

2


# Preprocessing
This will include:
* Loading in stopwords
* Tokenizing
* Stemming text in each book

### Load in stopwords

In [7]:
# using spaCy's English stop word set
stopwords = spacy.lang.en.stop_words.STOP_WORDS

# inspecting 10 entries from the set (the sample is whatever order list()
# yields; it is not sorted)
list(stopwords)[:10]

['afterwards',
 'while',
 'which',
 'moreover',
 "'re",
 'whereas',
 'or',
 'both',
 'call',
 'one']

### Pre-process text in corpus
After converting all text to lowercase and splitting it on whitespace, we'll create a new list of lists: each inner list represents one book and contains every word from that book's text that *IS NOT* a stopword, as defined in the cell above.

In [9]:
# Step 1: lowercase each book and break it into whitespace-separated words.
txts_lower_split = [book.lower().split() for book in txts]

# Step 2: keep only the words that are not in the stop-word set.
texts = [
    [token for token in book_words if token not in stopwords]
    for book_words in txts_lower_split
]

# Show the first 100 remaining words of the book at position 2.
print(texts[2][:100])

['descent', 'man', 'selection', 'relation', 'sex', 'works', 'charles', 'darwin', 'f', 'r', 's', 'life', 'letters', 'charles', 'darwin', 'autobiographical', 'chapter', 'edited', 'francis', 'darwin', 'portraits', '3', 'volumes', '36s', 'popular', 'edition', 'condensed', '1', 'volume', '7s', '6d', 'naturalist', 's', 'journal', 'researches', 'natural', 'history', 'geology', 'countries', 'visited', 'voyage', 'round', 'world', '100', 'illustrations', 'pritchett', '21s', 'popular', 'edition', 'woodcuts', '3s', '6d', 'cheaper', 'edition', '2s', '6d', 'net', 'origin', 'species', 'means', 'natural', 'selection', 'preservation', 'favoured', 'races', 'struggle', 'life', 'large', 'type', 'edition', '2', 'volumes', '12s', 'popular', 'edition', '6s', 'cheaper', 'edition', 'portrait', '2s', '6d', 'contrivances', 'orchids', 'fertilized', 'insects', 'woodcuts', '7s', '6d', 'variation', 'animals', 'plants', 'domestication', 'illustrations', '15s', 'descent', 'man', 'selection', 'relation', 'sex', 'illust

### Stemming tokenized words

In [33]:
porter = PorterStemmer()

# Stem each distinct token once and reuse the result. The same token maps to
# the same stem every time, so this gives exactly the lists the per-occurrence
# version produced while making far fewer stemmer calls.
vocabulary = {token for text in texts for token in text}
stem_of = {token: porter.stem(token) for token in vocabulary}

# One list of stems per book, in the original token order.
stem_texts = [[stem_of[token] for token in text] for text in texts]

In [57]:
# dumping to pickle so we don't have to repeat the stemming step when session ends
# ('wb' creates the file, or overwrites it if it already exists)
with open('/kaggle/working/stem_texts.p', 'wb') as f:
    pickle.dump(stem_texts, f)

In [58]:
# open pickled stemmed tokens (same path the dump step writes) and rebind `stem_texts`
# NOTE(review): pickle.load can execute arbitrary code, so only load a file
# this notebook produced itself.
with open('/kaggle/working/stem_texts.p', 'rb') as f:
    stem_texts = pickle.load(f)

In [60]:
# remove pickled file from working directory if needed

# os.remove("/kaggle/working/stem_texts.p")

In [62]:
# previewing first 20 stemmed tokens from Descent of Man, looked up through the
# `dom` index computed earlier instead of a hard-coded position.
stem_texts[dom][:20]

['descent',
 'man',
 'select',
 'relat',
 'sex',
 'work',
 'charl',
 'darwin',
 'f',
 'r',
 's',
 'life',
 'letter',
 'charl',
 'darwin',
 'autobiograph',
 'chapter',
 'edit',
 'franci',
 'darwin']

# Building a bag of words model
We can use methods from gensim to create a dictionary and bag of words model for the stemmed tokens in each book

In [64]:
# Build the token <-> integer id mapping from the stemmed books, then convert
# each book into its bag-of-words representation.
# `stem_texts` is the name the stemming and pickle cells define; the previous
# `texts_stem` was never assigned and raised NameError on a fresh kernel.
dictionary = corpora.Dictionary(stem_texts)
bows = [dictionary.doc2bow(book_tokens) for book_tokens in stem_texts]

# Print the first five elements of the Descent of Man bag-of-words model,
# using the `dom` index rather than a hard-coded position.
print(bows[dom][:5])

[(0, 58), (5, 35), (6, 38), (8, 27), (9, 4)]


# Most common words 
Great, we have a bag-of-words model for each book, built using the dictionary created in the previous cell. Now let's convert the bag of words for Descent of Man into a dataframe in order to inspect its top tokens.

In [90]:
# One row per distinct token in Descent of Man: its dictionary id and how often
# it occurs. Passing `columns=` at construction also works when the bag of
# words is empty, where assigning `.columns` afterwards would fail.
# The book is selected through `dom` rather than a hard-coded position.
df_bow_dom = pd.DataFrame(bows[dom], columns=['index', 'occurrences'])

# Translate each id back into its token text via the gensim dictionary.
df_bow_dom['token'] = [dictionary[token_id] for token_id in df_bow_dom['index']]

# sort the created dataframe by occurrences, most frequent first
df_bow_dom_sorted = df_bow_dom.sort_values(by='occurrences', ascending=False)
display(df_bow_dom_sorted)

Unnamed: 0,index,occurrences,token
1092,1427,2867,male
5818,7711,2135,femal
341,471,1445,colour
494,659,1407,differ
1580,2050,1326,sex
...,...,...,...
6243,8136,1,gurgl
6244,8137,1,guttatum
6248,8141,1,gypsi
6250,8143,1,gyru


## Further exploration
Let's take a look at what kinds of tokens are used most. We'll use NLTK's part-of-speech tagger to examine the top words by part of speech.

In [132]:
# Part-of-speech tag names (as listed in the NLTK docs), grouped by word class,
# so tokens can be filtered by the tag assigned to them.
adjectives = 'JJ JJR JJS'.split()
nouns = 'NN NNS NNP NNPS'.split()
verbs = 'VB VBD VBG VBN VBP VBZ'.split()

In [141]:
# Tag every token, then keep only the tag from each (token, tag) pair as a new
# `pos` column on the sorted frame.
# NOTE(review): the tagger receives stemmed tokens in frequency order rather
# than running text, so treat the tags as approximate - confirm this is acceptable.
tagged_tokens = nltk.pos_tag(df_bow_dom_sorted['token'])
df_bow_dom_sorted['pos'] = [tag for _, tag in tagged_tokens]

In [145]:
# Preview the top rows now that the `pos` column has been added.
df_bow_dom_sorted.head()

Unnamed: 0,index,occurrences,token,pos
1092,1427,2867,male,NN
5818,7711,2135,femal,JJ
341,471,1445,colour,NN
494,659,1407,differ,NN
1580,2050,1326,sex,NN


In [183]:
# using pandas query function to create a new dataframe of just nouns and adjectives
# how cool is df.query!?
# `@name` makes query read the Python list directly. Both filters now use the
# same membership test (`in`) instead of pasting each list's repr into the
# query string and mixing `in` with `==`.

df_dom_nouns = df_bow_dom_sorted.query('pos in @nouns')
df_dom_adj = df_bow_dom_sorted.query('pos in @adjectives')

# Building a reusable plot template

In [189]:
# Reusable Plotly template: dark-blue backgrounds, white Montserrat text, and
# default marker/line colours for scatter traces. It is registered under the
# name 'custom' so a figure can opt in with template='...+custom'.
custom = go.layout.Template()

# Figure-level defaults: spacing, colours and fonts.
custom.layout = go.Layout(
    autosize=True,
    margin={'t': 120, 'r': 50, 'b': 90, 'l': 100},
    plot_bgcolor="#213A78",
    paper_bgcolor="#213A78",
    xaxis={'title_standoff': 20, 'gridcolor': "#213A78"},
    yaxis={'title_standoff': 10, 'gridcolor': "#3B5CAB"},
    font={
        'family': 'Montserrat, proportional',
        'color': 'white',
        'size': 13,
    },
    title_font={'size': 22},
)

# Trace-level defaults, defined for scatter traces only.
custom.data.scatter = [
    go.Scatter(
        marker={'symbol': "circle", 'size': 8, 'color': "#3EFFE8"},
        line={'color': '#3EFFE8'},
    )
]

# Make the template available by name to later figures.
pio.templates['custom'] = custom

In [191]:
# Bar chart of the 20 most frequent nouns. The frame is already sorted by
# occurrences in descending order, so the first 20 rows are the top 20.
top_nouns = df_dom_nouns.head(20)
fig = go.Figure(data=go.Bar(x=top_nouns['token'], y=top_nouns['occurrences']))

# Titles and axis labels, styled with plotly_white plus the custom template.
fig.update_layout(
    title="Frequency Of Top 20 Nouns In Text",
    xaxis={'title_text': "Top 20 nouns"},
    yaxis={'title_text': "Frequency"},
    template='plotly_white+custom',
)

fig.show()

In [192]:
# Bar chart of the 20 most frequent adjectives. The frame is already sorted by
# occurrences in descending order, so the first 20 rows are the top 20.
top_adjectives = df_dom_adj.head(20)
fig = go.Figure(data=go.Bar(x=top_adjectives['token'], y=top_adjectives['occurrences']))

# Titles and axis labels, styled with plotly_white plus the custom template.
fig.update_layout(
    title="Frequency Of Top 20 Adjectives In Text",
    xaxis={'title_text': "Top 20 adjectives"},
    yaxis={'title_text': "Frequency"},
    template='plotly_white+custom',
)

fig.show()