# Exploratory book analysis

#### An important step when working with the NLTK package is making sure that all the modules, algorithms, corpora, lists are available when running the program/s.

In [3]:
#The following commands import the modules, algorithms, corpora, lists that will be needed for exploratory book analysis
import nltk #imports the nltk module
nltk.download("words") # downloads English words
nltk.download("stopwords") # downloads English stopwords 
from nltk.text import Text #imports the Text module that enables the exploratory search of the text
from nltk import bigrams # imports the bigrams module
from nltk.collocations import * # imports the collocations methods
import string # imports string module that allows running processs on strings
from string import punctuation # imports punctuation symbols
from urllib2 # imports module for fetching and processing internet resources
import os #this module allows interfacing with the underlying operating system
import re #imports regular expression module 
from collections import OrderedDict #imports collections module, an alternative to list, dict, set, tuple
from nltk.draw import dispersion_plot
from IPython.display import display # a module that wraps the object to be displayed


[nltk_data] Downloading package words to /home/harish/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /home/harish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ImportError: No module named 'urllib2'

#### Opening up and reading the text of a book available on the Project Gutenberg web site

In [None]:
url = "https://www.gutenberg.org/files/140/140-0.txt" #assigning a link to a variable
data = urllib2.urlopen(url.encode('utf-8')).read() #opening up the link/file
data = data.replace("\r\n", " ")
data = unicode(data, errors="ignore") #ignores encoding errors
data[:10000]
#Alternatives:
# https://www.gutenberg.org/files/140/140-0.txt The Jungle by Upton Sinclair
#https://www.gutenberg.org/files/543/543-0.txt # Main Street by Sinclair Lewis
#https://www.gutenberg.org/files/1342/1342-0.txt Pride and Prejudice by Jane Austin

#### Additional processing of the file, removing text metadata 

In [None]:
data = re.sub('^(.*?)START OF THIS PROJECT GUTENBERG EBOOK', '', data) # remove anything from the start of the string to the first occurence of Chapter 1

In [None]:
data [:2000]

In [None]:
len(data) # how long the entire string/file is (how many characters)?

In [None]:
data[750000:] # checking the end of the file

#### Removing the Project Gutenberg declaration at the end of the book

In [None]:
data = re.sub('End of the Project Gutenberg(.*?)$', '', data) #removing the Project Gutenberg declaration at the end of the file

#### Now that the information that we don't consider to be the content of the book has been removed, we proceed with tokenizing the book into words

In [None]:
words_string = nltk.word_tokenize(data) # the entire text file is tokenized into individual words by using the nltk word_tokenize function

In [None]:
len(words_string) #checking the lenght of the word list created, how many words in the text?

In [None]:
words_string[200:500] #checking the word list 

#### Removing stopwords from the text 

In [None]:
stopwords = set(nltk.corpus.stopwords.words('english')) # defines stopwords list to use (part of NLTK)

In [None]:
stopwords

#### Extending the stopwords list

In [None]:
morewords =['--'] # adding more stopwords to the list of stopwords (this list can be further customized)
stopwords.update(morewords)

#### Removing words that are less than a character long

In [None]:
words_string = [word for word in words_string if len(word) > 1] # removes words that are less than a character long

#### Lowercasing all the words

In [None]:
words_string = [word.lower() for word in words_string] # lowercases all the words in the file

#### Removing stopwords from text

In [None]:
words_string = [word for word in words_string if word not in stopwords] # removes stopwords from the file

In [None]:
len(words_string) # notice a sharp decrease of the number of words after stopwords have been removed

#### Book explorations: concordances and collocations

In [None]:
textList = Text(words_string) # creates a Text object based on the list of words from the text; this text object allows exploratory analysis of the text

In [None]:
type(textList) 

In [None]:
textList[6050:8500] 

In [None]:
textList.concordance('chicago') # the text object allows us to examine concordances of different words in the text

In [None]:
textList.similar('chicago', 20) # what are the words that appear in similar contexts to the word 'jurgis'? 

In [None]:
textIndex = nltk.text.ContextIndex(textList) # builds a textIndex object based on the textList

In [None]:
textIndex.word_similarity_dict('chicago')['packingtown'] #how similar are 'carriage' and 'elizabeth'?

In [None]:
textIndex.common_contexts(['chicago', 'packingtown']) # what are the common contexts for the words?

In [None]:
textList.dispersion_plot(['chicago', 'packingtown']) #outputs a dispersion plot for words 'chicago' and 'packingtown'

#### Establishing collocations in the text

In [None]:
textList.collocations() # obtains collocations ("the habitual juxtaposition of a particular word with another word or words with a frequency greater than chance"; "a conventional way of saying things")

#### Exploring bigrams (a pair of consecutive written units, in our case words)

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures() # obtains bigram association measures

In [None]:
finder = BigramCollocationFinder.from_words(words_string, 5) # finds bigrams in the text
finder.apply_freq_filter(5) #removes bigrams that appear less than 5 times
bigrams = finder.nbest(bigram_measures.likelihood_ratio, 1000) #obtain the bigrams with the highest likelihood ratio
print '\n'.join('%s %s' % v for v in bigrams) #printing them out

#### Determine frequency distribution of the words 

In [None]:
fdist = nltk.FreqDist(words_string)

In [None]:
for word, frequency in fdist.most_common(100): #finds most common words in the text
    display ("%s %d" % (str(word), frequency)) 

#### Cumulative distribution of 50 most frequently used words

In [None]:
fdist.plot(50, cumulative=False)

#### Determining the window size 

In [None]:
# If we are interested in examining the surrounding context of a particular word in the text, we need to decide on the parameters of that context.
# If, for example, we are interested in three words that precede a particular word and three words after then we are interested in a 7-word window size.
# This window size can be established as follows:
ngrams = [words_string[i:i+7] for i in range(len(words_string)-6)]# http://digitalhistoryhacks.blogspot.com/2006/08/easy-pieces-in-python-keyword-in.html
    
kwicdict = {}           # opens up a kwicdict        
for n in ngrams: #indexes the fourth word in a sliding window of 7 (three words on each side) 
    if n[3] not in kwicdict: 
        kwicdict[n[3]] = [n]
    else:
        kwicdict[n[3]].append(n)

#### Keyword in context

In [None]:
for n in kwicdict['chicago']: # nicely formatted output for the word we are interested in determining the context for
    outstring = ' '.join(n[:3]).rjust(30) #output as a string and right justify first three words in the sliding window
    outstring += str(n[3]).center(len(n[3])+6) # output as a string and center fourth word in the sliding window
    outstring += ' '.join(n[4:]) # output as a string last three words in the sliding window of 7 consecutive words in the text
    print outstring