<h2>Tokenizing</h2>

In [1]:
# Sample sentences used throughout the tokenizing examples below.
# note: there are three spaces between "at" and "her" to make the example more
# realistic (texts are frequently plagued by such idiosyncrasies)
text = "She looked at   her father's arm-chair."

# French sample: apostrophes and hyphens join several words into one token
text_fr = "Qu'est-ce que c'est?"

# tokenize on spaces
# (splitting on a single ' ' treats every space as a separator, so the run of
# three spaces yields two empty strings in the result; punctuation stays
# attached to the neighbouring word)
text.split(' ')
# Out[3]: ['She', 'looked', 'at', '', '', 'her', "father's", 'arm-chair.']

['She', 'looked', 'at', '', '', 'her', "father's", 'arm-chair.']

In [2]:
text_fr.split(' ')
# Out[4]: ["Qu'est-ce", 'que', "c'est?"]

["Qu'est-ce", 'que', "c'est?"]

In [52]:
# scikit-learn
# note that CountVectorizer's default tokenizer discards "words" that contain
# only one character, such as "s"
# CountVectorizer also transforms all words into lowercase when it builds a
# document-term matrix, but that happens in a separate preprocessing step:
# build_tokenizer() returns only the token-splitting function, which is why
# the result below still contains capitalised tokens such as 'She'.
from sklearn.feature_extraction.text import CountVectorizer

CountVectorizer().build_tokenizer()(text)
# Out[6]: ['She', 'looked', 'at', 'her', 'father', 'arm', 'chair']

['She', 'looked', 'at', 'her', 'father', 'arm', 'chair']

In [53]:
CountVectorizer().build_tokenizer()(text_fr)
# Out[7]: ['Qu', 'est', 'ce', 'que', 'est']

['Qu', 'est', 'ce', 'que', 'est']

In [54]:
# nltk word_tokenize uses the TreebankWordTokenizer and needs to be given
# a single sentence at a time.
from nltk.tokenize import word_tokenize

word_tokenize(text)
# Out[9]: ['She', 'looked', 'at', 'her', 'father', "'s", 'arm-chair', '.']

['She', 'looked', 'at', 'her', 'father', "'s", 'arm-chair', '.']

<h2>Stemming</h2>

In [55]:
from nltk.stem.snowball import GermanStemmer

stemmer = GermanStemmer()

# note that the stem function works one word at a time
# words = ["Wald", "Walde", "Wälder", "Wäldern", "Waldes","Walds"]
words = ["Wald", "Walde", "Waldes","Walds"]

[stemmer.stem(w) for w in words]

[u'wald', u'wald', u'wald', u'wald']

In [56]:
# note that the stemming algorithm "understands" grammar to some extent and that if "Waldi" were to 
# appear in a text, it would not be stemmed.
stemmer.stem("Waldi")
# Out[23]: 'waldi'

u'waldi'

<h2>Chunking (Splitting)</h2>

In [57]:
import os

import numpy as np

# plays are in the directory data/data/french-tragedy (relative to the
# current working directory)
# NOTE(review): an earlier version of this comment said 'data/french-tragedy';
# the code below builds the path with 'data' twice -- confirm this matches the
# on-disk layout.
corpus_path = os.path.join('data', 'data', 'french-tragedy')

# look at the first few filenames
# (we are sorting because different operating systems may list files in different orders)
sorted(os.listdir(corpus_path))[0:5]
# Out[27]: 
# ['Crebillon_TR-V-1703-Idomenee.txt',
#  'Crebillon_TR-V-1707-Atree.txt',
#  'Crebillon_TR-V-1708-Electre.txt',
#  'Crebillon_TR-V-1711-Rhadamisthe.txt',
#  'Crebillon_TR-V-1717-Semiramis.txt']

['Crebillon_TR-V-1703-Idomenee.txt',
 'Crebillon_TR-V-1707-Atree.txt',
 'Crebillon_TR-V-1708-Electre.txt',
 'Crebillon_TR-V-1711-Rhadamisthe.txt',
 'Crebillon_TR-V-1717-Semiramis.txt']

In [58]:
# we will need the entire path, i.e. corpus_path joined with the filename
# (os.path.join(corpus_path, 'Crebillon_TR-V-1703-Idomenee.txt'))
# rather than just 'Crebillon_TR-V-1703-Idomenee.txt' alone.
tragedy_filenames = [os.path.join(corpus_path, fn) for fn in sorted(os.listdir(corpus_path))]

# alternatively, using the Python standard library package 'glob'
import glob

# glob.glob returns matches in arbitrary (OS-dependent) order, so sort the
# result; otherwise this assignment would replace the sorted list built above
# with an unsorted one.
tragedy_filenames = sorted(glob.glob(corpus_path + os.sep + '*.txt'))

In [59]:
def split_text(filename, n_words):
    """Split a text file into chunks approximately `n_words` words in length.

    The file is read in full and tokenized on single space characters only
    (``str.split(' ')``), so runs of spaces produce empty "words" and newlines
    are not treated as separators. Joining the chunks back together with a
    single space therefore reproduces the original text exactly.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    n_words : int
        Number of space-separated tokens per chunk. If it is smaller than 1
        the chunk size is never reached and the whole text is returned as a
        single chunk.

    Returns
    -------
    list of str
        The chunks in reading order. Every chunk holds exactly `n_words`
        tokens except possibly the last one, which holds the remainder.
        An empty file yields an empty list.
    """
    # `with` guarantees the file is closed even if reading raises, and avoids
    # shadowing the builtin `input`.
    with open(filename, 'r') as f:
        contents = f.read()

    # ''.split(' ') is [''], which would produce one bogus empty chunk.
    if not contents:
        return []

    words = contents.split(' ')
    chunks = []
    current_chunk_words = []
    for word in words:
        current_chunk_words.append(word)
        if len(current_chunk_words) == n_words:
            chunks.append(' '.join(current_chunk_words))
            current_chunk_words = []

    # Only emit the remainder when there is one; when the number of tokens is
    # an exact multiple of n_words there is nothing left over, and appending
    # unconditionally would add a spurious '' chunk.
    if current_chunk_words:
        chunks.append(' '.join(current_chunk_words))
    return chunks

In [60]:
tragedy_filenames = [os.path.join(corpus_path, fn) for fn in sorted(os.listdir(corpus_path))]

# alternatively, using glob
tragedy_filenames = glob.glob(corpus_path + os.sep + '*.txt')

# for consistency across platforms (Linux, OS X, Windows) we must sort the filenames
tragedy_filenames.sort()

# target number of (space-separated) words per chunk
chunk_length = 1000

# one dict per chunk with the keys 'text', 'number' and 'filename'
chunks = []

for filename in tragedy_filenames:
    # enumerate restarts the chunk numbering at 0 for every play.
    # The loop variable is deliberately not called `text`: that name holds the
    # sample sentence from the tokenizing section, and overwriting it here
    # would silently change the result of re-running those earlier cells.
    for chunk_counter, chunk_text in enumerate(split_text(filename, chunk_length)):
        chunks.append({'text': chunk_text, 'number': chunk_counter, 'filename': filename})
   
# we started with this many files ...
len(tragedy_filenames)
# Out[38]: 59

59

In [61]:
# for file in tragedy_filenames:
#     print(file)

In [62]:
# ... and now we have this many
len(chunks)
# Out[39]: 2740

2740

In [63]:
# from the triples (the 'text', 'number', 'filename' dicts in `chunks`) we can
# create a document-term matrix in which every chunk is one document
from sklearn.feature_extraction.text import CountVectorizer

# min_df=5: ignore terms that occur in fewer than 5 chunks (absolute count)
# max_df=.95: ignore terms that occur in more than 95% of the chunks (proportion)
vectorizer = CountVectorizer(min_df=5, max_df=.95)

# only the chunk text is passed to the vectorizer; the result is a sparse matrix
dtm = vectorizer.fit_transform([c['text'] for c in chunks])

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() instead -- confirm the target version.
vocab = np.array(vectorizer.get_feature_names())
print(vocab)

[u'abaisse' u'abaissement' u'abaisser' ..., u'\xf4tera' u'\xf4tez'
 u'\xf4t\xe9']


In [64]:
# write every chunk to its own file in output_dir
output_dir = '/tmp/'

# make sure the directory exists before writing into it
# (isdir + makedirs rather than makedirs(exist_ok=True) so this also runs on
# Python 2, which the rest of this notebook targets)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

for chunk in chunks:
    basename = os.path.basename(chunk['filename'])
    # the zero-padded chunk number is appended to the complete original
    # filename, e.g. 'Crebillon_TR-V-1703-Idomenee.txt0000'
    fn = os.path.join(output_dir,
                      "{}{:04d}".format(basename, chunk['number']))
    with open(fn, 'w') as f:
        f.write(chunk['text'])

<h2>Grouping</h2>

In [65]:
# in every filename the author's last name is followed by an underscore ('_'),
# for example: Voltaire_TR-V-1764-Olympie.txt
# os.path.basename(...) gets us the filename from a path, e.g.,
os.path.basename('french-tragedy/Voltaire_TR-V-1764-Olympie.txt')
# Out[54]: 'Voltaire_TR-V-1764-Olympie.txt'

'Voltaire_TR-V-1764-Olympie.txt'

In [66]:
# using the split method we can break up the string on the underscore ('_')
os.path.basename('french-tragedy/Voltaire_TR-V-1764-Olympie.txt').split('_')
# Out[55]: ['Voltaire', 'TR-V-1764-Olympie.txt']

['Voltaire', 'TR-V-1764-Olympie.txt']

In [67]:
# putting these two steps together
author = os.path.basename('french-tragedy/Voltaire_TR-V-1764-Olympie.txt').split('_')[0]

author
# Out[57]: 'Voltaire'

'Voltaire'

In [68]:
# and for all the authors
authors = [os.path.basename(filename).split('_')[0] for filename in tragedy_filenames]

authors
# Out[59]: 

['Crebillon',
 'Crebillon',
 'Crebillon',
 'Crebillon',
 'Crebillon',
 'Crebillon',
 'Crebillon',
 'Crebillon',
 'Crebillon',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'PCorneille',
 'Racine',
 'Racine',
 'Racine',
 'Racine',
 'Racine',
 'Racine',
 'Racine',
 'Racine',
 'Racine',
 'Racine',
 'Racine',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire',
 'Voltaire']

In [69]:
# to ignore duplicates we can transform the list into a set (which only records unique elements)
set(authors)
# Out[60]: {'Crebillon', 'PCorneille', 'Racine', 'Voltaire'}

{'Crebillon', 'PCorneille', 'Racine', 'Voltaire'}

In [70]:
# as there is no guarantee about the ordering in a set (or a dictionary) we will typically
# first drop duplicates and then save our unique names as a sorted list. Because there are
# no duplicates in this list, we can be confident that the ordering is the same every time.
sorted(set(authors))
# Out[61]: ['Crebillon', 'PCorneille', 'Racine', 'Voltaire']

['Crebillon', 'PCorneille', 'Racine', 'Voltaire']

In [71]:
# and we have a way of finding which indexes in authors correspond to each author using array indexing
authors = np.array(authors)  # convert from a Python list to a NumPy array

first_author = sorted(set(authors))[0]

first_author
# Out[64]: 'Crebillon'

'Crebillon'

In [72]:
authors == first_author
# Out[65]: 

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False], dtype=bool)

In [73]:
np.nonzero(authors == first_author)  # if we want the actual indexes
# Out[66]: (array([0, 1, 2, 3, 4, 5, 6, 7, 8]),)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),)

In [74]:
authors[np.nonzero(authors == first_author)]
# Out[67]: 


array(['Crebillon', 'Crebillon', 'Crebillon', 'Crebillon', 'Crebillon',
       'Crebillon', 'Crebillon', 'Crebillon', 'Crebillon'], 
      dtype='|S10')

In [75]:
# alternatively, a boolean mask selects the entries for texts *not* written by `first_author`
authors[authors != first_author]
# Out[68]: 

array(['PCorneille', 'PCorneille', 'PCorneille', 'PCorneille',
       'PCorneille', 'PCorneille', 'PCorneille', 'PCorneille',
       'PCorneille', 'PCorneille', 'PCorneille', 'PCorneille',
       'PCorneille', 'PCorneille', 'PCorneille', 'PCorneille',
       'PCorneille', 'PCorneille', 'PCorneille', 'PCorneille', 'Racine',
       'Racine', 'Racine', 'Racine', 'Racine', 'Racine', 'Racine',
       'Racine', 'Racine', 'Racine', 'Racine', 'Voltaire', 'Voltaire',
       'Voltaire', 'Voltaire', 'Voltaire', 'Voltaire', 'Voltaire',
       'Voltaire', 'Voltaire', 'Voltaire', 'Voltaire', 'Voltaire',
       'Voltaire', 'Voltaire', 'Voltaire', 'Voltaire', 'Voltaire',
       'Voltaire', 'Voltaire'], 
      dtype='|S10')

In [147]:
# first get a document-term-matrix of word frequencies for our corpus
vectorizer = CountVectorizer(input='filename')

# fit_transform returns a sparse matrix (which uses less memory)
# but we want to work with a normal numpy array.
dtm = vectorizer.fit_transform(tragedy_filenames).toarray()
        
vocab = np.array(vectorizer.get_feature_names())

# TODO: print the top 10 highest ranked words (not implemented yet)

In [157]:
# import os
# import nltk
# import numpy as np
# from sklearn.feature_extraction.text import CountVectorizer


# filenames = ['Austen_Emma.txt', 'Austen_Pride.txt', 'Austen_Sense.txt', 'CBronte_Jane.txt', 'CBronte_Professor.txt',
#  'CBronte_Villette.txt']

# filenames_with_path = [os.path.join(corpus_path, fn) for fn in filenames]

# # these texts have underscores ('_') that indicate italics; remove them.
# raw_texts = []
# for fn in filenames_with_path:
#     with open(fn) as f:
#         text = f.read()
#         text = text.replace('_', '')  # remove underscores (italics)
#         raw_texts.append(text)

# vectorizer = CountVectorizer(input='content')
# dtm = vectorizer.fit_transform(raw_texts).toarray()

# vocab = np.array(vectorizer.get_feature_names())


# # normalize counts to rates per 1000 words
# rates = 1000 * dtm / np.sum(dtm, axis=1, keepdims=True)
# # print the top 10 words along with their rates and the difference
# vocab[ranking][0:10]

# # indices so we can refer to the rows for the relevant author
# austen_indices, cbronte_indices = [], []

# for index, fn in enumerate(filenames):
#     if "Austen" in fn:
#         austen_indices.append(index)
#     elif "CBronte" in fn:
#         cbronte_indices.append(index)

# # this kind of slicing should be familiar if you've used R or Octave/Matlab
# austen_rates = rates[austen_indices, :]

# cbronte_rates = rates[cbronte_indices, :]

# # np.mean(..., axis=0) calculates the column-wise mean
# austen_rates_avg = np.mean(austen_rates, axis=0)

# cbronte_rates_avg = np.mean(cbronte_rates, axis=0)

# # calculate absolute value because we only care about the magnitude of the difference
# keyness = np.abs(austen_rates_avg - cbronte_rates_avg)

# ranking = np.argsort(keyness)[::-1]  # from highest to lowest; [::-1] reverses order in Python sequences

# # print the top 10 words along with their rates and the difference
# vocab[ranking][0:10]
    

In [77]:
# author label for every file: the part of the basename before the first '_'
# (a NumPy array so that `authors == author` below yields a boolean mask)
authors = np.array([os.path.basename(filename).split('_')[0] for filename in tragedy_filenames])

# unique author names in sorted order; row i of dtm_authors belongs to authors_unique[i]
authors_unique = sorted(set(authors))

# allocate a zero-filled (float) array to store our aggregated word frequencies:
# one row per author, one column per vocabulary entry
dtm_authors = np.zeros((len(authors_unique), len(vocab)))

# sum the rows of dtm that belong to each author (axis=0 sums column-wise)
# assumes dtm has one row per entry of tragedy_filenames, in the same order -- TODO confirm
for i, author in enumerate(authors_unique):
     dtm_authors[i, :] = np.sum(dtm[authors==author, :], axis=0)
   

In [79]:
dtm[0:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 0, 0, 0]])

<h2>Grouping using Pandas</h2>

In [144]:
import pandas

authors = [os.path.basename(filename).split('_')[0] for filename in tragedy_filenames]

dtm_authors = pandas.DataFrame(dtm).groupby(authors).sum()

dtm_authors.head()
# dtm_authors = dtm_authors.values

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16597,16598,16599,16600,16601,16602,16603,16604,16605,16606
Crebillon,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,2,0,2,0
PCorneille,1,1,2,2,1,1,1,1,1,1,...,1,2,0,1,1,2,16,2,10,4
Racine,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,2,0,1,0
Voltaire,0,0,0,0,0,0,0,0,0,0,...,2,0,1,1,0,1,5,1,1,0


In [145]:
# row labels of the grouped frame; despite the variable name these are the
# author names used as group keys
projectNames = dtm_authors.index.values
# from here on dtm_authors is the underlying NumPy array, no longer a DataFrame
# (re-running this cell without re-running the groupby cell will therefore fail)
dtm_authors = dtm_authors.values
dtm_authors[:10]

array([[ 0,  0,  0, ...,  0,  2,  0],
       [ 1,  1,  2, ...,  2, 10,  4],
       [ 0,  0,  0, ...,  0,  1,  0],
       [ 0,  0,  0, ...,  1,  1,  0]])

In [146]:
projectNames

array(['Crebillon', 'PCorneille', 'Racine', 'Voltaire'], dtype=object)