In [None]:

from collections import Counter
import re

# load text from file


In [None]:
# Read the entire file into a single string.
# NOTE(review): no encoding is passed, so open() uses the platform's default
# text encoding -- consider passing encoding= explicitly once the file's
# encoding is confirmed.
with open("text.txt") as f:
    text = f.read()


In [None]:
# First approximation: split on runs of whitespace. Punctuation, digits and
# symbols stay attached to (or become) "words".
words = text.split()
# Peek at the first ten tokens.
print(words[:10])


In [None]:
# Tally how often each distinct token occurs, then print every tally
# in the order the tokens were first seen.
counts = Counter(words)
for word, count in counts.items():
    print(f"{word}: {count}")



In [None]:
# Issues/critiques of the first approximation.
# Only the lines that interpolate a value are f-strings; the others are
# plain string literals that print exactly the same text as before.
print("Not very reusable in current form")
print(f"Case matters: \"the\" occurs {counts['the']} times and \"The\" occurs {counts['The']} times")
print(f"Sometimes we seem to include punctuation:   \"use,\" occurs {counts['use,']} times and \"use\" occurs {counts['use']} times")
print("Some of our words are really numbers like 9 or 43")
print("Some of our words are symbols like /\\\\s{1}/.")


In [None]:
# Second approximation
# Make this a function for reusability
# Select our words more carefully: what is a word?
# Allow case sensitivity to be an option.
# Exclude things that aren't "words"




In [None]:
def tokens(text, case_sensitive=True, pattern=None):
    """
    Count occurrences of each word in a string and return a dictionary
    (a collections.Counter) with words as keys and counts as values.

    Parameters
    ----------
    text : the string to analyze
    case_sensitive : when True (the default) "The" and "the" are counted
        separately; when False the text is lower-cased before counting
    pattern : optional regular expression describing what a word is, e.g.
        r"[A-Za-z']+" to skip numbers and punctuation. When None (the
        default) the text is simply split on whitespace. If the pattern
        contains capture groups, re.findall returns the groups rather
        than the whole match.

    Returns
    -------
    A Counter mapping each word to the number of times it occurs.
    With the default arguments the result is the same as Counter(text.split()).
    """
    if not case_sensitive:
        text = text.lower()
    if pattern is None:
        words = text.split()
    else:
        words = re.findall(pattern, text)
    return Counter(words)
    

In [None]:
# Rebuild the word counts, this time through the tokens() function.
counts = tokens(text)


In [None]:
# Keys and Items of a dictionary


In [None]:
# The keys of the counts mapping are the distinct words.
print(counts.keys())

In [None]:
# The values are the occurrence counts; list() turns the view into a plain list.
print(list(counts.values()))

In [None]:
# items() pairs each word with its count as (word, count) tuples.
print(counts.items())

In [None]:
# NOTE: this rebinds `words` from the earlier list of all tokens to a view of
# the distinct words only.
words = counts.keys()


In [None]:
# sorted() returns a new list of the words in ascending string order.
print(sorted(words))

In [None]:
# Sorting the (word, count) tuples directly orders them by word.
sorted(counts.items())

In [None]:
# Sort the (word, count) pairs by count, smallest first.
sorted(counts.items(), key=lambda pair: pair[1])

In [None]:
# Sort the (word, count) pairs by count, largest first. The sort is stable,
# so words with equal counts keep their original relative order.
sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# Keep only the word from each pair, most frequent first.
[word for word, count in sorted(counts.items(), key=lambda pair: pair[1], reverse=True)]

In [None]:
# The ten most frequent words: rank every word, then keep the first ten.
[word for word, count in sorted(counts.items(), key=lambda pair: pair[1], reverse=True)][:10]

In [None]:
def most_common(text,N):
    """
    Return a list of the N most frequently occurring words in text,
    most frequent first.

    Words with equal counts appear in the order they were first seen.
    Fewer than N words are returned when the text has fewer distinct
    words; N <= 0 gives an empty list.
    """
    counts = tokens(text)
    # Counter.most_common does the ranking; max() keeps a negative N
    # meaning "no words", as before.
    return [word for word, _ in counts.most_common(max(N, 0))]

In [None]:
# Try it out: ask for the 20 most frequent words.
most_common(text,20)

In [None]:
def least_common(text,N):
    """
    Return a list of the N least frequently occurring words in text,
    rarest first.

    Words with equal counts appear in the order they were first seen.
    Fewer than N words are returned when the text has fewer distinct
    words; N <= 0 gives an empty list.
    """
    counts = tokens(text)
    # Sort on the count (item[1]). Sorting the bare (word, count) tuples would
    # order them alphabetically by word instead of by frequency.
    ranked = sorted(counts.items(), key=lambda item: item[1])
    return [word for word, _ in ranked[:max(N, 0)]]

In [None]:
# Try it out with N=10.
least_common(text,10)

In [None]:

class Splitter:
    """
    A simple string tokenizer.

    The text is split on whitespace when the object is created and the
    number of occurrences of each resulting word is stored.

    Attributes
    ----------
    text : the text being analyzed
    counts : a collections.Counter with keys the words in the text and values as the counts

    Methods
    ----------
    words() : a list of the distinct words occurring in the text
    n()     : the total number of distinct words in the text
    top(N)  : list of the N most frequently occurring words
    bottom(N) : list of the N least frequently occurring words
    """
    def __init__(self, text):
        """
        Initialize the object and store the counts dictionary
        """
        self.text = text
        self.counts = self.__tokens(text)

    @staticmethod
    def __tokens(text):
        """
        split the text and compute the counts (private method)
        """
        # A staticmethod because it needs no instance state; without the
        # decorator the call self.__tokens(text) would pass two arguments
        # to a one-parameter function.
        return Counter(text.split())

    def words(self):
        """
        return the distinct words occurring in the text, in the order
        they were first seen
        """
        return list(self.counts.keys())

    def n(self):
        """
        return the number of distinct words in the text
        """
        return len(self.counts)

    def top(self,N):
        """
        return a list of the N most common words, most common first
        (ties keep first-seen order; N <= 0 gives an empty list)
        """
        # Use this object's own counts, not a module-level variable.
        ranked = sorted(self.counts.items(), key=lambda item: item[1], reverse=True)
        return [word for word, _ in ranked[:max(N, 0)]]

    def bottom(self,N):
        """
        return a list of the N least common words, least common first
        (ties keep first-seen order; N <= 0 gives an empty list)
        """
        # Sort on the count; sorting the bare (word, count) tuples would
        # order them alphabetically by word.
        ranked = sorted(self.counts.items(), key=lambda item: item[1])
        return [word for word, _ in ranked[:max(N, 0)]]

  

In [None]:
# Splitter needs the text to analyze; calling it with no argument raises a
# TypeError. Catch and show the error so the notebook still runs top to bottom.
try:
    S = Splitter()
except TypeError as err:
    print(f"TypeError: {err}")

In [None]:
# The tokenizer is defined as __tokens, which Python name-mangles, so there is
# no public `tokens` attribute. Looked up on the class (no instance exists yet
# at this point) and caught so the notebook still runs top to bottom.
try:
    Splitter.tokens(text)
except AttributeError as err:
    print(f"AttributeError: {err}")

In [None]:
# Build a Splitter for the loaded text; the counts are computed in __init__.
S=Splitter(text)

In [None]:
# The original text is kept on the object (this displays the whole string).
S.text

In [None]:
# The word counts stored by __init__.
S.counts

In [None]:
# The keys of S.counts as a list.
S.words()

In [None]:
# Number of distinct words (the length of S.counts).
S.n()

In [None]:
# Call top() for 10 words.
S.top(10)

In [None]:
# Call bottom() for 20 words.
S.bottom(20)

In [None]:
# help() prints documentation assembled from the class and method docstrings.
help(Splitter)

In [None]:
# Show the help page for Splitter again (e.g. after editing its docstrings).
help(Splitter)

In [None]:
# List the attribute names of the class; the private tokenizer appears under
# its mangled name, _Splitter__tokens.
dir(Splitter)

In [None]:
# Show the help page for Splitter once more.
help(Splitter)

In [None]:
# help() works on plain functions too: it prints the signature and docstring.
help(tokens)