In [2]:
# Ignore the "twython library missing" warning; we aren't using its functionality.
# Requires the Opinion Lexicon and VADER Lexicon: fetch both with nltk.download().
from basic_nlp import *



## Tokenize and Tag

In [4]:
# Sample passage used by every cell below.
text = """I had my suspicions in early 2003. Who wouldn't? That is why when my wife, Stacy, out on her daily walk yesterday, I texted my Stacy that I was going shopping and I would be back soon."""

# Build a regex tokenizer: a run of letters/digits/'&', optionally followed by
# an apostrophe and the letters after it. The trailing class uses `*` rather
# than `?` so contractions with more than one letter after the apostrophe
# ("they're", "I'll") stay in one token instead of being split.
# The earlier `tokens = tokenize(text)` call was removed: its result was
# overwritten on the very next assignment and never used.
tok = tokenizeFindAllRegex(r"""([A-Za-z0-9&]+[']?[A-Za-z]*)""")
tokens = tok(text)

# POS-tag the tokens; `tagged` feeds the lemmatize / POS-gram / chunk cells.
tagged = pos(tokens)

# Display the token list.
tokens

['I',
 'had',
 'my',
 'suspicions',
 'in',
 'early',
 '2003',
 'Who',
 "wouldn't",
 'That',
 'is',
 'why',
 'when',
 'my',
 'wife',
 'Stacy',
 'out',
 'on',
 'her',
 'daily',
 'walk',
 'yesterday',
 'I',
 'texted',
 'my',
 'Stacy',
 'that',
 'I',
 'was',
 'going',
 'shopping',
 'and',
 'I',
 'would',
 'be',
 'back',
 'soon']

## Lemmatize

In [5]:
# Lemmatize the POS-tagged tokens and display the resulting list
# (in the recorded run 'had' -> 'have', 'suspicions' -> 'suspicion').
lemmatized_tokens = lemmatize(tagged)
lemmatized_tokens

['I',
 'have',
 'my',
 'suspicion',
 'in',
 'early',
 '2003',
 'Who',
 "wouldn't",
 'That',
 'be',
 'why',
 'when',
 'my',
 'wife',
 'Stacy',
 'out',
 'on',
 'her',
 'daily',
 'walk',
 'yesterday',
 'I',
 'texted',
 'my',
 'Stacy',
 'that',
 'I',
 'be',
 'go',
 'shopping',
 'and',
 'I',
 'would',
 'be',
 'back',
 'soon']

### Process Suffixes

In [6]:
# Extract the suffixes of the tokens once, count them, and display the counts
# as a plain dict. The earlier bare `tokenSuffixes(tokens)` statement was
# dropped: it was not the last expression of the cell, so its result was
# neither displayed nor stored, and the same work was repeated right after.
suffix_counts = freq(tokenSuffixes(tokens))
dict(suffix_counts)

{'cy': 2,
 'ed': 1,
 'en': 1,
 'er': 1,
 'ily': 1,
 'ing': 2,
 'ly': 1,
 's': 3,
 'y': 5}

## Create N-Grams

In [59]:
# Count every 1-gram (each key is a one-element tuple) and show the counts
# as a plain dict.
unigram_counts = freq(grams(tokens, 1))
dict(unigram_counts)

{('2003',): 1,
 ('I',): 4,
 ('Stacy',): 2,
 ('That',): 1,
 ('Who',): 1,
 ('and',): 1,
 ('back',): 1,
 ('be',): 1,
 ('daily',): 1,
 ('early',): 1,
 ('going',): 1,
 ('had',): 1,
 ('her',): 1,
 ('in',): 1,
 ('is',): 1,
 ('my',): 3,
 ('on',): 1,
 ('out',): 1,
 ('shopping',): 1,
 ('soon',): 1,
 ('suspicions',): 1,
 ('texted',): 1,
 ('that',): 1,
 ('walk',): 1,
 ('was',): 1,
 ('when',): 1,
 ('why',): 1,
 ('wife',): 1,
 ('would',): 1,
 ("wouldn't",): 1,
 ('yesterday',): 1}


## POS Grams

In [60]:
# Derive the tag-only sequence from the tagged tokens (the recorded trigrams
# contain POS tags only, e.g. ('PRP', 'VBD', 'PRP$')).
posOnly = posTagOnly(tagged)

# Frequency distribution over POS-tag trigrams.
pos_trigrams = grams(posOnly, 3)
freq(pos_trigrams)

FreqDist({('CC', 'PRP', 'MD'): 1,
          ('CD', 'WP', 'VBP'): 1,
          ('DT', 'VBZ', 'WRB'): 1,
          ('IN', 'JJ', 'CD'): 1,
          ('IN', 'PRP', 'VBD'): 1,
          ('IN', 'PRP$', 'JJ'): 1,
          ('JJ', 'CD', 'WP'): 1,
          ('JJ', 'NN', 'NN'): 1,
          ('MD', 'VB', 'RB'): 1,
          ('NN', 'CC', 'PRP'): 1,
          ('NN', 'IN', 'PRP'): 1,
          ('NN', 'NN', 'PRP'): 1,
          ('NN', 'NNP', 'RP'): 1,
          ('NN', 'PRP', 'VBD'): 1,
          ('NNP', 'RP', 'IN'): 1,
          ('NNS', 'IN', 'JJ'): 1,
          ('PRP', 'MD', 'VB'): 1,
          ('PRP', 'VBD', 'PRP$'): 2,
          ('PRP', 'VBD', 'VBG'): 1,
          ('PRP$', 'JJ', 'NN'): 1,
          ('PRP$', 'NN', 'IN'): 1,
          ('PRP$', 'NN', 'NNP'): 1,
          ('PRP$', 'NNS', 'IN'): 1,
          ('RP', 'IN', 'PRP$'): 1,
          ('VB', 'RB', 'RB'): 1,
          ('VBD', 'PRP$', 'NN'): 1,
          ('VBD', 'PRP$', 'NNS'): 1,
          ('VBD', 'VBG', 'NN'): 1,
          ('VBG', 'NN', 'CC'): 

## Syllable grams

In [7]:
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'syllableGrams' is not defined", so no syllable-gram output
# exists. Presumably the helper is missing from (or not exported by)
# basic_nlp — TODO confirm, then restore the helper or drop this cell.
syllableGrams(tokens, 3)

NameError: name 'syllableGrams' is not defined

## Vowelless grams

In [62]:
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'vowelGrams' is not defined", so no vowelless-gram output
# exists. Presumably the helper is missing from (or not exported by)
# basic_nlp — TODO confirm, then restore the helper or drop this cell.
vowelGrams(tokens, 3)

NameError: name 'vowelGrams' is not defined

## Uppercase/Lowercase

In [63]:
# Per-token case statistics consumed by capLetterFreq() and wordCases() in the
# following cells. Presumably upper/lower counts and length per token, judging
# by the helper's name — TODO confirm against basic_nlp.
ull = upperLowerLen(tokens)

In [64]:
# Capital-letter frequency computed from the case statistics in `ull`
# (about 0.062 for the sample text in the recorded run).
capital_letter_ratio = capLetterFreq(ull)
capital_letter_ratio

0.06201550387596899

In [65]:
# Map the case statistics to one case label per token; the recorded trigram
# output uses the labels 'AC', 'FC' and 'NC' (presumably all-caps, first-cap
# and no-caps — TODO confirm against basic_nlp).
cases = wordCases(ull)

In [66]:
# Frequency distribution over trigrams of the per-token case labels.
case_trigrams = grams(cases, 3)
freq(case_trigrams)

FreqDist({('AC', 'NC', 'NC'): 4,
          ('FC', 'NC', 'AC'): 1,
          ('FC', 'NC', 'FC'): 1,
          ('FC', 'NC', 'NC'): 2,
          ('NC', 'AC', 'NC'): 3,
          ('NC', 'FC', 'NC'): 4,
          ('NC', 'NC', 'AC'): 2,
          ('NC', 'NC', 'FC'): 3,
          ('NC', 'NC', 'NC'): 15})

In [15]:
# Sentiment of each half of the text. Previously these two results were
# computed and silently discarded (only the last expression of a cell is
# displayed); they are now kept in named variables so they can be inspected.
# The midpoint is computed once with integer floor division instead of
# int(len(tokens) / 2) evaluated twice through float division.
midpoint = len(tokens) // 2
first_half_sentiment = sentimentGrams([tokens[:midpoint]])
second_half_sentiment = sentimentGrams([tokens[midpoint:]])

# Sentiment of every token trigram; this is the value the cell displays.
sentimentGrams(ngrams(tokens, 3))

[{'LiuHu': {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0},
  'Vader': {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}},
 {'LiuHu': {'compound': -0.333, 'neg': 0.3333, 'neu': 0.6667, 'pos': 0.0},
  'Vader': {'compound': -0.3612, 'neg': 0.556, 'neu': 0.444, 'pos': 0.0}},
 {'LiuHu': {'compound': -0.333, 'neg': 0.3333, 'neu': 0.6667, 'pos': 0.0},
  'Vader': {'compound': -0.3612, 'neg': 0.556, 'neu': 0.444, 'pos': 0.0}},
 {'LiuHu': {'compound': -0.333, 'neg': 0.3333, 'neu': 0.6667, 'pos': 0.0},
  'Vader': {'compound': -0.3612, 'neg': 0.556, 'neu': 0.444, 'pos': 0.0}},
 {'LiuHu': {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0},
  'Vader': {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}},
 {'LiuHu': {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0},
  'Vader': {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}},
 {'LiuHu': {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0},
  'Vader': {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}},
 {'LiuHu': {'compo

In [68]:
# Score the whole token list as a single gram; the recorded output holds one
# entry with 'LiuHu' and 'Vader' score dicts.
whole_text_as_single_gram = [tokens]
sentimentGrams(whole_text_as_single_gram)

[{'LiuHu': {'compound': -0.027, 'neg': 0.027, 'neu': 0.973, 'pos': 0.0},
  'Vader': {'compound': -0.3612, 'neg': 0.072, 'neu': 0.928, 'pos': 0.0}}]

## Chunk and Entity Removal

In [69]:
# Chunk the POS-tagged tokens; `chunked` is reused by the next cell.
chunked = chunk(tagged)

# Entity removal with number masking: in the recorded run '2003' becomes
# ('NumTok', '[CD]').
# NOTE(review): the recorded output still contains 'Stacy' — confirm whether
# that is the intended behavior of removeNamedEntities.
entities_removed_numbers_masked = removeNamedEntities(chunked, removeNumbers=True)
entities_removed_numbers_masked

[('I', 'PRP'),
 ('had', 'VBD'),
 ('my', 'PRP$'),
 ('suspicions', 'NNS'),
 ('in', 'IN'),
 ('early', 'JJ'),
 ('NumTok', '[CD]'),
 ('Who', 'WP'),
 ("wouldn't", 'VBP'),
 ('That', 'DT'),
 ('is', 'VBZ'),
 ('why', 'WRB'),
 ('when', 'WRB'),
 ('my', 'PRP$'),
 ('wife', 'NN'),
 ('Stacy', 'NNP'),
 ('out', 'RP'),
 ('on', 'IN'),
 ('her', 'PRP$'),
 ('daily', 'JJ'),
 ('walk', 'NN'),
 ('yesterday', 'NN'),
 ('I', 'PRP'),
 ('texted', 'VBD'),
 ('my', 'PRP$'),
 ('Stacy', 'NN'),
 ('that', 'IN'),
 ('I', 'PRP'),
 ('was', 'VBD'),
 ('going', 'VBG'),
 ('shopping', 'NN'),
 ('and', 'CC'),
 ('I', 'PRP'),
 ('would', 'MD'),
 ('be', 'VB'),
 ('back', 'RB'),
 ('soon', 'RB')]

In [70]:
# Same entity removal, but numbers are left as they are: the recorded output
# keeps ('2003', 'CD').
entities_removed_numbers_kept = removeNamedEntities(chunked, removeNumbers=False)
entities_removed_numbers_kept

[('I', 'PRP'),
 ('had', 'VBD'),
 ('my', 'PRP$'),
 ('suspicions', 'NNS'),
 ('in', 'IN'),
 ('early', 'JJ'),
 ('2003', 'CD'),
 ('Who', 'WP'),
 ("wouldn't", 'VBP'),
 ('That', 'DT'),
 ('is', 'VBZ'),
 ('why', 'WRB'),
 ('when', 'WRB'),
 ('my', 'PRP$'),
 ('wife', 'NN'),
 ('Stacy', 'NNP'),
 ('out', 'RP'),
 ('on', 'IN'),
 ('her', 'PRP$'),
 ('daily', 'JJ'),
 ('walk', 'NN'),
 ('yesterday', 'NN'),
 ('I', 'PRP'),
 ('texted', 'VBD'),
 ('my', 'PRP$'),
 ('Stacy', 'NN'),
 ('that', 'IN'),
 ('I', 'PRP'),
 ('was', 'VBD'),
 ('going', 'VBG'),
 ('shopping', 'NN'),
 ('and', 'CC'),
 ('I', 'PRP'),
 ('would', 'MD'),
 ('be', 'VB'),
 ('back', 'RB'),
 ('soon', 'RB')]

## Punctuation features

In [71]:
# Punctuation statistics for the raw text (not the tokens). The recorded
# output maps each punctuation mark to a 3-tuple whose first element is its
# count and whose last element matches its share of all punctuation marks
# (e.g. 1/7 = 0.1429); the middle value is presumably the count relative to
# text length — TODO confirm against basic_nlp.
punctuation_stats = punctuationFeatures(text)
punctuation_stats

{"'": (1, 0.0054, 0.1429),
 ',': (3, 0.0163, 0.4286),
 '.': (2, 0.0109, 0.2857),
 '?': (1, 0.0054, 0.1429)}