# N Grams and Tokenization in Python

### Approaching Tokenization

In [1]:
#import nltk
#nltk.download('popular', halt_on_error=False)

In [2]:
import nltk as nltk
import pandas as pd
import re
from collections import Counter, defaultdict

In [3]:
directory = 'C://Users//Nick//Documents//Teaching//Data Projects//Text//Books//'

book = 'Book_2.txt'
#book_short = '3boat10_short.txt'
#book_out = '3boat10_out.txt'

### Using Re

In [4]:
# Find the N most common words in book
top_N = 10

words = re.findall(r'\w+', open(directory+book, encoding="utf8").read().lower())
# \w -- matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_].
# + pattern must appear at least once. 

word_freq = Counter(words).most_common(top_N)
word_freq

[('the', 3793),
 ('and', 3125),
 ('a', 1897),
 ('to', 1726),
 ('of', 1466),
 ('it', 1317),
 ('he', 1249),
 ('was', 1166),
 ('that', 1026),
 ('i', 1015)]

In [5]:
word_freq = Counter(words).most_common()

word_freq_df = pd.DataFrame(word_freq,
                    columns=['Word', 'Frequency']).set_index('Word')

word_freq_df.sort_values('Frequency', ascending=False, inplace=True)

word_freq_df.head(n=10)

Unnamed: 0_level_0,Frequency
Word,Unnamed: 1_level_1
the,3793
and,3125
a,1897
to,1726
of,1466
it,1317
he,1249
was,1166
that,1026
i,1015


In [6]:
word_freq_df.shape

(7426, 1)

### Using NLTK

In [7]:
f = open(directory+book, encoding="utf8")
raw = f.read()

words = nltk.tokenize.word_tokenize(raw)
fdist = nltk.FreqDist(words)

print(fdist)

#fdist.items() - will give all words
fdist.most_common(10)

<FreqDist with 8695 samples and 90506 outcomes>


[(',', 4938),
 ('the', 3362),
 ('.', 3162),
 ('and', 2967),
 ('’', 2284),
 ('a', 1770),
 ('to', 1716),
 ('“', 1534),
 ('”', 1527),
 ('of', 1455)]

### NLTK also has embedded RegexpTokenizer

In [8]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize("Flu season hitting earlier, with dozens more outbreaks — and more severe symptoms")

['Flu',
 'season',
 'hitting',
 'earlier',
 'with',
 'dozens',
 'more',
 'outbreaks',
 'and',
 'more',
 'severe',
 'symptoms']

In [9]:
fdist_df = pd.DataFrame(fdist.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,",",4938
1,the,3362
2,.,3162
3,and,2967
4,’,2284
5,a,1770
6,to,1716
7,“,1534
8,”,1527
9,of,1455


In [10]:
fdist_df.shape

(8695, 2)

### Cleaning-up tokenization

In [11]:
#from nltk.corpus import stopwords

#default_stopwords = set(nltk.corpus.stopwords.words('english'))

words = nltk.tokenize.word_tokenize(raw)

#stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

# Remove single-character tokens (mostly punctuation)
words = [word for word in words if len(word) > 1]

# Remove numbers
#words = [word for word in words if not word.isnumeric()]

# Remove punctuation
words = [word for word in words if word.isalpha()]

# Lowercase all words (default_stopwords are lowercase too)
words = [word.lower() for word in words]

# Remove stopwords
words = [word for word in words if word not in stopwords]

fdist = nltk.FreqDist(words)

print(fdist)

#fdist.items() - will give all words
fdist.most_common(10)

<FreqDist with 6951 samples and 33521 outcomes>


[('tom', 802),
 ('said', 355),
 ('huck', 249),
 ('would', 237),
 ('one', 187),
 ('time', 182),
 ('could', 176),
 ('got', 172),
 ('well', 170),
 ('joe', 168)]

In [12]:
fdist_df = pd.DataFrame(fdist.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,tom,802
1,said,355
2,huck,249
3,would,237
4,one,187
5,time,182
6,could,176
7,got,172
8,well,170
9,joe,168


# N Grams
### Basic N-Gramming

In [13]:
sentence = 'quick brown fox jumps over the lazy dog'
n = 3
kgrams = nltk.ngrams(sentence.split(), n)
for grams in kgrams:
  print (grams)

('quick', 'brown', 'fox')
('brown', 'fox', 'jumps')
('fox', 'jumps', 'over')
('jumps', 'over', 'the')
('over', 'the', 'lazy')
('the', 'lazy', 'dog')


In [14]:
tokens = nltk.word_tokenize(raw)

#Create your bigrams or trigrams
bgs = nltk.bigrams(tokens)
tgs = nltk.trigrams(tokens)

#compute frequency distribution for all the bigrams in the text
fdist_2 = nltk.FreqDist(bgs)
fdist_3 = nltk.FreqDist(tgs)

#for k,v in fdist.items():
#    print (k,v)

In [15]:
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

print(fdist_df.shape)

fdist_df.head(n=10)

(42154, 2)


Unnamed: 0,Word,Frequency
0,"(,, and)",1135
1,"(’, s)",859
2,"(”, “)",819
3,"(’, t)",640
4,"(of, the)",357
5,"(?, ”)",305
6,"(in, the)",295
7,"(., He)",287
8,"(., The)",286
9,"(!, ”)",259


In [16]:
fdist_df.iloc[10000:10005]

Unnamed: 0,Word,Frequency
10000,"(nudges, and)",1
10001,"(collect, his)",1
10002,"(,, complaining)",1
10003,"(hear, them)",1
10004,"(the, teaspoon)",1


In [17]:
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

print(fdist_df.shape)

fdist_df.head(n=10)

(71609, 2)


Unnamed: 0,Word,Frequency
0,"(?, ”, “)",229
1,"(don, ’, t)",186
2,"(said, :, “)",123
3,"(ain, ’, t)",115
4,"(!, ”, “)",106
5,"(I, ’, ll)",101
6,"(it, ’, s)",99
7,"(Tom, ’, s)",96
8,"(”, “, I)",84
9,"(“, Well, ,)",84


In [18]:
fdist_df.iloc[10000:10005]

Unnamed: 0,Word,Frequency
10000,"(find, fault, with)",1
10001,"(s, what, you)",1
10002,"(lifting, pretty, warning)",1
10003,"(and, so, absorbed)",1
10004,"(awful, fix, .)",1


## Cleaning-up  N-Grams

#### Eliminating puctuation and case sensitivity from N-Grams

In [19]:
#tokens = nltk.tokenize.word_tokenize(raw)
tokens = nltk.word_tokenize(raw)

#stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

word_list = []

# Filter out words that have punctuation and make everything lower-case
cleaned_words = [w.lower() for w in tokens if w.isalnum()]

bgs = [b for b in nltk.bigrams(cleaned_words)]
tgs = [b for b in nltk.trigrams(cleaned_words)]

In [20]:
fdist_2 = nltk.FreqDist(bgs)
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(of, the)",359
1,"(in, the)",309
2,"(don, t)",216
3,"(and, the)",186
4,"(it, was)",182
5,"(to, the)",169
6,"(it, s)",160
7,"(he, was)",148
8,"(and, then)",128
9,"(ain, t)",118


In [21]:
fdist_3 = nltk.FreqDist(tgs)
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(i, don, t)",69
1,"(there, was, a)",44
2,"(don, t, you)",31
3,"(there, was, no)",25
4,"(don, t, know)",25
5,"(it, ain, t)",24
6,"(out, of, the)",23
7,"(i, can, t)",21
8,"(it, s, a)",21
9,"(i, won, t)",20


#### Eliminating puctuation, case sensitivity and stop-words from N-Grams

In [22]:
#tokens = nltk.tokenize.word_tokenize(raw)
tokens = nltk.word_tokenize(raw)

#stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

word_list = []

# Filter out words that have punctuation and make everything lower-case
cleaned_words = [w.lower() for w in tokens if w.isalnum()]

bgs = [b for b in nltk.bigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords]
tgs = [b for b in nltk.trigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and b[2] not in stopwords]

In [23]:
fdist_2 = nltk.FreqDist(bgs)
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(injun, joe)",68
1,"(aunt, polly)",56
2,"(tom, said)",43
3,"(said, tom)",29
4,"(tom, sawyer)",28
5,"(muff, potter)",24
6,"(joe, harper)",18
7,"(said, huck)",17
8,"(ha, nted)",15
9,"(widow, douglas)",14


In [24]:
fdist_3 = nltk.FreqDist(tgs)
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(ha, nted, house)",9
1,"(said, aunt, polly)",8
2,"(tom, said, oh)",4
3,"(said, injun, joe)",4
4,"(two, weeks, ago)",3
5,"(well, go, long)",3
6,"(two, thousand, verses)",3
7,"(ha, nted, room)",2
8,"(aunt, polly, entered)",2
9,"(saw, injun, joe)",2


#### Creating targeted N-Grams

In [25]:
#tokens = nltk.tokenize.word_tokenize(raw)
tokens = nltk.word_tokenize(raw)

#stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

word_list = []

# Filter out words that have punctuation and make everything lower-case
cleaned_words = [w.lower() for w in tokens if w.isalnum()]

bgs = [b for b in nltk.bigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and \
       (b[0] == 'tom' or b[1] == 'tom')]

tgs = [b for b in nltk.trigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and \
       (b[0] == 'tom' or b[1] == 'tom' or b[2] == 'tom')]

In [26]:
fdist_2 = nltk.FreqDist(bgs)
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(tom, said)",43
1,"(said, tom)",29
2,"(tom, sawyer)",28
3,"(tom, got)",10
4,"(oh, tom)",10
5,"(tom, tom)",9
6,"(tom, went)",8
7,"(tom, turned)",7
8,"(tom, felt)",7
9,"(tom, took)",7


In [27]:
fdist_3 = nltk.FreqDist(tgs)
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(tom, said, he)",6
1,"(said, tom, i)",5
2,"(tom, said, oh)",4
3,"(tom, began, to)",4
4,"(oh, tom, you)",3
5,"(tom, said, it)",3
6,"(tom, huck, i)",3
7,"(tom, sawyer, the)",3
8,"(tom, tried, to)",3
9,"(said, tom, what)",3


### Creating N-Grams of custom length

In [28]:
n = 4
fourgrams = nltk.ngrams(raw.split(), n)

n = 5
fivegrams = nltk.ngrams(raw.split(), n)

n = 6
sixgrams = nltk.ngrams(raw.split(), n)

In [29]:
fdist_4 = nltk.FreqDist(fourgrams)
fdist_4_df = pd.DataFrame(fdist_4.most_common(),
                    columns=['Word', 'Frequency'])

fdist_4_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(the, rest, of, the)",9
1,"(the, middle, of, the)",7
2,"(I, don’t, want, to)",6
3,"(At, the, end, of)",5
4,"(he, did, not, know)",5
5,"(for, the, space, of)",4
6,"(in, the, midst, of)",4
7,"(at, the, head, of)",4
8,"(occurred, to, him, that)",4
9,"(the, edge, of, the)",4


In [30]:
fdist_5 = nltk.FreqDist(fivegrams)
fdist_5_df = pd.DataFrame(fdist_5.most_common(),
                    columns=['Word', 'Frequency'])

fdist_5_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(Black, Avenger, of, the, Spanish)",4
1,"(it, occurred, to, him, that)",4
2,"(within, a, few, feet, of)",3
3,"(And, so, forth, and, so)",3
4,"(a, moment., Then, he, said:)",3
5,"(to, the, roof, of, the)",3
6,"(so, forth, and, so, on.)",3
7,"(the, old, man, and, his)",3
8,"(in, order, that, he, might)",2
9,"(the, prosecution, said:, “Take, the)",2


In [31]:
fdist_6 = nltk.FreqDist(sixgrams)
fdist_6_df = pd.DataFrame(fdist_6.most_common(),
                    columns=['Word', 'Frequency'])

fdist_6_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(And, so, forth, and, so, on.)",3
1,"(put, sackcloth, and, ashes, on, his)",2
2,"(put, his, face, in, his, hands)",2
3,"(it, occurred, to, him, that, he)",2
4,"(I, got, to, ask, to, go)",2
5,"(are, the, poor, in, spirit,, for)",2
6,"(five, or, six, miles, below, the)",2
7,"(in, order, that, he, might, be)",2
8,"(know, how, to, give, him, up!)",2
9,"(doodle-bug,, tell, me, what, I, want)",2


In [33]:
writer = pd.ExcelWriter(directory+'n_grams_out.xlsx')
fdist_4_df.to_excel(writer,'FourGrams')
fdist_5_df.to_excel(writer,'FiveGrams')
fdist_6_df.to_excel(writer,'SixGrams')
writer.save()