**Importing Libraries**

In [272]:
import nltk
import numpy as np
import pandas as pd
import re
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

**Connecting WorkBook with Drive**

In [273]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [274]:
path='/content/drive/My Drive/Data.txt'

In [275]:
file = open(path,'r')

In [276]:
paragraphList = file.readlines()
print(paragraphList)

["Tommy, the curious puppy, dug a hole in the backyard to find buried treasures. To his surprise, he uncovered a collection of old toys buried by the previous owner's children. Delighted, Tommy claimed the toys as his own and proudly paraded around the yard with a squeaky duck in his mouth. The backyard treasure hunt turned into a playful adventure, and Tommy's wagging tail showed that sometimes, the best surprises are right under your nose—or, in this case, under your paws."]


**Transforming text from a string into a collection of sentences.**

In [277]:
sentences = re.findall(r"[^.!?]+", paragraphList[0])
sentenceList = []
for sentence in sentences:
  sentenceList.append(sentence.lower())
print(sentenceList)

['tommy, the curious puppy, dug a hole in the backyard to find buried treasures', " to his surprise, he uncovered a collection of old toys buried by the previous owner's children", ' delighted, tommy claimed the toys as his own and proudly paraded around the yard with a squeaky duck in his mouth', " the backyard treasure hunt turned into a playful adventure, and tommy's wagging tail showed that sometimes, the best surprises are right under your nose—or, in this case, under your paws"]


**Carrying out tokenization for both words and sentences.**

In [278]:
#Tokenization

# Word Token
from nltk.tokenize import word_tokenize
tokenized_docs = []
for doc in sentenceList:
  # print(word_tokenize(doc))
  tokenized_docs.append(word_tokenize(doc))
# print(tokenized_docs)

In [279]:
# Sentence Token
from nltk.tokenize import sent_tokenize
sent_token = []
for doc in sentenceList:
  sent_token.append(sent_tokenize(doc))
print(sent_token)

[['tommy, the curious puppy, dug a hole in the backyard to find buried treasures'], [" to his surprise, he uncovered a collection of old toys buried by the previous owner's children"], [' delighted, tommy claimed the toys as his own and proudly paraded around the yard with a squeaky duck in his mouth'], [" the backyard treasure hunt turned into a playful adventure, and tommy's wagging tail showed that sometimes, the best surprises are right under your nose—or, in this case, under your paws"]]


Punctuation Removal

In [280]:
# Removing punctuation
regex = re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)
print(tokenized_docs_no_punctuation)

[['tommy', 'the', 'curious', 'puppy', 'dug', 'a', 'hole', 'in', 'the', 'backyard', 'to', 'find', 'buried', 'treasures'], ['to', 'his', 'surprise', 'he', 'uncovered', 'a', 'collection', 'of', 'old', 'toys', 'buried', 'by', 'the', 'previous', 'owner', 's', 'children'], ['delighted', 'tommy', 'claimed', 'the', 'toys', 'as', 'his', 'own', 'and', 'proudly', 'paraded', 'around', 'the', 'yard', 'with', 'a', 'squeaky', 'duck', 'in', 'his', 'mouth'], ['the', 'backyard', 'treasure', 'hunt', 'turned', 'into', 'a', 'playful', 'adventure', 'and', 'tommy', 's', 'wagging', 'tail', 'showed', 'that', 'sometimes', 'the', 'best', 'surprises', 'are', 'right', 'under', 'your', 'nose—or', 'in', 'this', 'case', 'under', 'your', 'paws']]


Removing stop words

In [281]:
# Cleaning text of stopwords
from nltk.corpus import stopwords
tokenized_docs_no_stopwords = []
for doc in tokenized_docs_no_punctuation:
    new_term_vector = []
    for word in doc:
        if not word in stopwords.words('english'):
            new_term_vector.append(word)
    tokenized_docs_no_stopwords.append(new_term_vector)
print(tokenized_docs_no_stopwords)

[['tommy', 'curious', 'puppy', 'dug', 'hole', 'backyard', 'find', 'buried', 'treasures'], ['surprise', 'uncovered', 'collection', 'old', 'toys', 'buried', 'previous', 'owner', 'children'], ['delighted', 'tommy', 'claimed', 'toys', 'proudly', 'paraded', 'around', 'yard', 'squeaky', 'duck', 'mouth'], ['backyard', 'treasure', 'hunt', 'turned', 'playful', 'adventure', 'tommy', 'wagging', 'tail', 'showed', 'sometimes', 'best', 'surprises', 'right', 'nose—or', 'case', 'paws']]


Reducing words to their simplest form.

In [282]:
# Stemming and Lemmatization

# from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# porter = PorterStemmer()
wordnet = WordNetLemmatizer()
preprocessed_docs = []

for doc in tokenized_docs_no_stopwords:
    final_doc = []
    for word in doc:
        #final_doc.append(porter.stem(word))
        final_doc.append(wordnet.lemmatize(word))
    preprocessed_docs.append(final_doc)
print(preprocessed_docs)

[['tommy', 'curious', 'puppy', 'dug', 'hole', 'backyard', 'find', 'buried', 'treasure'], ['surprise', 'uncovered', 'collection', 'old', 'toy', 'buried', 'previous', 'owner', 'child'], ['delighted', 'tommy', 'claimed', 'toy', 'proudly', 'paraded', 'around', 'yard', 'squeaky', 'duck', 'mouth'], ['backyard', 'treasure', 'hunt', 'turned', 'playful', 'adventure', 'tommy', 'wagging', 'tail', 'showed', 'sometimes', 'best', 'surprise', 'right', 'nose—or', 'case', 'paw']]


Gathering the essential data for creating the data frame.

In [283]:
words_Set = set()
for sentence in preprocessed_docs:
  for word in sentence:
    words_Set.add(word)
words_Set = list(words_Set)
print(words_Set)

['tail', 'playful', 'hole', 'yard', 'squeaky', 'hunt', 'wagging', 'tommy', 'owner', 'showed', 'uncovered', 'find', 'adventure', 'child', 'sometimes', 'around', 'curious', 'turned', 'old', 'best', 'paw', 'duck', 'surprise', 'mouth', 'previous', 'claimed', 'right', 'treasure', 'dug', 'nose—or', 'puppy', 'buried', 'paraded', 'case', 'backyard', 'collection', 'proudly', 'toy', 'delighted']


In [284]:
rows = []
count=0
for val in sentences:
  rows.append(count)
  # print(count)
  count=count+1

In [285]:
cols = []
count=0
for val in words_Set:
  cols.append(count)
  count=count+1

Building the Data Frame.

In [286]:
df = pd.DataFrame(index=rows,columns=list(words_Set))
type(df)

pandas.core.frame.DataFrame

Initializing Data Frame

In [287]:
for col in df.columns:
    df[col].values[:] = 0


In [288]:
df

Unnamed: 0,tail,playful,hole,yard,squeaky,hunt,wagging,tommy,owner,showed,...,nose—or,puppy,buried,paraded,case,backyard,collection,proudly,toy,delighted
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Performing the one-hot encoding process.


In [289]:
for R in range(df.shape[0]):
  for C in range(df.shape[1]):
    for word in preprocessed_docs[R]:
      # print(R)
      if(words_Set[C]==word):
        df[words_Set[C]][R] = df[words_Set[C]][R] + 1
    # print(R,C)

df

Unnamed: 0,tail,playful,hole,yard,squeaky,hunt,wagging,tommy,owner,showed,...,nose—or,puppy,buried,paraded,case,backyard,collection,proudly,toy,delighted
0,0,0,1,0,0,0,0,1,0,0,...,0,1,1,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0
2,0,0,0,1,1,0,0,1,0,0,...,0,0,0,1,0,0,0,1,1,1
3,1,1,0,0,0,1,1,1,0,1,...,1,0,0,0,1,1,0,0,0,0


Carrying out data flattening.

In [290]:
finalList = []
for R in range(df.shape[0]):
  for C in range(df.shape[1]):
    # print(df[words_Set[C]][R])
    finalList.append(df[words_Set[C]][R])
print(finalList)

[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0]
