### Aim: To implement Bag of Words model for document analysis

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from nltk.corpus import stopwords
stopwords_en = set(stopwords.words("english"))

In [3]:
# Cleaning

ip = """A bag of words is a representation of text that describes the occurrence of words within a document. 
We just keep track of word counts and disregard the grammatical details and the word order. 
It is called a “bag” of words because any information about the order or structure of words in the document is discarded. 
The model is only concerned with whether known words occur in the document, not where in the document
"""
sents = nltk.sent_tokenize(ip)

sents_rm_stopwords = []
for sent in sents:
    sents_rm_stopwords.append(' '.join(w for w in nltk.word_tokenize(sent) if w.lower() not in stopwords_en))

sents_rm_stopwords

['bag words representation text describes occurrence words within document .',
 'keep track word counts disregard grammatical details word order .',
 'called “ bag ” words information order structure words document discarded .',
 'model concerned whether known words occur document , document']

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')

In [5]:
X = vectorizer.fit_transform(sents_rm_stopwords)

In [6]:
# print(vectorizer.get_feature_names())
print(X.toarray()) 

[[1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 2]
 [0 0 0 1 0 1 0 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 2 0]
 [1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 2]
 [0 0 1 0 0 0 0 0 2 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 1]]


### Aim: To implement TFIDF model for document analysis

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvectorizer = TfidfVectorizer()

In [24]:
analyze = tfidfvectorizer.build_analyzer()

In [25]:
Y = tfidfvectorizer.fit_transform(sents_rm_stopwords)

In [33]:
arr = Y.toarray() 
print(arr)

[[0.28488986 0.         0.         0.         0.36134666 0.
  0.         0.         0.23064289 0.         0.         0.
  0.         0.         0.         0.36134666 0.         0.36134666
  0.         0.36134666 0.         0.         0.36134666 0.
  0.46128579]
 [0.         0.         0.         0.3068352  0.         0.3068352
  0.         0.3068352  0.         0.3068352  0.         0.3068352
  0.         0.         0.         0.         0.2419124  0.
  0.         0.         0.3068352  0.         0.         0.6136704
  0.        ]
 [0.29219998 0.37061862 0.         0.         0.         0.
  0.37061862 0.         0.23656107 0.         0.37061862 0.
  0.         0.         0.         0.         0.29219998 0.
  0.37061862 0.         0.         0.         0.         0.
  0.47312213]
 [0.         0.         0.37696812 0.         0.         0.
  0.         0.         0.48122774 0.         0.         0.
  0.37696812 0.37696812 0.37696812 0.         0.         0.
  0.         0.         0.   

In [36]:
temp = []
main = []
for row in arr:
  for ele in row:
    temp.append(round(ele,2))
  main.append(temp)
  temp = []


In [37]:
main

[[0.28,
  0.0,
  0.0,
  0.0,
  0.36,
  0.0,
  0.0,
  0.0,
  0.23,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.36,
  0.0,
  0.36,
  0.0,
  0.36,
  0.0,
  0.0,
  0.36,
  0.0,
  0.46],
 [0.0,
  0.0,
  0.0,
  0.31,
  0.0,
  0.31,
  0.0,
  0.31,
  0.0,
  0.31,
  0.0,
  0.31,
  0.0,
  0.0,
  0.0,
  0.0,
  0.24,
  0.0,
  0.0,
  0.0,
  0.31,
  0.0,
  0.0,
  0.61,
  0.0],
 [0.29,
  0.37,
  0.0,
  0.0,
  0.0,
  0.0,
  0.37,
  0.0,
  0.24,
  0.0,
  0.37,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.29,
  0.0,
  0.37,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.47],
 [0.0,
  0.0,
  0.38,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.48,
  0.0,
  0.0,
  0.0,
  0.38,
  0.38,
  0.38,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.38,
  0.0,
  0.0,
  0.24]]

####################################################################################################################

## Trials and dummy test codes


In [11]:
from nltk.tokenize import sent_tokenize

In [12]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
text = """A bag of words is a representation of text that describes the occurrence of words within a document. 
We just keep track of word counts and disregard the grammatical details and the word order. 
It is called a “bag” of words because any information about the order or structure of words in the document is discarded. 
The model is only concerned with whether known words occur in the document, not where in the document
"""
nlp_Sen = sent_tokenize(text)
print(nlp_Sen)

['A bag of words is a representation of text that describes the occurrence of words within a document.', 'We just keep track of word counts and disregard the grammatical details and the word order.', 'It is called a “bag” of words because any information about the order or structure of words in the document is discarded.', 'The model is only concerned with whether known words occur in the document, not where in the document']


In [14]:
from nltk.corpus import stopwords
stopwords_en = set(stopwords.words("english"))

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')

In [16]:
X = vectorizer.fit_transform(nlp_Sen)

In [17]:
# print(vectorizer.get_feature_names())
print(X.toarray()) 

[[1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 2]
 [0 0 0 1 0 1 0 1 0 1 0 1 0 0 0 0 1 0 0 0 1 2 0]
 [1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 2]
 [0 0 1 0 0 0 0 0 2 0 0 0 1 1 1 0 0 0 0 0 0 0 1]]


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer2 = TfidfVectorizer()

In [19]:
analyze = vectorizer2.build_analyzer()

In [20]:
Y = vectorizer2.fit_transform(nlp_Sen)

In [21]:
print(Y.toarray()) 

[[0.         0.         0.         0.2186234  0.         0.
  0.         0.         0.27729607 0.         0.         0.
  0.17699449 0.         0.         0.         0.17699449 0.
  0.         0.         0.         0.         0.         0.
  0.27729607 0.53098346 0.         0.         0.         0.27729607
  0.         0.27729607 0.27729607 0.14470459 0.         0.
  0.         0.         0.         0.27729607 0.         0.35398898]
 [0.         0.46986332 0.         0.         0.         0.
  0.         0.23493166 0.         0.23493166 0.         0.23493166
  0.         0.23493166 0.         0.         0.         0.
  0.23493166 0.23493166 0.         0.         0.         0.
  0.         0.14995384 0.         0.         0.18522282 0.
  0.         0.         0.         0.24519417 0.23493166 0.23493166
  0.         0.         0.         0.         0.46986332 0.        ]
 [0.24076901 0.         0.24076901 0.18982505 0.24076901 0.24076901
  0.         0.         0.         0.         0.24

In [22]:
line = """This is a sample sentence, showing off the stop words filtration.
"""
words = line.split()
print("words:", words)
ans = []
for r in words:
    if not r in stop_words:
        ans.append(r)



words: ['This', 'is', 'a', 'sample', 'sentence,', 'showing', 'off', 'the', 'stop', 'words', 'filtration.']


NameError: ignored

In [None]:
import nltk
from nltk.corpus import stopwords
stopwords_en = set(stopwords.words('english'))

ip = """A bag of words is a representation of text that describes the occurrence of words within a document. 
We just keep track of word counts and disregard the grammatical details and the word order. 
It is called a “bag” of words because any information about the order or structure of words in the document is discarded. 
The model is only concerned with whether known words occur in the document, not where in the document
"""
sents = nltk.sent_tokenize(ip)

sents_rm_stopwords = []
for sent in sents:
    sents_rm_stopwords.append(' '.join(w for w in nltk.word_tokenize(sent) if w.lower() not in stopwords_en))

print(sents_rm_stopwords)

In [None]:
# import string
# stopwords_punctuation = stopwords_en.union(string.punctuation)
# sent_rm_punctuation = []

# for sent in sents_rm_stopwords:
#     sent_rm_punctuation.append(' '.join(w for w in nltk.word_tokenize(sent) if w.lower() not in stopwords_en))

# print(sent_rm_punctuation)
