In [7]:
# Example text corpus

from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    "This is the first document. jitesh ",
    "This document is the second document.  dewangan",
    "And this is the third one.",
    "Is this the first document?",
]

# Create an instance of the CountVectorizer
vectorizer = CountVectorizer()
# Fit and transform the corpus into a document-term matrix
X = vectorizer.fit_transform(corpus)

# Print the vocabulary and the document-term matrix
print("Vocabulary:", vectorizer.get_feature_names_out())
print("Document-Term Matrix:")
print(X.toarray())

Vocabulary: ['and' 'dewangan' 'document' 'first' 'is' 'jitesh' 'one' 'second' 'the'
 'third' 'this']
Document-Term Matrix:
[[0 0 1 1 1 1 0 0 1 0 1]
 [0 1 2 0 1 0 0 1 1 0 1]
 [1 0 0 0 1 0 1 0 1 1 1]
 [0 0 1 1 1 0 0 0 1 0 1]]


In [14]:
## unigram and bigram 
## ngram_range=(min, max) -> use all n-gram in this range
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(1, 3))
X2 = vectorizer2.fit_transform(corpus)
print("vocab: " , vectorizer2.get_feature_names_out())
print("Document-Term Matrix:")
print(X2.toarray())

vocab:  ['and' 'and this' 'and this is' 'dewangan' 'document' 'document dewangan'
 'document is' 'document is the' 'document jitesh' 'first'
 'first document' 'first document jitesh' 'is' 'is the' 'is the first'
 'is the second' 'is the third' 'is this' 'is this the' 'jitesh' 'one'
 'second' 'second document' 'second document dewangan' 'the' 'the first'
 'the first document' 'the second' 'the second document' 'the third'
 'the third one' 'third' 'third one' 'this' 'this document'
 'this document is' 'this is' 'this is the' 'this the' 'this the first']
Document-Term Matrix:
[[0 0 0 0 1 0 0 0 1 1 1 1 1 1 1 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0
  1 1 0 0]
 [0 0 0 1 2 1 1 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 0 0 1 1 1
  0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 1 1 1 0 0
  1 1 0 0]
 [0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0
  0 0 1 1]]


In [18]:
#using TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    "This is the first document. jitesh ",
    "This document is the second document.  dewangan",
    "And this is the third one.",
    "Is this the first document?",
]

vectorizer3 = TfidfVectorizer()
X3 = vectorizer3.fit_transform(corpus)
print("vocab: " , vectorizer3.get_feature_names_out())
print("Document-Term Matrix:")
print(X3.toarray())

vocab:  ['and' 'dewangan' 'document' 'first' 'is' 'jitesh' 'one' 'second' 'the'
 'third' 'this']
Document-Term Matrix:
[[0.         0.         0.37835697 0.46734613 0.30933162 0.59276931
  0.         0.         0.30933162 0.         0.30933162]
 [0.         0.47422682 0.60538568 0.         0.24747123 0.
  0.         0.47422682 0.24747123 0.         0.24747123]
 [0.51184851 0.         0.         0.         0.26710379 0.
  0.51184851 0.         0.26710379 0.51184851 0.26710379]
 [0.         0.         0.46979139 0.58028582 0.38408524 0.
  0.         0.         0.38408524 0.         0.38408524]]


In [73]:
## using word2vec
from gensim.models import Word2Vec
def corpus_to_token(corpus):
    token = []
    for sent in corpus:
        #print(sent)
        token.append(sent.lower().split())
        #print(token)
    return token


tokenizedtext  = corpus_to_token(corpus)
word2vec_model = Word2Vec(sentences=tokenizedtext, vector_size=100, min_count=1, workers=4)

In [None]:
# STOP OWRLD removal from tkt
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(tokens):
    token = []
    for sent in corpus:
        vocab = sent.split()
        token.append([word for word in vocab if word not in stop_words])
        print(token)
    return token
    #return [word for word in tokens if word not in stop_words]
print(remove_stopwords(corpus))

[['This', 'first', 'document.', 'jitesh']]
[['This', 'first', 'document.', 'jitesh'], ['This', 'document', 'second', 'document.', 'dewangan']]
[['This', 'first', 'document.', 'jitesh'], ['This', 'document', 'second', 'document.', 'dewangan'], ['And', 'third', 'one.']]
[['This', 'first', 'document.', 'jitesh'], ['This', 'document', 'second', 'document.', 'dewangan'], ['And', 'third', 'one.'], ['Is', 'first', 'document?']]
[['This', 'first', 'document.', 'jitesh'], ['This', 'document', 'second', 'document.', 'dewangan'], ['And', 'third', 'one.'], ['Is', 'first', 'document?']]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiteshdewangan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
