In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a unigram vocabulary from a single sentence. fit() hands back the
# fitted vectorizer, so construction and fitting are chained here.
v = CountVectorizer().fit(["Shaun is looking for job"])

# Lower-cased token -> column index (indices follow alphabetical order)
v.vocabulary_

{'shaun': 4, 'is': 1, 'looking': 3, 'for': 0, 'job': 2}

In [26]:
new_doc = "ABC is searching for  job"

# Encode the unseen sentence with the already-fitted vectorizer `v`; tokens
# that were not part of the fitted vocabulary are simply ignored.
vector = v.transform([new_doc])

# The transform result is sparse; densify it so the counts print readably
vector_array = vector.toarray()

print("Vector representation of the new document:", vector_array, sep="\n")

Vector representation of the new document:
[[1 1 1 0 0]]


In [29]:
# Rebind `v` to a vectorizer whose features are bigrams and trigrams only
# (ngram_range=(2, 3) leaves out single words).
v = CountVectorizer(ngram_range=(2, 3)).fit(["Shaun is looking for job"])

# n-gram -> column index
v.vocabulary_

{'shaun is': 5,
 'is looking': 1,
 'looking for': 3,
 'for job': 0,
 'shaun is looking': 6,
 'is looking for': 2,
 'looking for job': 4}

In [30]:
new_doc = "ABC is searching for  job"

# Count only the n-grams that the fitted vectorizer `v` already knows;
# any n-gram containing an unseen word has no column and is dropped.
vector = v.transform([new_doc])

# Dense copy of the sparse counts, for display
vector_array = vector.toarray()

print("Vector representation of the new document:", vector_array, sep="\n")

Vector representation of the new document:
[[1 0 0 0 0 0 0]]


In [5]:
# Unigrams, bigrams and trigrams together: ngram_range=(1, 3).
# Punctuation such as the "-" does not become a token (see the vocabulary).
v = CountVectorizer(ngram_range=(1, 3)).fit(
    ["Absolutely wonderful - silky and easy and comfortable"]
)

# n-gram -> column index
v.vocabulary_

{'absolutely': 0,
 'wonderful': 14,
 'silky': 11,
 'and': 3,
 'easy': 8,
 'comfortable': 7,
 'absolutely wonderful': 1,
 'wonderful silky': 15,
 'silky and': 12,
 'and easy': 5,
 'easy and': 9,
 'and comfortable': 4,
 'absolutely wonderful silky': 2,
 'wonderful silky and': 16,
 'silky and easy': 13,
 'and easy and': 6,
 'easy and comfortable': 10}

In [6]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy object array,
# so .tolist() is used to keep displaying a plain Python list of n-grams
# ordered by column index.
v.get_feature_names_out().tolist()



['absolutely',
 'absolutely wonderful',
 'absolutely wonderful silky',
 'and',
 'and comfortable',
 'and easy',
 'and easy and',
 'comfortable',
 'easy',
 'easy and',
 'easy and comfortable',
 'silky',
 'silky and',
 'silky and easy',
 'wonderful',
 'wonderful silky',
 'wonderful silky and']

In [7]:
# Ask the vectorizer which stop-word list it applies. `v` was created without
# a `stop_words` argument, and this cell displays nothing.
v.get_stop_words()

In [8]:
## We will now take a simple collection of text documents, preprocess them to remove stop words and lemmatize, and then generate a bag of 1-grams and 2-grams from them

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short sentences that share some words ("is", "for", "job")
texts = [
    "Shaun is looking for job",
    "John is applying for a new position",
    "Emily is searching for job opportunities"
]

vectorizer = CountVectorizer()

# Learn the vocabulary and build the document-term count matrix in one step
X = vectorizer.fit_transform(texts)

print("Vocabulary:")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. .tolist() turns the returned NumPy
# array into a plain list so the printed vocabulary looks the same as before.
print(vectorizer.get_feature_names_out().tolist())

# Display BOW representation of the text data: one row per sentence, one
# column per vocabulary term (same order as the vocabulary printed above)
print("\nBag-of-Words representation:")
print(X.toarray())


Vocabulary:
['applying', 'emily', 'for', 'is', 'job', 'john', 'looking', 'new', 'opportunities', 'position', 'searching', 'shaun']

Bag-of-Words representation:
[[0 0 1 1 1 0 1 0 0 0 0 1]
 [1 0 1 1 0 1 0 1 0 1 0 0]
 [0 1 1 1 1 0 0 0 1 0 1 0]]


In [9]:
# Raw sentences for the preprocessing + n-gram walkthrough
corpus = ["Thor ate pizza", "Loki is tall", "Loki is eating pizza"]

In [10]:
import spacy

# Load spaCy's small English pipeline ("en_core_web_sm") once and keep it in
# the module-level name `nlp`; preprocess() below reads this global.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Strip stop words and punctuation from `text` and lemmatize the rest.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded; the lemmas of the
    remaining tokens are joined with single spaces and returned as a string.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [11]:
# Sanity check: nothing is removed here, and "ate" is lemmatized to "eat"
preprocess("Thor ate pizza")

'Thor eat pizza'

In [12]:
# Sanity check: the stop word "is" is dropped and "eating" becomes "eat"
preprocess("Loki is eating pizza")

'Loki eat pizza'

In [13]:
# Run every sentence of `corpus` through preprocess(), keeping the order
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['Thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [14]:
# Fit unigram + bigram features (ngram_range=(1, 2)) on the cleaned corpus;
# fit() returns the fitted vectorizer, so the calls are chained.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)

# n-gram -> column index
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [15]:
## Now generate the bag-of-n-grams vector for a few sample documents

In [16]:
# Encode an already-preprocessed sentence with the fitted 1-2-gram vectorizer.
# The 1s fall in the columns for "eat", "eat pizza", "pizza", "thor", "thor eat".
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Four-sentence toy corpus for the bigram example.
# NOTE: this rebinds `corpus`, replacing the list assigned earlier in the notebook.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [18]:
# Bigram-only vectorizer: ngram_range=(2, 2) sets both the minimum and the
# maximum n-gram length to 2
vectorizer = CountVectorizer(ngram_range=(2, 2))

In [19]:
# Learn the bigram vocabulary from `corpus` and build its count matrix
X = vectorizer.fit_transform(corpus)

In [20]:
# Bigrams ordered by column index, returned as a NumPy object array
vectorizer.get_feature_names_out()

array(['and this', 'document is', 'first document', 'is the', 'is this',
       'second document', 'the first', 'the second', 'the third',
       'third one', 'this document', 'this is', 'this the'], dtype=object)

In [21]:
# The same bigrams as a dict mapping each one to its column index
vectorizer.vocabulary_

{'this is': 11,
 'is the': 3,
 'the first': 6,
 'first document': 2,
 'this document': 10,
 'document is': 1,
 'the second': 7,
 'second document': 5,
 'and this': 0,
 'the third': 8,
 'third one': 9,
 'is this': 4,
 'this the': 12}

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same three sentences as in the bag-of-words example, so the two
# representations can be compared column by column
texts = [
    "Shaun is looking for job",
    "John is applying for a new position",
    "Emily is searching for job opportunities"
]

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

# Learn the vocabulary and compute the TF-IDF matrix in one step
X = vectorizer.fit_transform(texts)

# Display vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. .tolist() turns the returned NumPy
# array into a plain list so the printed vocabulary looks the same as before.
print("Vocabulary:")
print(vectorizer.get_feature_names_out().tolist())

# Display TF-IDF representation of the text data: one row per sentence, one
# column per vocabulary term
print("\nTF-IDF representation:")
print(X.toarray())


Vocabulary:
['applying', 'emily', 'for', 'is', 'job', 'john', 'looking', 'new', 'opportunities', 'position', 'searching', 'shaun']

TF-IDF representation:
[[0.         0.         0.32630952 0.32630952 0.42018292 0.
  0.55249005 0.         0.         0.         0.         0.55249005]
 [0.46138073 0.         0.27249889 0.27249889 0.         0.46138073
  0.         0.46138073 0.         0.46138073 0.         0.        ]
 [0.         0.48359121 0.28561676 0.28561676 0.36778358 0.
  0.         0.         0.48359121 0.         0.48359121 0.        ]]


In [24]:
from sklearn.preprocessing import OneHotEncoder

# One single-feature sample per row
categories = [[color] for color in ("red", "blue", "green", "red", "blue")]

encoder = OneHotEncoder()

# Learn the distinct categories and encode every row in a single call
X = encoder.fit_transform(categories)

# Each row has exactly one 1; per the printed result the columns correspond
# to blue, green, red in that order
print("One-hot encoded representation:", X.toarray(), sep="\n")


One-hot encoded representation:
[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]
