Word embedding, or word vectorization, is a methodology in NLP that maps words or phrases from a vocabulary to
corresponding vectors of real numbers, which are then used for word prediction and for measuring word
similarity/semantics. The process of converting words into numbers is called vectorization.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Load the example sentences; the preview below shows a `test` text column and a `class` label column.
df = pd.read_csv('./DataSet/data.csv')

In [3]:
df.head()

Unnamed: 0,test,class
0,I love Bangladesh,1
1,Could you give me an iphone?,0
2,Hello how are you?,1
3,I want to talk you.,1


# Count Vectorizer

`CountVectorizer` is a great tool provided by the scikit-learn library in Python. It transforms each text into a
vector based on the frequency (count) of every vocabulary word that occurs in that text.

In [4]:
# Default settings. The vocabulary listed further down is lowercased and contains
# neither punctuation nor the one-letter token "I".
cv = CountVectorizer()

In [5]:
# Learn the vocabulary from the `test` column and encode every sentence as word counts.
# The result is a sparse matrix with one row per sentence and one column per vocabulary word.
x = cv.fit_transform(df['test'])

In [6]:
x

<4x14 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [7]:
x.toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]], dtype=int64)

In [8]:
df.head()

Unnamed: 0,test,class
0,I love Bangladesh,1
1,Could you give me an iphone?,0
2,Hello how are you?,1
3,I want to talk you.,1


In [9]:
# Independent copy of the frame; it is the input to the TF-IDF section below.
df3 = df.copy()

In [10]:
# Label the count matrix: one row per sentence, one column per vocabulary word.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
df2 = pd.DataFrame(x.toarray(), index=df['test'], columns=cv.get_feature_names_out())

In [11]:
df2.head()

Unnamed: 0_level_0,an,are,bangladesh,could,give,hello,how,iphone,love,me,talk,to,want,you
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
I love Bangladesh,0,0,1,0,0,0,0,0,1,0,0,0,0,0
Could you give me an iphone?,1,0,0,1,1,0,0,1,0,1,0,0,0,1
Hello how are you?,0,1,0,0,0,1,1,0,0,0,0,0,0,1
I want to talk you.,0,0,0,0,0,0,0,0,0,0,1,1,1,1


In [12]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() returns
# an ndarray, so convert it to keep `columns` a plain Python list of words.
columns = cv.get_feature_names_out().tolist()

In [13]:
columns

['an',
 'are',
 'bangladesh',
 'could',
 'give',
 'hello',
 'how',
 'iphone',
 'love',
 'me',
 'talk',
 'to',
 'want',
 'you']

# TF-IDF

In [14]:
# TF-IDF vectorizer with default settings (despite the short name, this is the full
# TfidfVectorizer, not just the IDF part).
idf = TfidfVectorizer()

In [15]:
# NOTE: this rebinds `x`, replacing the count matrix produced in the CountVectorizer section.
x = idf.fit_transform(df3['test'])

In [16]:
x.toarray()

array([[0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.43003652, 0.        , 0.        , 0.43003652, 0.43003652,
        0.        , 0.        , 0.43003652, 0.        , 0.43003652,
        0.        , 0.        , 0.        , 0.27448674],
       [0.        , 0.5417361 , 0.        , 0.        , 0.        ,
        0.5417361 , 0.5417361 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.34578314],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5417361 , 0.5417361 , 0.5417361 , 0.34578314]])

In [17]:
# Label the TF-IDF matrix: one row per sentence, one column per vocabulary word.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
df4 = pd.DataFrame(x.toarray(), index=df['test'], columns=idf.get_feature_names_out())
df4

Unnamed: 0_level_0,an,are,bangladesh,could,give,hello,how,iphone,love,me,talk,to,want,you
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
I love Bangladesh,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0
Could you give me an iphone?,0.430037,0.0,0.0,0.430037,0.430037,0.0,0.0,0.430037,0.0,0.430037,0.0,0.0,0.0,0.274487
Hello how are you?,0.0,0.541736,0.0,0.0,0.0,0.541736,0.541736,0.0,0.0,0.0,0.0,0.0,0.0,0.345783
I want to talk you.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.541736,0.541736,0.541736,0.345783


# Word2Vec

In [18]:
!pip install gensim



In [19]:
from gensim.models import Word2Vec, KeyedVectors

In [20]:
import nltk

In [21]:
# Download the 'punkt' data package (presumably the tokenizer models used by
# word_tokenize further down -- confirm against the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
# NOTE(review): no cell visible in this section uses WordNet; keep this download only
# if a later part of the notebook needs it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
# Re-read the same CSV that was loaded at the start of the notebook.
df = pd.read_csv('./DataSet/data.csv')

In [25]:
df.head()

Unnamed: 0,test,class
0,I love Bangladesh,1
1,Could you give me an iphone?,0
2,Hello how are you?,1
3,I want to talk you.,1


In [26]:
from nltk.tokenize import word_tokenize

In [27]:
# Tokenize every sentence into a list of word/punctuation tokens (one list per row).
text_vsc = list(map(word_tokenize, df['test']))

In [28]:
text_vsc

[['I', 'love', 'Bangladesh'],
 ['Could', 'you', 'give', 'me', 'an', 'iphone', '?'],
 ['Hello', 'how', 'are', 'you', '?'],
 ['I', 'want', 'to', 'talk', 'you', '.']]

In [29]:
# Train on the tokenized sentences; min_count=1 keeps every token, even ones seen once.
# seed=1 is gensim's default, written out for clarity. workers=1 removes
# thread-scheduling jitter so reruns give the same vectors (gensim also notes that
# PYTHONHASHSEED must be fixed for reproducibility across interpreter launches).
model = Word2Vec(text_vsc, min_count=1, seed=1, workers=1)

In [30]:
model

<gensim.models.word2vec.Word2Vec at 0x1a1cd792520>

In [31]:
# The corpus is only four short sentences, so the scores below are low (at most ~0.17)
# and should not be read as real semantic similarity. Punctuation such as '?' shows up
# because the tokens were not filtered before training.
model.wv.most_similar('Hello')

[('?', 0.17272651195526123),
 ('Bangladesh', 0.16695065796375275),
 ('give', 0.11118057370185852),
 ('talk', 0.10947787016630173),
 ('you', 0.07967710494995117),
 ('an', 0.04130828380584717),
 ('me', 0.03771404176950455),
 ('to', 0.01324356161057949),
 ('I', 0.008316037245094776),
 ('love', -0.005900918506085873)]