# Bag of Words

<img src="image/bagofwords.png" width="400" height="400">

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Toy corpus: four short Turkish sentences, one string per document.
# 'Bugun' is written without the diaeresis so that re-running the notebook
# reproduces the recorded outputs, which all show 'Bugun' / 'bugun'.
docs = ['Merhaba arkadaşlar nasılsınız arkadaşlar',
        'Arkadaşlar Deepcon18 atölye çalışmasına hoşgeldiniz',
        'Bugun sizlerle birlikte arkadaşlar embedding yöntemlerini öğreneceğiz',
        'Fakat önce biraz eski yöntemlere göz atalım']

In [3]:
# Echo the corpus to check its contents before vectorizing.
docs

['Merhaba arkadaşlar nasılsınız arkadaşlar',
 'Arkadaşlar Deepcon18 atölye çalışmasına hoşgeldiniz',
 'Bugun sizlerle birlikte arkadaşlar embedding yöntemlerini öğreneceğiz',
 'Fakat önce biraz eski yöntemlere göz atalım']

In [4]:
# Instantiate with no arguments only to display the default parameter values.
CountVectorizer()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [5]:
# Fit a default CountVectorizer on the corpus and build the document-term
# matrix in the same call; bow_vector keeps the fitted state for later cells.
bow_vector = CountVectorizer()
bow_matrix = bow_vector.fit_transform(docs)

In [6]:
# Show the learned vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() when it exists and fall back on older releases.
bow_names_fn = getattr(bow_vector, "get_feature_names_out", None) or bow_vector.get_feature_names
# list(...) keeps the displayed value a plain Python list on every version.
list(bow_names_fn())

['arkadaşlar',
 'atalım',
 'atölye',
 'biraz',
 'birlikte',
 'bugun',
 'deepcon18',
 'embedding',
 'eski',
 'fakat',
 'göz',
 'hoşgeldiniz',
 'merhaba',
 'nasılsınız',
 'sizlerle',
 'yöntemlere',
 'yöntemlerini',
 'çalışmasına',
 'önce',
 'öğreneceğiz']

In [7]:
# Vocabulary size, i.e. the number of distinct terms the vectorizer learned.
# vocabulary_ (term -> column index) exists on every scikit-learn release,
# unlike get_feature_names(), which was removed in 1.2.
len(bow_vector.vocabulary_)

20

In [8]:
# Convert the count matrix to a dense NumPy array so the raw counts can be inspected.
bow_matrix.toarray()

array([[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1],
       [0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0]],
      dtype=int64)

In [9]:
import pandas as pd

# Label each count column with its vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when it exists and fall back on older releases.
bow_columns_fn = getattr(bow_vector, "get_feature_names_out", None) or bow_vector.get_feature_names
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_columns_fn())


In [10]:
# Show the corpus again so it can be read next to the count table.
docs

['Merhaba arkadaşlar nasılsınız arkadaşlar',
 'Arkadaşlar Deepcon18 atölye çalışmasına hoşgeldiniz',
 'Bugun sizlerle birlikte arkadaşlar embedding yöntemlerini öğreneceğiz',
 'Fakat önce biraz eski yöntemlere göz atalım']

In [11]:
# Display the bag-of-words counts as a table: one row per document, one column per term.
bow_df

Unnamed: 0,arkadaşlar,atalım,atölye,biraz,birlikte,bugun,deepcon18,embedding,eski,fakat,göz,hoşgeldiniz,merhaba,nasılsınız,sizlerle,yöntemlere,yöntemlerini,çalışmasına,önce,öğreneceğiz
0,2,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
1,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0
2,1,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1
3,0,1,0,1,0,0,0,0,1,1,1,0,0,0,0,1,0,0,1,0


# Tf-idf 

<img src="image/tfidf.png" width="400" height="400">

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [13]:
# Instantiate with no arguments only to display the default parameter values.
TfidfVectorizer()

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [14]:
# Fit a default TfidfVectorizer on the same corpus and build the weighted
# document-term matrix in one call; tfidf_vector keeps the fitted state for later cells.
tfidf_vector = TfidfVectorizer()
tfidf_matrix = tfidf_vector.fit_transform(docs)

In [15]:
# Convert the TF-IDF matrix to a dense NumPy array so the weights can be inspected.
tfidf_matrix.toarray()


array([[0.67006073, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.52488981, 0.52488981, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.30403549, 0.        , 0.47633035, 0.        , 0.        ,
        0.        , 0.47633035, 0.        , 0.        , 0.        ,
        0.        , 0.47633035, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.47633035, 0.        , 0.        ],
       [0.25215917, 0.        , 0.        , 0.        , 0.39505606,
        0.39505606, 0.        , 0.39505606, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.39505606,
        0.        , 0.39505606, 0.        , 0.        , 0.39505606],
       [0.        , 0.37796447, 0.        , 0.37796447, 0.        ,
        0.        , 0.        , 0.        , 0.37796447, 0.37796447,
        0.37796447, 0.        , 0.        , 0

In [16]:
# Show the vocabulary terms learned by the TF-IDF vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() when it exists and fall back on older releases.
tfidf_names_fn = getattr(tfidf_vector, "get_feature_names_out", None) or tfidf_vector.get_feature_names
# list(...) keeps the displayed value a plain Python list on every version.
list(tfidf_names_fn())

['arkadaşlar',
 'atalım',
 'atölye',
 'biraz',
 'birlikte',
 'bugun',
 'deepcon18',
 'embedding',
 'eski',
 'fakat',
 'göz',
 'hoşgeldiniz',
 'merhaba',
 'nasılsınız',
 'sizlerle',
 'yöntemlere',
 'yöntemlerini',
 'çalışmasına',
 'önce',
 'öğreneceğiz']

In [17]:
# Display the TF-IDF weights as a table, rounded to 3 decimals for readability.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when it exists and fall back on older releases.
tfidf_columns_fn = getattr(tfidf_vector, "get_feature_names_out", None) or tfidf_vector.get_feature_names
pd.DataFrame(np.round(tfidf_matrix.toarray(), 3), columns=tfidf_columns_fn())

Unnamed: 0,arkadaşlar,atalım,atölye,biraz,birlikte,bugun,deepcon18,embedding,eski,fakat,göz,hoşgeldiniz,merhaba,nasılsınız,sizlerle,yöntemlere,yöntemlerini,çalışmasına,önce,öğreneceğiz
0,0.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.525,0.525,0.0,0.0,0.0,0.0,0.0,0.0
1,0.304,0.0,0.476,0.0,0.0,0.0,0.476,0.0,0.0,0.0,0.0,0.476,0.0,0.0,0.0,0.0,0.0,0.476,0.0,0.0
2,0.252,0.0,0.0,0.0,0.395,0.395,0.0,0.395,0.0,0.0,0.0,0.0,0.0,0.0,0.395,0.0,0.395,0.0,0.0,0.395
3,0.0,0.378,0.0,0.378,0.0,0.0,0.0,0.0,0.378,0.378,0.378,0.0,0.0,0.0,0.0,0.378,0.0,0.0,0.378,0.0


Reference: https://medium.com/deep-learning-turkiye/metin-i%CC%87%C5%9Fleme-1-eski-tarz-y%C3%B6ntemler-bag-of-words-ve-tfxidf-76d5a0cf1b29