# Vectorization with SciKit Learn

```yaml
Course:   DS 5001
Module:   92 Helper Notebooks
Topic:    Using SciKit Learn to Vectorize your Corpus
```

# Set Up

In [1]:
import configparser
# Read the directory settings from env.ini (three directories up).
config = configparser.ConfigParser()
config.read("../../../env.ini")
# Fixed: this variable was previously misspelled `data_hone`.
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [2]:
# Prefix of the input file names read below
# ({data_prefix}-LIB.csv and {data_prefix}-CORPUS.csv).
data_prefix = 'austen-melville'
# Column names used, in this order, as the index levels of CORPUS.
OHCO = ['book_id','chap_id','para_num','sent_num','token_num']
# NOTE(review): not referenced in the visible cells; presumably a colormap
# name intended for plots — confirm or remove.
colors = 'YlGnBu'

In [3]:
# Passed to both vectorizers as `ngram_range`: extract unigrams and bigrams.
ngram_range = (1,2)
# Passed to both vectorizers as `max_features`: upper bound on vocabulary size.
n_terms = 4000

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

# Import CORPUS

In [6]:
# LIB table: rename the `book` column to `book_id` and index by it (OHCO[:1]).
LIB = (
    pd.read_csv(f"{output_dir}/{data_prefix}-LIB.csv")
      .rename(columns={'book':'book_id'})
      .set_index(OHCO[:1])
)

In [7]:
# CORPUS table, indexed by all of the OHCO columns.
CORPUS = (
    pd.read_csv(f"{output_dir}/{data_prefix}-CORPUS.csv")
      .set_index(OHCO)
)

In [8]:
# CORPUS.head()

# Create DOC

In [9]:
def gather_docs(CORPUS, ohco_level, term_col='term_str'):
    """Collapse a token table into one space-joined string per document.

    Parameters
    ----------
    CORPUS : pandas.DataFrame
        Token table with a named MultiIndex and a column `term_col`
        holding one term per row.
    ohco_level : int
        Number of leading index levels that identify a document; rows
        sharing those levels are concatenated into one string.
    term_col : str, default 'term_str'
        Name of the column containing the terms.

    Returns
    -------
    pandas.DataFrame
        Indexed by the first `ohco_level` index levels of `CORPUS`, with a
        single column `doc_str` holding the terms joined by single spaces.

    Notes
    -----
    The input frame is left untouched. The string cast is applied to a
    separate Series rather than written back into `CORPUS[term_col]`, so
    calling this function no longer changes the caller's data.
    """
    level_names = list(CORPUS.index.names[:ohco_level])
    # Cast on a copy of the column: non-string values (e.g. NaN, numbers)
    # must become text before joining, but the caller's frame stays as-is.
    terms = CORPUS[term_col].astype('str')
    DOC = terms.groupby(level=level_names).agg(' '.join).to_frame('doc_str')
    return DOC

In [10]:
# A document is identified by the first two index levels of CORPUS
# (book_id, chap_id).
DOC = gather_docs(CORPUS, ohco_level=2)

In [11]:
# Number of whitespace-separated tokens in each document string.
DOC['n_tokens'] = DOC['doc_str'].str.split().str.len()

In [12]:
DOC

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_str,n_tokens
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1
105,1,sir walter elliot of kellynch hall in somerset...,2625
105,2,mr shepherd a civil cautious lawyer who whatev...,1974
105,3,i must take leave to observe sir walter said m...,2838
105,4,he was not mr wentworth the former curate of m...,1805
105,5,on the morning appointed for admiral and mrs c...,3322
...,...,...,...
34970,110,in the midst of all these mental confusions th...,658
34970,111,gaining the apostles and leaving his two compa...,1084
34970,112,pierre passed on to a remote quarter of the bu...,659
34970,113,that sundown pierre stood solitary in a low du...,380


# Method 1: CountVectorizer + TfidfTransformer

## Create DTM

In [13]:
# Count vectorizer configured from the notebook constants: n-grams in
# `ngram_range`, scikit-learn's built-in English stop-word list, and a
# vocabulary capped at `n_terms` features.
count_engine = CountVectorizer(
    ngram_range=ngram_range,
    max_features=n_terms,
    stop_words='english',
)

In [14]:
# Learn the vocabulary from the document strings and build the count matrix.
X = count_engine.fit_transform(DOC['doc_str'])

In [15]:
X.toarray()

array([[0, 0, 1, ..., 1, 3, 1],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 2, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
# count_engine.get_feature_names_out()

In [17]:
# Document-term matrix as a labelled frame: rows keyed like DOC,
# columns named after the vectorizer's features.
DTM = pd.DataFrame(
    data=X.toarray(),
    index=DOC.index,
    columns=count_engine.get_feature_names_out(),
)

In [18]:
DTM

Unnamed: 0_level_0,Unnamed: 1_level_0,abandoned,abbey,able,aboard,abode,abrazza,abroad,absence,absent,absolute,...,young ladies,young lady,young man,young men,young people,young woman,younger,youngest,youth,youthful
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
105,1,0,0,1,0,0,0,1,0,0,0,...,0,0,2,0,0,0,1,1,3,1
105,2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
105,3,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,2,0
105,4,0,0,0,0,0,0,0,0,0,0,...,0,0,3,0,0,0,1,0,3,0
105,5,0,0,2,0,0,0,1,2,0,0,...,2,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34970,110,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34970,111,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
34970,112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34970,113,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Get VOCAB

In [19]:
# One row per term; `n` is the term's total count over all documents.
VOCAB = DTM.sum(axis=0).rename('n').to_frame()

In [20]:
VOCAB.sort_index()

Unnamed: 0,n
abandoned,72
abbey,84
able,414
aboard,125
abode,57
...,...
young woman,70
younger,86
youngest,43
youth,229


In [21]:
# The terms are the index of VOCAB, so derive their lengths from the index
# with vectorized string operations instead of a row-wise apply.
# n_chars: characters in the term (spaces included).
VOCAB['n_chars'] = VOCAB.index.str.len()
# n_tokens: whitespace-separated words in the term (1 = unigram, 2 = bigram).
VOCAB['n_tokens'] = VOCAB.index.str.split().str.len()

In [22]:
VOCAB.value_counts('n_tokens')

n_tokens
1    3817
2     183
Name: count, dtype: int64

In [23]:
VOCAB[VOCAB.n_tokens == 2]

Unnamed: 0,n,n_chars,n_tokens
art thou,42,8,2
aye aye,42,7,2
board ship,46,10,2
braid beard,68,11,2
bread fruit,83,11,2
...,...,...,...
young lady,109,10,2
young man,301,9,2
young men,94,9,2
young people,68,12,2


## Create TFIDF

In [24]:
# TF-IDF weighting with IDF enabled and each document row L2-normalized.
tfidf_engine = TfidfTransformer(
    use_idf=True,
    norm='l2',
)

In [25]:
# Fit the IDF weights on the counts, then apply them to the same counts.
X1 = tfidf_engine.fit(DTM).transform(DTM)

In [26]:
# Same row and column labels as DTM, with TF-IDF weights as values.
TFIDF = pd.DataFrame(
    data=X1.toarray(),
    index=DTM.index,
    columns=DTM.columns,
)

In [27]:
TFIDF

Unnamed: 0_level_0,Unnamed: 1_level_0,abandoned,abbey,able,aboard,abode,abrazza,abroad,absence,absent,absolute,...,young ladies,young lady,young man,young men,young people,young woman,younger,youngest,youth,youthful
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
105,1,0.000000,0.0,0.011987,0.0,0.000000,0.0,0.016259,0.000000,0.0,0.0,...,0.000000,0.000000,0.028132,0.0,0.000000,0.000000,0.018303,0.021815,0.043166,0.020031
105,2,0.000000,0.0,0.013681,0.0,0.022532,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.022121,0.000000,0.000000,0.000000,0.000000
105,3,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.028749,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.025442,0.000000
105,4,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.070330,0.0,0.000000,0.000000,0.030504,0.000000,0.071942,0.000000
105,5,0.000000,0.0,0.023698,0.0,0.000000,0.0,0.016072,0.029660,0.0,0.0,...,0.034333,0.000000,0.000000,0.0,0.019608,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34970,110,0.055004,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
34970,111,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.041898,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
34970,112,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
34970,113,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## Add stats to VOCAB

In [28]:
# Mean TF-IDF weight of each term over all documents.
VOCAB['tfidf_mean'] = TFIDF.mean()
# Document frequency: number of documents in which the term occurs.
# Summing the boolean mask avoids building a NaN-masked float copy of DTM.
VOCAB['df'] = (DTM > 0).sum()
# DF-IDF = df * log2(N / df), with N the number of documents.
# Bracket access is used because `df` is an easy name to confuse with an
# attribute of the frame.
VOCAB['dfidf'] = VOCAB['df'] * np.log2(len(TFIDF) / VOCAB['df'])

In [29]:
VOCAB.sort_values('dfidf', ascending=False).head(10)

Unnamed: 0,n,n_chars,n_tokens,tfidf_mean,df,dfidf
home,954,4,1,0.011147,437,628.922479
cried,1232,5,1,0.014958,438,628.917316
certain,707,7,1,0.009779,433,628.91004
looking,880,7,1,0.010437,429,628.84429
manner,748,6,1,0.008846,427,628.791269
general,831,7,1,0.010142,427,628.791269
felt,1000,4,1,0.011266,427,628.791269
set,717,3,1,0.009069,445,628.789369
oh,1311,2,1,0.015113,447,628.723524
times,737,5,1,0.01015,447,628.723524


# Method 2: TfidfVectorizer

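This method combines the two steps of Method 1, counting terms (`CountVectorizer`) and weighting the counts (`TfidfTransformer`), into a single `TfidfVectorizer`.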

In [30]:
# TfidfVectorizer?

## Create TFIDF

In [31]:
# One-step vectorizer given the same tokenization settings as `count_engine`
# and the same weighting settings as `tfidf_engine`.
tfidf_engine2 = TfidfVectorizer(
    ngram_range=ngram_range,
    max_features=n_terms,
    stop_words='english',
    use_idf=True,
    norm='l2',
)

In [32]:
# Learn vocabulary and IDF from the document strings and weight them in one call.
X2 = tfidf_engine2.fit_transform(DOC['doc_str'])

In [33]:
# X2 was fitted on DOC.doc_str, so its rows are labelled with DOC's index.
# Using DOC.index (rather than DTM.index) keeps Method 2 independent of the
# Method 1 cells.
TFIDF2 = pd.DataFrame(
    X2.toarray(),
    columns=tfidf_engine2.get_feature_names_out(),
    index=DOC.index,
)

In [34]:
TFIDF2

Unnamed: 0_level_0,Unnamed: 1_level_0,abandoned,abbey,able,aboard,abode,abrazza,abroad,absence,absent,absolute,...,young ladies,young lady,young man,young men,young people,young woman,younger,youngest,youth,youthful
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
105,1,0.000000,0.0,0.011987,0.0,0.000000,0.0,0.016259,0.000000,0.0,0.0,...,0.000000,0.000000,0.028132,0.0,0.000000,0.000000,0.018303,0.021815,0.043166,0.020031
105,2,0.000000,0.0,0.013681,0.0,0.022532,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.022121,0.000000,0.000000,0.000000,0.000000
105,3,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.028749,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.025442,0.000000
105,4,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.070330,0.0,0.000000,0.000000,0.030504,0.000000,0.071942,0.000000
105,5,0.000000,0.0,0.023698,0.0,0.000000,0.0,0.016072,0.029660,0.0,0.0,...,0.034333,0.000000,0.000000,0.0,0.019608,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34970,110,0.055004,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
34970,111,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.041898,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
34970,112,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
34970,113,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## Get VOCAB

In [35]:
# One row per term; `tfidf_mean` is the term's mean weight over all documents.
VOCAB2 = TFIDF2.mean(axis=0).rename('tfidf_mean').to_frame()

In [36]:
VOCAB2

Unnamed: 0,tfidf_mean
abandoned,0.001886
abbey,0.001909
able,0.005772
aboard,0.003997
abode,0.001702
...,...
young woman,0.001313
younger,0.001852
youngest,0.000926
youth,0.004836


## Add stats to VOCAB

In [37]:
# Document frequency: number of documents with a positive weight for the term.
# Summing the boolean mask avoids building a NaN-masked copy of TFIDF2.
VOCAB2['df'] = (TFIDF2 > 0).sum()
# DF-IDF = df * log2(N / df), with N the number of documents.
VOCAB2['dfidf'] = VOCAB2['df'] * np.log2(len(TFIDF2) / VOCAB2['df'])

In [38]:
VOCAB2.sort_values('dfidf', ascending=False).head(10)

Unnamed: 0,tfidf_mean,df,dfidf
home,0.011147,437,628.922479
cried,0.014958,438,628.917316
certain,0.009779,433,628.91004
looking,0.010437,429,628.84429
manner,0.008846,427,628.791269
general,0.010142,427,628.791269
felt,0.011266,427,628.791269
set,0.009069,445,628.789369
oh,0.015113,447,628.723524
times,0.01015,447,628.723524


# Create BOW

In [39]:
# Long-format bag of words: one row per (document, term) pair that occurs,
# with its count `n` and its TF-IDF weight.
# The unmasked tables are stacked and filtered afterwards. Masking with
# DTM[DTM > 0] first would turn the integer counts into floats (NaN needs a
# float dtype) and would depend on stack() silently dropping NaN rows, which
# the newer pandas stack implementation does not do.
BOW = pd.concat(
    [DTM.stack().rename('n'), TFIDF.stack().rename('tfidf')],
    axis=1,
)
BOW = BOW[BOW['n'] > 0]

In [40]:
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,tfidf
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
105,1,able,1.0,0.011987
105,1,abroad,1.0,0.016259
105,1,acknowledged,1.0,0.017843
105,1,acquaintance,4.0,0.050617
105,1,added,2.0,0.022707
...,...,...,...,...
34970,114,ye,1.0,0.041773
34970,114,yes,2.0,0.067853
34970,114,yes yes,1.0,0.068079
34970,114,young,2.0,0.056646
