In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from scipy.linalg import norm, eigh

In [2]:
# Column names that form the hierarchical index of the token table, ordered
# from the coarsest unit (review) down to the finest (token). Prefixes of this
# list are used below to choose the bag level.
OHCO = [
    'review_id',
    'para_num',
    'sent_num',
    'token_num',
]

## Bag of Words (BOW)

In [3]:
# Token table, indexed by the OHCO columns.
TOKENS = pd.read_csv('TOKENS.csv').set_index(OHCO)

# Vocabulary table, indexed by the term string.
VOCAB = pd.read_csv('VOCAB.csv').set_index('term_str')

In [4]:
# Each bag level is a prefix of OHCO: the fewer index levels kept, the larger
# the span of text that is treated as one "document".
bags = {
    'SENTS': OHCO[:3],
    'PARAS': OHCO[:2],
    'REVIEWS': OHCO[:1],
}

# Bag level used to build the bag-of-words table below.
bag = 'REVIEWS'

# Count how many times each term occurs inside each bag; the result is a
# one-column frame ('n') indexed by the bag levels plus term_str.
BOW = (
    TOKENS
    .groupby(bags[bag] + ['term_str'])['term_str']
    .count()
    .to_frame('n')
)

In [5]:
# Pivot the long count table into a document-term matrix: the innermost index
# level (term_str) becomes the columns, and missing combinations are filled
# with 0.
DTM = BOW['n'].unstack(fill_value=0)

In [6]:
# --- Configuration -----------------------------------------------------------
tf_method = 'sum'         # sum, max, log, double_norm, raw, binary
tf_norm_k = .5            # only used for double_norm
idf_method = 'standard'   # standard, max, smooth
gradient_cmap = 'YlGnBu'  # YlGn, GnBu, YlGnBu; For tables; see https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html

# --- Term frequency ----------------------------------------------------------
# DTM has one row per document (bag) and one column per term, so every
# per-document statistic is taken along axis=1 and broadcast back down the
# rows with axis=0.
tf = {
    # Count divided by the total number of tokens in the document.
    'sum': DTM.div(DTM.sum(axis=1), axis=0),
    # Count divided by the count of the document's most frequent term.
    'max': DTM.div(DTM.max(axis=1), axis=0),
    # Log-dampened count.
    'log': np.log2(1 + DTM),
    # Unmodified counts.
    'raw': DTM,
    # Double normalization: k + (1 - k) * count / max_count. Previously this
    # entry was identical to 'max' and tf_norm_k was never applied. The mask
    # keeps terms that do not occur in a document at 0 instead of lifting
    # every empty cell to k.
    'double_norm': (
        tf_norm_k + (1 - tf_norm_k) * DTM.div(DTM.max(axis=1), axis=0)
    ).where(DTM > 0, 0),
    # 1 if the term occurs in the document, else 0.
    'binary': DTM.astype('bool').astype('int'),
}

TF = tf[tf_method]

# --- Document frequency ------------------------------------------------------
# Number of documents in which each term occurs at least once.
DF = DTM.astype('bool').sum()

# Number of documents.
N = DTM.shape[0]

# --- Inverse document frequency ----------------------------------------------
idf = {
    'standard': np.log2(N / DF),
    'max': np.log2(DF.max() / DF),
    'smooth': np.log2((1 + N) / (1 + DF)) + 1,
}

IDF = idf[idf_method]

# --- Combined weights --------------------------------------------------------
# TF is documents x terms and IDF is indexed by term, so the product scales
# each term column by that term's IDF.
TFIDF = TF * IDF
# Per-term document frequency weighted by the same IDF.
DFIDF = DF * IDF

In [7]:
# Attach the per-term statistics to the vocabulary table. Each Series is
# aligned to VOCAB by its term_str index.
VOCAB = VOCAB.assign(df=DF, idf=IDF, dfidf=DFIDF)

# Spot-check a random handful of terms.
VOCAB.sample(n=10)

Unnamed: 0_level_0,n,p,i,porter_stem,max_pos,max_pos_group,stop,df,idf,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1850,1,7.523973e-07,20.342002,1850,CD,CD,0,1.0,12.287712,12.287712
clichésbut,1,7.523973e-07,20.342002,clichésbut,NN,NN,0,1.0,12.287712,12.287712
metalbeast,3,2.257192e-06,18.757039,metalbeast,NNP,NN,0,1.0,12.287712,12.287712
strip,24,1.805754e-05,15.757039,strip,NN,NN,0,22.0,7.828281,172.222177
commenced,1,7.523973e-07,20.342002,commenc,VBN,VB,0,1.0,12.287712,12.287712
girlfriendhe,1,7.523973e-07,20.342002,girlfriendh,NN,NN,0,1.0,12.287712,12.287712
devos,2,1.504795e-06,19.342002,devo,NNP,NN,0,2.0,11.287712,22.575425
swarms,4,3.009589e-06,18.342002,swarm,NNS,NN,0,2.0,11.287712,22.575425
immodest,1,7.523973e-07,20.342002,immodest,JJ,JJ,0,1.0,12.287712,12.287712
ze,2,1.504795e-06,19.342002,ze,NN,NN,0,1.0,12.287712,12.287712


In [8]:
# Persist the vocabulary with its new df / idf / dfidf columns. This writes to
# the same path the table was read from, so the input file is overwritten.
VOCAB.to_csv('VOCAB.csv')

In [9]:
# Add the weights to the long-format table. Stacking turns each matrix into a
# Series keyed by (document, term), which is aligned to BOW's index.
BOW = BOW.assign(tf=TF.stack(), tfidf=TFIDF.stack())

# Spot-check a random handful of rows.
BOW.sample(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,tf,tfidf
review_id,term_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1944,described,1,0.002008,0.013013
2350,feelgood,1,0.002004,0.017968
973,who,5,0.012048,0.013741
2317,recommended,1,0.005051,0.027744
3510,fixed,1,0.003676,0.031995
194,storyline,1,0.01,0.050589
477,either,1,0.002404,0.009722
647,cover,1,0.00361,0.020866
4045,opens,1,0.001312,0.008471
506,last,1,0.003759,0.012521


In [10]:
# Write the bag-of-words table (index included) to BOW.csv in the working
# directory.
BOW.to_csv('BOW.csv')

In [11]:
# L2-normalize every document vector (row) so it has unit Euclidean
# (Pythagorean) length.
#
# The row norms are computed in one vectorized pass instead of calling a Python
# lambda once per row. numpy is used directly rather than `norm`, because that
# name is imported from numpy.linalg and then shadowed by scipy.linalg at the
# top of the notebook.
#
# A row whose weights are all zero has norm 0; dividing by it would turn the
# whole row into NaN. Such rows are divided by 1 instead, so they stay all-zero.
TFIDF_L2 = TFIDF.div(
    np.sqrt(np.square(TFIDF).sum(axis=1)).replace(0, 1),
    axis=0,
)

In [12]:
# Preview the first five rows of the normalized matrix.
TFIDF_L2.head()

term_str,0,00,000,001,001p,003400,007,00s,01,010,...,à,álex,ángel,ángela,é,édith,élan,í,ôkami,überdreadful
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Write both weight matrices to CSV in the working directory: the raw TF-IDF
# weights and the L2-normalized version. Both are full document x term
# matrices, zeros included.
TFIDF.to_csv('TFIDF.csv')
TFIDF_L2.to_csv('TFIDF_L2.csv')