# Vectorizing Plato's Dialogues
## Iris Wu (iw5hte@virginia.edu) DS 5001 Spring 2023

## End goal of this notebook:
Produce a vector representation of the corpus to generate TFIDF values to add to the TOKEN (aka CORPUS) and VOCAB tables (F4).

### Setting up necessary tools:

Importing useful packages -

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import nltk

Defining useful filepaths for reading and outputting data -

In [26]:
# File-name stem shared by every table read or written in this notebook.
data_prefix = 'plato'
# Directory the input tables are read from.
data_in = 'data/output'
# Directory the enriched tables are written to (currently the same directory).
data_out = 'data/output'

Setting useful configurations -

In [27]:
# Ordered hierarchy of content objects, from coarsest (book) to finest (token).
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

# Index prefixes for each grouping level: the first 1, 2, 3 and 4 OHCO levels.
BOOKS, CHAPS, PARAS, SENTS = (OHCO[:depth] for depth in (1, 2, 3, 4))

In [28]:
# Bag ("document") level for the bag-of-words and TFIDF: CHAPS groups tokens by
# (book_id, chap_id), so each chapter is treated as one document.
bag = CHAPS

### Importing the data -

In [29]:
# Library table, indexed by the BOOKS level (book_id).
LIB = (
    pd.read_csv(f"{data_in}/{data_prefix}-LIB.csv")
    .set_index(BOOKS)
)

# Token table, indexed by the full OHCO; rows containing any missing value are dropped.
TOKEN = (
    pd.read_csv(f'{data_in}/{data_prefix}-CORPUS.csv')
    .set_index(OHCO)
    .dropna()
)

# Vocabulary table, indexed by term_str; rows containing any missing value are dropped.
VOCAB = (
    pd.read_csv(f'{data_in}/{data_prefix}-VOCAB.csv')
    .set_index('term_str')
    .dropna()
)

# Part-of-speech group table, indexed by pos_group.
POS_GROUP = (
    pd.read_csv(f'{data_in}/{data_prefix}-POS_GROUP.csv')
    .set_index('pos_group')
)

### Defining useful functions to create a bag of words and calculate TFIDF

In [30]:
def make_bow(corp, b):
    """Build a bag-of-words table of term counts per bag.

    Parameters
    ----------
    corp : pandas.DataFrame
        Token table with a ``term_str`` column; the names in ``b`` must be
        resolvable by ``groupby`` (index levels or columns).
    b : list of str
        Grouping keys that define one bag (document).

    Returns
    -------
    pandas.DataFrame
        One row per (bag keys..., term_str) with a single column ``n`` holding
        the number of non-null ``term_str`` occurrences in that bag.
    """
    group_keys = b + ['term_str']
    term_counts = corp.groupby(by=group_keys)['term_str'].count()
    return term_counts.rename('n').to_frame()

In [31]:
def calc_TFIDF(bow, measure, tf_norm_k=0.5):
    """Compute document frequency, inverse document frequency and TFIDF.

    Parameters
    ----------
    bow : pandas.DataFrame
        Bag-of-words table with a column ``n`` of term counts and a MultiIndex
        whose last level is the term; the remaining levels identify a document.
    measure : str
        Term-frequency variant, one of:
        ``'sum'`` (count / total count in the document),
        ``'max'`` (count / largest count in the document),
        ``'log'`` (log2(1 + count)),
        ``'raw'`` (count),
        ``'double_norm'`` (K + (1 - K) * count / largest count, for terms that
        occur in the document; absent terms stay at 0),
        ``'binary'`` (1 if the term occurs in the document, else 0).
    tf_norm_k : float, default 0.5
        Smoothing constant K used only by ``'double_norm'``.

    Returns
    -------
    list
        ``[DF, IDF, TFIDF]``: DF and IDF are Series indexed by term; TFIDF is a
        document-by-term DataFrame.

    Raises
    ------
    ValueError
        If ``measure`` is not one of the supported names.
    """
    supported = ('sum', 'max', 'log', 'raw', 'double_norm', 'binary')
    if measure not in supported:
        raise ValueError(
            f"Unknown TF measure {measure!r}; expected one of {supported}"
        )

    # Document-term count matrix: one row per document, one column per term.
    DTCM = bow.n.unstack().fillna(0).astype('int')
    # Term-by-document view, so per-document reductions broadcast over columns.
    counts = DTCM.T

    print('TF method:', measure)
    if measure == 'sum':
        TF = counts / counts.sum()
    elif measure == 'max':
        TF = counts / counts.max()
    elif measure == 'log':
        TF = np.log2(1 + counts)
    elif measure == 'raw':
        TF = counts
    elif measure == 'double_norm':
        # Max-normalize, then smooth with K. Only terms present in a document
        # are smoothed, so absent terms keep a weight of exactly 0.
        TF = counts / counts.max()
        TF = (tf_norm_k + (1 - tf_norm_k) * TF).where(counts > 0, 0)
    else:  # measure == 'binary'
        TF = counts.astype('bool').astype('int')
    TF = TF.T

    # DF: number of documents in which each term occurs at least once.
    DF = DTCM.astype('bool').sum()
    N = DTCM.shape[0]
    IDF = np.log2(N / DF)
    TFIDF = TF * IDF
    return [DF, IDF, TFIDF]

### Calculating TFIDF with the `sum` TF method (each term count divided by the total token count of its chapter)

In [32]:
# Count terms per bag, then weight them using the sum-normalized TF variant.
bow = make_bow(TOKEN, bag)
tf_idf_agg = calc_TFIDF(bow, 'sum')

# calc_TFIDF returns its results in the order [DF, IDF, TFIDF].
df, idf, tf_idf = tf_idf_agg
tf_idf

TF method: sum


Unnamed: 0_level_0,term_str,1,10,100,10000,11,12,120,1248,13927,151,...,zeus,zeuxippus,zeuxis,zodiac,zone,zones,zopyrus,zoroaster,zosin,zugon
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1497,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,3,0.0,0.0,0.0,0.0,0.000417,0.000303,0.0,0.0,0.0,0.0,...,0.000201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,7,0.000215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,8,0.00044,0.0,0.003031,0.000514,0.0,0.0,0.0,0.0,0.0,0.0,...,4.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.8e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Adding TFIDF data to other DataFrames

In [33]:
# Stack the document-by-term TFIDF matrix into long form; it aligns with bow on
# the shared (bag keys..., term_str) index.
bow = bow.assign(tfidf=tf_idf.stack())
bow

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,tfidf
book_id,chap_id,term_str,Unnamed: 3_level_1,Unnamed: 4_level_1
1497,1,a,213,0.000000
1497,1,abiding,1,0.000190
1497,1,able,13,0.000172
1497,1,abler,1,0.000390
1497,1,abode,1,0.000261
...,...,...,...,...
1750,12,yourself,1,0.000039
1750,12,youth,2,0.000057
1750,12,youths,1,0.000135
1750,12,zeal,1,0.000210


Adding TFIDF data to VOCAB table -

In [34]:
# Per-term TFIDF values taken from bow, i.e. only from bags where the term occurs.
tfidf_by_term = bow.groupby('term_str')['tfidf']

VOCAB = (
    VOCAB
    .assign(
        tfidf_mean=tfidf_by_term.mean(),
        # Sum and max are taken over the full document-by-term matrix.
        tfidf_sum=tf_idf.sum(),
        tfidf_median=tfidf_by_term.median(),
        tfidf_max=tf_idf.max(),
        df=df,
        idf=idf,
    )
    .assign(dfidf=lambda vocab: vocab['df'] * vocab['idf'])
    # Most distinctive terms first; terms with any missing statistic are dropped.
    .sort_values(['tfidf_sum'], ascending=False)
    .dropna()
)
VOCAB

Unnamed: 0_level_0,n,p,i,n_chars,cat_pos,max_pos,stop,stem_porter,stem_snowball,stem_lancaster,n_pos,tfidf_mean,tfidf_sum,tfidf_median,tfidf_max,df,idf,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
socrates,5337,0.007224,7.113013,8,"{'MD', 'IN', 'NNPS', 'PDT', 'NNP', 'NN', 'POS'...",NNP,0,socrat,socrat,socr,16,0.004553,0.177549,0.001816,0.017516,39,0.521953,20.356155
alcibiades,612,0.000828,10.237438,10,"{'IN', 'NNP', 'NN', 'POS', 'VBP', 'VBZ', 'NNS'...",NNP,0,alcibiad,alcibiad,alcibiad,9,0.027458,0.164746,0.002359,0.104657,6,3.222392,19.334355
theaetetus,1100,0.001489,9.391538,10,"{'IN', 'NNP', 'NN', 'POS', 'VBP', 'VB', 'VBN',...",NNP,0,theaetetu,theaetetus,theaetet,9,0.041168,0.164672,0.030031,0.103966,4,3.807355,15.229420
protarchus,619,0.000838,10.221031,10,"{'NNP', 'NN', 'VBZ', 'VB', 'NNS', 'VBN'}",NNP,0,protarchu,protarchus,protarch,6,0.153635,0.153635,0.153635,0.153635,1,5.807355,5.807355
cleinias,946,0.001280,9.609130,8,"{'IN', 'PDT', 'NNP', 'NN', 'VB', 'VBN'}",NNP,0,cleinia,cleinia,cleinia,6,0.008232,0.139937,0.006393,0.020605,17,1.719892,29.238165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
all,3694,0.005000,7.643858,3,"{'IN', 'PDT', 'DT', 'CD', 'NNP', 'NN', 'POS', ...",DT,1,all,all,al,16,0.000000,0.000000,0.000000,0.000000,56,0.000000,0.000000
in,13321,0.018031,5.793412,2,"{'IN', 'CD', 'POS', 'NN', 'NNP', 'VBP', 'VB', ...",IN,1,in,in,in,10,0.000000,0.000000,0.000000,0.000000,56,0.000000,0.000000
some,1293,0.001750,9.158320,4,"{'DT', 'CD', 'NN', 'NNS', 'VB', 'JJ'}",DT,1,some,some,som,6,0.000000,0.000000,0.000000,0.000000,56,0.000000,0.000000
we,5200,0.007038,7.150530,2,"{'NNP', 'NN', 'POS', 'VBP', 'VBD', 'VBZ', 'VB'...",PRP,1,we,we,we,11,0.000000,0.000000,0.000000,0.000000,56,0.000000,0.000000


Adding TFIDF data to the TOKEN table (each token receives the TFIDF of its term within its chapter) -

In [37]:
# Give every token the TFIDF of its term within its bag. The keys come from the
# `bag` and `OHCO` configuration rather than hard-coded column names, so this
# cell stays aligned with the level at which tf_idf was computed.
bag_term_keys = bag + ['term_str']

# Temporarily index TOKEN by (bag keys..., term_str) so the stacked TFIDF
# series, which carries the same index levels, aligns row by row.
TOKEN = TOKEN.reset_index().set_index(bag_term_keys)
TOKEN['tfidf'] = tf_idf.stack()

# Restore the full OHCO index.
TOKEN = TOKEN.reset_index().set_index(OHCO)
TOKEN

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,term_str,pos_tuple,pos,token_str,pos_group,tfidf
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1497,1,1,0,0,i,"('I', 'PRP')",PRP,I,PR,0.002478
1497,1,1,0,1,went,"('went', 'VBD')",VBD,went,VB,0.000196
1497,1,1,0,2,down,"('down', 'RB')",RB,down,RB,0.000124
1497,1,1,0,3,yesterday,"('yesterday', 'NN')",NN,yesterday,NN,0.000133
1497,1,1,0,4,to,"('to', 'TO')",TO,to,TO,0.000000
...,...,...,...,...,...,...,...,...,...,...
1750,12,127,0,5,ebook,"('EBook', 'NNP')",NNP,EBook,NN,0.000091
1750,12,127,0,6,of,"('of', 'IN')",IN,of,IN,0.000000
1750,12,127,0,7,laws,"('Laws,', 'NNP')",NNP,"Laws,",NN,0.001053
1750,12,127,0,8,by,"('by', 'IN')",IN,by,IN,0.000000


### Outputting all the tables as csvs - 

In [38]:
# Destination path -> table, written in insertion order.
tables_out = {
    f'{data_out}/{data_prefix}-LIB.csv': LIB,
    f'{data_out}/{data_prefix}-VOCAB-CHAP.csv': VOCAB,
    f'{data_out}/{data_prefix}-CORPUS-CHAP.csv': TOKEN,
    f'{data_out}/{data_prefix}-BOW-CHAP.csv': bow,
}
for csv_path, table in tables_out.items():
    table.to_csv(csv_path)