# Vectorizing Plato's Dialogues
## Iris Wu (iw5hte@virginia.edu) DS 5001 Spring 2023

## End goal of this notebook:
Produce a vector representation of the corpus to generate TFIDF values to add to the TOKEN (aka CORPUS) and VOCAB tables (F4).

### Setting up necessary tools:

Importing useful packages -

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import nltk

Defining useful filepaths for reading and outputting data -

In [2]:
# All tables are named <data_prefix>-<TABLE>.csv.
data_prefix = 'plato'
# The notebook reads its tables from, and writes them back to, the same folder.
data_in = data_out = 'data/output'

Setting useful configurations -

In [3]:
# Ordered hierarchy of content objects, from coarsest (book) to finest (token).
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']
# Each aggregation level is the prefix of OHCO that ends at that level.
BOOKS, CHAPS, PARAS, SENTS = (OHCO[:depth] for depth in range(1, len(OHCO)))

In [80]:
# Bag level for the bag-of-words model: PARAS groups tokens by book, chapter and paragraph.
bag = PARAS

### Importing the data -

In [5]:
# Library table, one row per book.
LIB = (
    pd.read_csv(f"{data_in}/{data_prefix}-LIB.csv")
    .set_index(BOOKS)
)
# Token table, indexed by the full OHCO; rows containing any missing value are dropped.
TOKEN = (
    pd.read_csv(f'{data_in}/{data_prefix}-CORPUS.csv')
    .set_index(OHCO)
    .dropna()
)
# Vocabulary table, indexed by term; rows containing any missing value are dropped.
VOCAB = (
    pd.read_csv(f'{data_in}/{data_prefix}-VOCAB.csv')
    .set_index('term_str')
    .dropna()
)
# Part-of-speech group lookup table.
POS_GROUP = (
    pd.read_csv(f'{data_in}/{data_prefix}-POS_GROUP.csv')
    .set_index('pos_group')
)

### Defining useful functions to create a bag of words and calculate TF IDF

In [6]:
def make_bow(corp, b):
    """Count how often each term occurs inside each bag.

    Parameters
    ----------
    corp : pandas.DataFrame
        Token table with a ``term_str`` column. Every name in ``b`` must be
        something ``groupby`` can resolve on ``corp`` (index level or column).
    b : list of str
        Grouping keys that define one bag.

    Returns
    -------
    pandas.DataFrame
        A single column ``n`` holding the number of non-null ``term_str``
        values per group, indexed by ``b + ['term_str']``.
    """
    term_counts = corp.groupby(b + ['term_str'])['term_str'].count()
    return term_counts.to_frame('n')

In [75]:
def calc_TFIDF(bow, measure, norm_k=0.5):
    """Compute document frequency, inverse document frequency and TFIDF.

    Parameters
    ----------
    bow : pandas.DataFrame
        Bag-of-words table with a count column ``n``. Its innermost index
        level is pivoted into columns (as produced by ``make_bow``, that level
        is ``term_str``); the remaining levels identify a document.
    measure : str
        Term-frequency weighting, one of:
        ``'sum'``         count / total count in the document
        ``'max'``         count / largest count in the document
        ``'log'``         log2(1 + count)
        ``'raw'``         the count itself
        ``'double_norm'`` norm_k + (1 - norm_k) * count / largest count,
                          applied to terms present in the document
        ``'binary'``      1 if the term is present, else 0
    norm_k : float, default 0.5
        Smoothing constant used only by ``'double_norm'``.

    Returns
    -------
    list
        ``[DF, IDF, TFIDF]``: DF and IDF are Series indexed by term; TFIDF is
        a documents-by-terms DataFrame.

    Raises
    ------
    ValueError
        If ``measure`` is not one of the supported names.
    """
    # Document-term count matrix: one row per document, one column per term.
    DTCM = bow.n.unstack().fillna(0).astype('int')
    print('TF method:', measure)

    # Work term-by-document so column reductions (sum/max) are per document.
    counts = DTCM.T
    if measure == 'sum':
        TF = counts / counts.sum()
    elif measure == 'max':
        TF = counts / counts.max()
    elif measure == 'log':
        TF = np.log2(1 + counts)
    elif measure == 'raw':
        TF = counts
    elif measure == 'double_norm':
        scaled = counts / counts.max()
        # Absent terms keep a weight of 0 (consistent with every other
        # measure) rather than receiving the norm_k floor.
        TF = (norm_k + (1 - norm_k) * scaled).where(counts > 0, 0.0)
    elif measure == 'binary':
        TF = counts.astype('bool').astype('int')
    else:
        raise ValueError(
            f"Unknown TF measure {measure!r}; expected one of "
            "'sum', 'max', 'log', 'raw', 'double_norm', 'binary'"
        )
    TF = TF.T

    # Number of documents each term occurs in.
    DF = DTCM.astype('bool').sum()
    N = DTCM.shape[0]
    IDF = np.log2(N / DF)
    TFIDF = TF * IDF
    return [DF, IDF, TFIDF]

### Calculating TFIDF with sum-normalized term frequency

In [81]:
# Count terms per bag, then weight them with sum-normalized TFIDF.
bow = make_bow(TOKEN, bag)
tf_idf_agg = calc_TFIDF(bow, 'sum')
# calc_TFIDF returns its results in the order [DF, IDF, TFIDF].
df, idf, tf_idf = tf_idf_agg
tf_idf

TF method: sum


Unnamed: 0_level_0,Unnamed: 1_level_0,term_str,1,10,100,10000,11,12,120,1248,13927,151,...,zeus,zeuxippus,zeuxis,zodiac,zone,zones,zopyrus,zoroaster,zosin,zugon
book_id,chap_id,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1497,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,1,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1750,12,121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1750,12,122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1750,12,123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1750,12,124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Adding TFIDF data to other DataFrames

In [82]:
# Reshape the wide TFIDF matrix to long form and attach it to the bag of words.
# Column assignment aligns on bow's index, so each (bag, term) row already in
# bow picks up its matching TFIDF value.
bow['tfidf'] = tf_idf.stack()
bow

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n,tfidf
book_id,chap_id,para_num,term_str,Unnamed: 4_level_1,Unnamed: 5_level_1
1497,1,1,a,2,0.028596
1497,1,1,also,1,0.030383
1497,1,1,and,6,0.049761
1497,1,1,ariston,1,0.085455
1497,1,1,artemis,1,0.085455
...,...,...,...,...,...
1750,12,127,laws,1,0.598648
1750,12,127,of,2,0.261360
1750,12,127,plato,1,0.767098
1750,12,127,project,1,0.970073


Adding TFIDF data to VOCAB table -

In [83]:
# Mean and median are taken over the bags in which a term actually occurs
# (rows of bow); sum and max are taken over the full TFIDF matrix.
tfidf_by_term = bow.groupby('term_str')['tfidf']
VOCAB['tfidf_mean'] = tfidf_by_term.mean()
VOCAB['tfidf_sum'] = tf_idf.sum()
VOCAB['tfidf_median'] = tfidf_by_term.median()
VOCAB['tfidf_max'] = tf_idf.max()
VOCAB['df'] = df
VOCAB['idf'] = idf
VOCAB['dfidf'] = VOCAB['df'] * VOCAB['idf']
# Rank terms by total TFIDF and discard rows that still have a missing value.
VOCAB = VOCAB.sort_values(['tfidf_sum'], ascending=False).dropna()
VOCAB

Unnamed: 0_level_0,n,p,i,n_chars,cat_pos,max_pos,stop,stem_porter,stem_snowball,stem_lancaster,n_pos,tfidf_mean,tfidf_sum,tfidf_median,tfidf_max,df,idf,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
yes,1751,0.002370,8.720863,3,"{'NNP', 'POS', 'IN', 'VBP', 'CD', 'PRP', 'VB',...",NNP,0,ye,yes,ye,17,1.226361,2133.867957,0.880205,3.520820,1740,3.520820,6126.226105
true,2218,0.003002,8.379783,4,"{'NNP', 'IN', 'VBP', 'VBG', 'JJ', 'VB', 'VBD',...",JJ,0,true,true,tru,12,0.969068,2001.125406,0.818441,3.273765,2065,3.273765,6760.324983
certainly,1240,0.001678,9.218702,9,"{'NNP', 'POS', 'VBP', 'JJ', 'VB', 'VBD', 'NN',...",NNP,0,certainli,certain,certain,10,1.595651,1969.033328,1.338855,4.016565,1234,4.016565,4956.440608
theaetetus,1100,0.001489,9.391538,10,"{'NNP', 'IN', 'POS', 'VBP', 'VB', 'TO', 'NN', ...",NNP,0,theaetetu,theaetetus,theaetet,9,1.101907,1203.282028,1.048234,2.096467,1092,4.192934,4578.683983
cleinias,946,0.001280,9.609130,8,"{'NNP', 'IN', 'PDT', 'VB', 'NN', 'VBN'}",NNP,0,cleinia,cleinia,cleinia,6,0.959805,892.618185,0.884921,2.212302,930,4.424604,4114.881985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
hop,1,0.000001,19.494826,3,{'VB'},VB,0,hop,hop,hop,1,0.007252,0.007252,0.007252,0.007252,1,14.285691,14.285691
otys,1,0.000001,19.494826,4,{'NNP'},NNP,0,oti,oti,oty,1,0.007252,0.007252,0.007252,0.007252,1,14.285691,14.285691
halved,1,0.000001,19.494826,6,{'VBN'},VBN,0,halv,halv,halv,1,0.007252,0.007252,0.007252,0.007252,1,14.285691,14.285691
apple,1,0.000001,19.494826,5,{'NN'},NN,0,appl,appl,appl,1,0.007252,0.007252,0.007252,0.007252,1,14.285691,14.285691


Adding TFIDF data to the TOKEN table -

In [None]:
# Temporarily key TOKEN by (bag levels, term) so every token row lines up with
# the TFIDF value of its term within its bag. The key is derived from `bag`,
# so this cell stays correct when a different bag level is chosen.
bag_term_key = bag + ['term_str']
TOKEN = TOKEN.reset_index().set_index(bag_term_key)
# bow already holds one TFIDF value per observed (bag, term) pair, so the
# dense documents-by-terms matrix does not need to be stacked a second time.
TOKEN['tfidf'] = bow['tfidf']
# Restore the full OHCO index.
TOKEN = TOKEN.reset_index().set_index(OHCO)
TOKEN

### Outputting all the tables as csvs - 

In [None]:
# Write each table to <data_out>/<data_prefix>-<NAME>.csv; TOKEN is stored
# under the name CORPUS.
tables_to_save = [('LIB', LIB), ('VOCAB', VOCAB), ('CORPUS', TOKEN), ('BOW', bow)]
for table_name, table in tables_to_save:
    table.to_csv(f'{data_out}/{data_prefix}-{table_name}.csv')