# Annotating Plato's Dialogues
## Iris Wu (iw5hte@virginia.edu) DS 5001 Spring 2023

## End goal of this notebook:
Annotate these tables with statistical and linguistic features using NLP libraries such as NLTK (F3).

### Setting up necessary tools:

Importing useful packages -

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import nltk

Defining useful filepaths for reading and outputting data -

In [18]:
data_in = 'data/output'
data_out = 'data/output'
data_prefix = 'plato'

Setting useful configurations -

In [19]:
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']
SENTS = OHCO[:4]
PARAS = OHCO[:3]
CHAPS = OHCO[:2]
BOOKS = OHCO[:1]

### Importing the data -

In [20]:
LIB = pd.read_csv(f"{data_in}/{data_prefix}-LIB.csv").set_index(BOOKS)
TOKEN = pd.read_csv(f'{data_in}/{data_prefix}-CORPUS.csv').set_index(OHCO).dropna()
VOCAB = pd.read_csv(f'{data_in}/{data_prefix}-VOCAB.csv').set_index('term_str').dropna()
POS = pd.read_csv(f'{data_in}/misc/upenn_tagset.txt', sep='\t', names=['pos_code','def'])

### Adding Part of Speech data to TOKEN table - 

Adding number of times a part of speech appears - 

In [21]:
POS = POS[POS.pos_code.str.match(r'^\w')].set_index('pos_code') # Keep only letter codes
POS['n'] = TOKEN.pos.value_counts().to_frame().sort_index()
POS['n'] = POS['n'].fillna(0).astype('int')
POS.sort_values('n', ascending=False)
POS

Unnamed: 0_level_0,def,n
pos_code,Unnamed: 1_level_1,Unnamed: 2_level_1
CC,"conjunction, coordinating",48102
CD,"numeral, cardinal",4997
DT,determiner,76691
EX,existential there,2784
FW,foreign word,71
IN,"preposition or conjunction, subordinating",95244
JJ,"adjective or numeral, ordinal",45019
JJR,"adjective, comparative",2640
JJS,"adjective, superlative",1753
LS,list item marker,4


Adding part of speech groups to POS and TOKEN tables - 

In [22]:
POS['pos_group'] = POS.apply(lambda x: x.name[:2], 1)
TOKEN['pos_group'] = TOKEN.pos.str[:2]
TOKEN

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1497,1,1,0,0,"('I', 'PRP')",PRP,I,i,PR
1497,1,1,0,1,"('went', 'VBD')",VBD,went,went,VB
1497,1,1,0,2,"('down', 'RB')",RB,down,down,RB
1497,1,1,0,3,"('yesterday', 'NN')",NN,yesterday,yesterday,NN
1497,1,1,0,4,"('to', 'TO')",TO,to,to,TO
...,...,...,...,...,...,...,...,...,...
1750,12,127,0,5,"('EBook', 'NNP')",NNP,EBook,ebook,NN
1750,12,127,0,6,"('of', 'IN')",IN,of,of,IN
1750,12,127,0,7,"('Laws,', 'NNP')",NNP,"Laws,",laws,NN
1750,12,127,0,8,"('by', 'IN')",IN,by,by,IN


Adding probability and entropy data to POS_GROUP table - 

In [23]:
POS_GROUP = POS.groupby('pos_group').n.sum().to_frame('n')
POS_GROUP = POS_GROUP[POS_GROUP.n > 0]
POS_GROUP['def'] = POS.groupby('pos_group').apply(lambda x: '; '.join(x['def']))
POS_GROUP['p'] = POS_GROUP.n / POS_GROUP.n.sum()
POS_GROUP['i'] = np.log2(1/POS_GROUP.p)
POS_GROUP['h'] = POS_GROUP.p * POS_GROUP.i
POS_GROUP

Unnamed: 0_level_0,n,def,p,i,h
pos_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CC,48102,"conjunction, coordinating",0.065108,3.941017,0.256593
CD,4997,"numeral, cardinal",0.006764,7.20798,0.048752
DT,76691,determiner,0.103805,3.268057,0.33924
EX,2784,existential there,0.003768,8.051883,0.030342
FW,71,foreign word,9.6e-05,13.345079,0.001282
IN,95244,"preposition or conjunction, subordinating",0.128917,2.955486,0.381012
JJ,49412,"adjective or numeral, ordinal; adjective, co...",0.066881,3.902252,0.260988
LS,4,list item marker,5e-06,17.494826,9.5e-05
MD,17699,modal auxiliary,0.023956,5.383446,0.128968
NN,170116,"noun, common, singular or mass; noun, proper...",0.23026,2.118667,0.487843


### Annotating VOCAB table -

Adding probabilities, logs, and numbers of characters for each term -

In [24]:
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,38,0.000051,14.246899,1
10,1,0.000001,19.494826,2
100,13,0.000018,15.794387,3
10000,1,0.000001,19.494826,5
11,1,0.000001,19.494826,2
...,...,...,...,...
zones,9,0.000012,16.324901,5
zopyrus,1,0.000001,19.494826,7
zoroaster,1,0.000001,19.494826,9
zosin,1,0.000001,19.494826,5


Adding all parts of speech - 

In [25]:
VOCAB['cat_pos'] = TOKEN[['term_str','pos']].value_counts().to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(lambda x: set(x))
VOCAB

Unnamed: 0_level_0,n,p,i,n_chars,cat_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,38,0.000051,14.246899,1,"{PDT, CD, NNP, NN, VBZ, VB, NNS}"
10,1,0.000001,19.494826,2,{CD}
100,13,0.000018,15.794387,3,{CD}
10000,1,0.000001,19.494826,5,{CD}
11,1,0.000001,19.494826,2,{CD}
...,...,...,...,...,...
zones,9,0.000012,16.324901,5,"{NNS, NN, VBP}"
zopyrus,1,0.000001,19.494826,7,{NNP}
zoroaster,1,0.000001,19.494826,9,{NNP}
zosin,1,0.000001,19.494826,5,{NN}


Adding most common part of speech annotation -

In [26]:
VOCAB['max_pos'] = TOKEN[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB

Unnamed: 0_level_0,n,p,i,n_chars,cat_pos,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,38,0.000051,14.246899,1,"{PDT, CD, NNP, NN, VBZ, VB, NNS}",CD
10,1,0.000001,19.494826,2,{CD},CD
100,13,0.000018,15.794387,3,{CD},CD
10000,1,0.000001,19.494826,5,{CD},CD
11,1,0.000001,19.494826,2,{CD},CD
...,...,...,...,...,...,...
zones,9,0.000012,16.324901,5,"{NNS, NN, VBP}",NNS
zopyrus,1,0.000001,19.494826,7,{NNP},NNP
zoroaster,1,0.000001,19.494826,9,{NNP},NNP
zosin,1,0.000001,19.494826,5,{NN},NN


Adding stopword annotation -

In [27]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1
VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')
VOCAB

Unnamed: 0_level_0,n,p,i,n_chars,cat_pos,max_pos,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,38,0.000051,14.246899,1,"{PDT, CD, NNP, NN, VBZ, VB, NNS}",CD,0
10,1,0.000001,19.494826,2,{CD},CD,0
100,13,0.000018,15.794387,3,{CD},CD,0
10000,1,0.000001,19.494826,5,{CD},CD,0
11,1,0.000001,19.494826,2,{CD},CD,0
...,...,...,...,...,...,...,...
zones,9,0.000012,16.324901,5,"{NNS, NN, VBP}",NNS,0
zopyrus,1,0.000001,19.494826,7,{NNP},NNP,0
zoroaster,1,0.000001,19.494826,9,{NNP},NNP,0
zosin,1,0.000001,19.494826,5,{NN},NN,0


Adding three kinds of stems - 

In [28]:
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

from nltk.stem.snowball import SnowballStemmer
stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = VOCAB.apply(lambda x: stemmer2.stem(x.name), 1)

from nltk.stem.lancaster import LancasterStemmer
stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = VOCAB.apply(lambda x: stemmer3.stem(x.name), 1)
VOCAB

Unnamed: 0_level_0,n,p,i,n_chars,cat_pos,max_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,38,0.000051,14.246899,1,"{PDT, CD, NNP, NN, VBZ, VB, NNS}",CD,0,1,1,1
10,1,0.000001,19.494826,2,{CD},CD,0,10,10,10
100,13,0.000018,15.794387,3,{CD},CD,0,100,100,100
10000,1,0.000001,19.494826,5,{CD},CD,0,10000,10000,10000
11,1,0.000001,19.494826,2,{CD},CD,0,11,11,11
...,...,...,...,...,...,...,...,...,...,...
zones,9,0.000012,16.324901,5,"{NNS, NN, VBP}",NNS,0,zone,zone,zon
zopyrus,1,0.000001,19.494826,7,{NNP},NNP,0,zopyru,zopyrus,zopyr
zoroaster,1,0.000001,19.494826,9,{NNP},NNP,0,zoroast,zoroast,zoroast
zosin,1,0.000001,19.494826,5,{NN},NN,0,zosin,zosin,zosin


Generating TPM table and total number of parts of speech for VOCAB table - 

In [29]:
TPM = TOKEN[['term_str','pos']].value_counts().unstack()
TPM

pos,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,LS,...,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,24.0,,,,,,,,,...,2.0,,,,,2.0,,,,
10,,1.0,,,,,,,,,...,,,,,,,,,,
100,,13.0,,,,,,,,,...,,,,,,,,,,
10000,,1.0,,,,,,,,,...,,,,,,,,,,
11,,1.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zones,,,,,,,,,,,...,,,,,1.0,,,,,
zopyrus,,,,,,,,,,,...,,,,,,,,,,
zoroaster,,,,,,,,,,,...,,,,,,,,,,
zosin,,,,,,,,,,,...,,,,,,,,,,


In [30]:
VOCAB['n_pos'] = TPM.count(1)
VOCAB

Unnamed: 0_level_0,n,p,i,n_chars,cat_pos,max_pos,stop,stem_porter,stem_snowball,stem_lancaster,n_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,38,0.000051,14.246899,1,"{PDT, CD, NNP, NN, VBZ, VB, NNS}",CD,0,1,1,1,7
10,1,0.000001,19.494826,2,{CD},CD,0,10,10,10,1
100,13,0.000018,15.794387,3,{CD},CD,0,100,100,100,1
10000,1,0.000001,19.494826,5,{CD},CD,0,10000,10000,10000,1
11,1,0.000001,19.494826,2,{CD},CD,0,11,11,11,1
...,...,...,...,...,...,...,...,...,...,...,...
zones,9,0.000012,16.324901,5,"{NNS, NN, VBP}",NNS,0,zone,zone,zon,3
zopyrus,1,0.000001,19.494826,7,{NNP},NNP,0,zopyru,zopyrus,zopyr,1
zoroaster,1,0.000001,19.494826,9,{NNP},NNP,0,zoroast,zoroast,zoroast,1
zosin,1,0.000001,19.494826,5,{NN},NN,0,zosin,zosin,zosin,1


### Outputting all the tables as csvs - 

In [31]:
LIB.to_csv(f'{data_out}/{data_prefix}-LIB.csv')
VOCAB.to_csv(f'{data_out}/{data_prefix}-VOCAB.csv')
TOKEN.to_csv(f'{data_out}/{data_prefix}-CORPUS.csv')
POS.to_csv(f"{data_out}/{data_prefix}-POS.csv")
POS_GROUP.to_csv(f"{data_out}/{data_prefix}-POS_GROUP.csv")