## Data Cleaning

In [2]:
# Import Necessary Packages
import pandas as pd
import numpy as np
import nltk 
import os 
from nltk.stem.porter import PorterStemmer
from sklearn.decomposition import PCA
from scipy.linalg import norm
from scipy.linalg import eigh
from scipy.spatial.distance import pdist
import scipy.cluster.hierarchy as sch

In [3]:
# Setting up import
# OHCO: ordered index levels used for the token tables, from book down to token.
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
# Project root that contains the data_in/ and data_out/ folders.
# Set the PLATO_PROJECT_DIR environment variable to run on another machine;
# the original hard-coded location remains the default.
path = os.environ.get('PLATO_PROJECT_DIR',
                      '/Users/kritchanwong/Downloads/Plato-project-5001-main/')

In [4]:
# Creating LIB Tables
# `title` and `period` are parallel lists: period[i] is the period label
# assigned to title[i].
title = ['Timaeus', 'Theaetetus', 'Republic', 'Symposium',
         'Statesman', 'Sophist', 'Protagoras', 'Philebus',
         'Phaedrus', 'Phaedo', 'Parmenides', 'Meno',
         'Menexenus', 'Lysis', 'Laws', 'Laches', 'Ion',
         'Gorgias', 'Euthyphro', 'Euthydemus', 'Crito',
         'Critias', 'Cratylus', 'Charmides', 'Apology']

period = ['Late', 'Middle', 'Middle', 'Middle',
          'Late', 'Late', 'Early', 'Late',
          'Middle', 'Middle', 'Middle', 'Middle',
          'Unknown', 'Early', 'Late', 'Early', 'Early',
          'Early', 'Early', 'Middle', 'Early',
          'Late', 'Middle', 'Early', 'Early']

# The author column is sized from `title` rather than a literal 25, so the
# table stays consistent if a dialogue is added or removed. pandas raises a
# ValueError if `title` and `period` ever differ in length.
lib = {'book_id': title, 'Period': period, 'Author': ['Plato'] * len(title)}
LIB = pd.DataFrame(lib)

In [5]:
# Make tokeniser function
def tokeniser(epub_file, first_line, book_val, chap_val):
    """Convert one plain-text file into a table of POS-tagged tokens.

    The file is read from ``path + '/data_in/'``, trimmed so that it starts at
    the first line matching ``first_line``, split into chapters, paragraphs
    (blank-line separated), sentences (``nltk.sent_tokenize``) and tokens
    (``nltk.word_tokenize`` + ``nltk.pos_tag``).

    Parameters
    ----------
    epub_file : str
        File name inside ``data_in``. The part before the first ``.`` is used
        as the ``book_id``.
    first_line : str
        Regex (applied with ``str.match``, i.e. anchored at line start) that
        marks the first line to keep; everything before it is discarded.
    book_val : int
        ``0`` means lines matching ``first_line`` are the chapter headings;
        any other value means lines matching ``chap_val`` are.
    chap_val : str or int
        Regex for chapter heading lines; only used when ``book_val`` is not 0.

    Returns
    -------
    pandas.DataFrame
        One row per token with columns ``pos_tuple`` (the ``(token, tag)``
        pair), ``pos`` and ``token_str``. The first three index levels are
        named by ``OHCO[:3]``; the sentence and token levels are unnamed.

    Raises
    ------
    ValueError
        If no line of the file matches ``first_line``.
    """
    # Reading in the text. The context manager closes the handle; the previous
    # bare open(...).readlines() left it open.
    with open(path + '/data_in/' + epub_file, encoding="utf8",
              errors='ignore') as handle:
        raw_lines = handle.readlines()
    df = pd.DataFrame(raw_lines, columns=['line_str'])
    df.index.name = 'line_num'
    df['line_str'] = df['line_str'].str.strip()

    # Removing cruft: keep everything from the first matching line to the end.
    start_candidates = df.index[df['line_str'].str.match(first_line)]
    if len(start_candidates) == 0:
        raise ValueError(
            f"No line in {epub_file!r} matches the start pattern {first_line!r}"
        )
    # .copy() so the column assignments below act on an independent frame
    # instead of a slice of the raw one.
    df = df.loc[start_candidates[0]:].copy()

    # Add Book ID
    df['book_id'] = epub_file.split('.')[0]

    # Dealing with chapters: number the heading lines 1..k, carry each number
    # forward over the lines that follow, then drop the headings themselves.
    heading_pattern = first_line if book_val == 0 else chap_val
    chap_lines = df['line_str'].str.match(heading_pattern)
    df.loc[chap_lines, 'chap_num'] = list(range(1, int(chap_lines.sum()) + 1))
    df['chap_num'] = df['chap_num'].ffill()
    df = df.loc[~chap_lines].copy()
    df['chap_num'] = df['chap_num'].astype('int')  # float (from NaN) -> int

    # Creating dataframe based on CHAPTER: one big string per chapter
    dfc = df.groupby(OHCO[:2]).line_str.apply(lambda x: '\n'.join(x)).to_frame()
    dfc['line_str'] = dfc.line_str.str.strip()

    # Creating dataframe based on PARAGRAPH: split on runs of blank lines
    dfp = dfc['line_str'].str.split(r'\n\n+', expand=True).stack()\
        .to_frame().rename(columns={0: 'para_str'})
    dfp.index.names = OHCO[:3]
    dfp = dfp[~dfp['para_str'].str.match(r'^\s*$')]  # Remove empty paragraphs

    # Creating dataframes based on SENTENCE and TOKENS using NLTK
    # Paragraphs to Sentences
    dfs = dfp.para_str\
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .to_frame()\
        .rename(columns={0: 'sent_str'})

    # Sentences to Tokens: each sentence becomes a Series of (token, tag) pairs
    def _tag_sentence(sentence):
        return pd.Series(nltk.pos_tag(nltk.word_tokenize(sentence)))

    # Tokenised Dataframe
    dft = dfs.sent_str\
        .apply(_tag_sentence)\
        .stack()\
        .to_frame()\
        .rename(columns={0: 'pos_tuple'})

    # Grab info from tuple
    dft['pos'] = dft.pos_tuple.apply(lambda pair: pair[1])
    dft['token_str'] = dft.pos_tuple.apply(lambda pair: pair[0])
    return dft

In [6]:
## Importing Data
# Every dialogue except Apology, Republic and Laws is treated as a single
# chapter starting at its "PERSONS OF THE DIALOGUE" line, so they share one
# start pattern.
PERSONS_LINE = r'\s*PERSONS\s*\s*OF\s*\s*THE\s*DIALOGUE.*'

Apology_token = tokeniser('Apology.txt', r'APOLOGY.', 0, 0)
# Republic and Laws are split into chapters at their "BOOK <numeral>" headings.
# The Republic pattern previously had `d+` (a literal letter d) where `\d+`
# (digits) was intended.
# NOTE(review): book_id is taken from the file name, so this book is indexed
# as 'The Republic' while LIB lists it as 'Republic' -- reconcile before joining.
Republic_token = tokeniser('The Republic.txt', r'\s*BOOK I', 1,
                           r"^\s*(BOOK|letter)\s+(\d+|I|II|III|IV|V|VI|VII|VIII|IX|X|XX)")
Laws_token = tokeniser('Laws.txt', r'\s*BOOK I\.', 1,
                       r"^\s*(BOOK|letter)\s+(|I|II|III|IV|V|VI|VII|VIII|IIX|IX|X|XI|XII)\.")
Charmides_token = tokeniser('Charmides.txt', PERSONS_LINE, 0, 0)
Cratylus_token = tokeniser('Cratylus.txt', PERSONS_LINE, 0, 0)
Crito_token = tokeniser('Crito.txt', PERSONS_LINE, 0, 0)
Critias_token = tokeniser('Critias.txt', PERSONS_LINE, 0, 0)
Euthydemus_token = tokeniser('Euthydemus.txt', PERSONS_LINE, 0, 0)
Euthyphro_token = tokeniser('Euthyphro.txt', PERSONS_LINE, 0, 0)
Gorgias_token = tokeniser('Gorgias.txt', PERSONS_LINE, 0, 0)
Ion_token = tokeniser('Ion.txt', PERSONS_LINE, 0, 0)
Laches_token = tokeniser('Laches.txt', PERSONS_LINE, 0, 0)
Lysis_token = tokeniser('Lysis.txt', PERSONS_LINE, 0, 0)
Menexenus_token = tokeniser('Menexenus.txt', PERSONS_LINE, 0, 0)
# Fixed: this previously read 'Menexenus.txt', so Meno was never loaded and
# Menexenus entered the corpus twice under the same book_id.
Meno_token = tokeniser('Meno.txt', PERSONS_LINE, 0, 0)
Parmenides_token = tokeniser('Parmenides.txt', PERSONS_LINE, 0, 0)
Phaedo_token = tokeniser('Phaedo.txt', PERSONS_LINE, 0, 0)
Phaedrus_token = tokeniser('Phaedrus.txt', PERSONS_LINE, 0, 0)
Philebus_token = tokeniser('Philebus.txt', PERSONS_LINE, 0, 0)
Protagoras_token = tokeniser('Protagoras.txt', PERSONS_LINE, 0, 0)
Sophist_token = tokeniser('Sophist.txt', PERSONS_LINE, 0, 0)
Statesman_token = tokeniser('Statesman.txt', PERSONS_LINE, 0, 0)
Symposium_token = tokeniser('Symposium.txt', PERSONS_LINE, 0, 0)
Theaetetus_token = tokeniser('Theaetetus.txt', PERSONS_LINE, 0, 0)
Timaeus_token = tokeniser('Timaeus.txt', PERSONS_LINE, 0, 0)

### Plato Tokens
# Stack the per-dialogue token tables in the same order as LIB's title list,
# then name all five index levels.
TOKEN = pd.concat([Timaeus_token, Theaetetus_token, Republic_token, Symposium_token,
                   Statesman_token, Sophist_token, Protagoras_token, Philebus_token,
                   Phaedrus_token, Phaedo_token, Parmenides_token, Meno_token,
                   Menexenus_token, Lysis_token, Laws_token, Laches_token, Ion_token,
                   Gorgias_token, Euthyphro_token, Euthydemus_token, Crito_token,
                   Critias_token, Cratylus_token, Charmides_token, Apology_token])
TOKEN = TOKEN.rename_axis(['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num'])


### Create BOW and VOCAB tables

In [7]:
# Annotated Vocab Table
# Normalise tokens into terms: lower-case, then strip every non-word character
# and underscore. regex=True is explicit because recent pandas versions treat
# the pattern as a literal string by default, which would silently leave all
# punctuation in place. The raw string avoids the invalid-escape warning.
TOKEN['term_str'] = TOKEN['token_str'].str.lower().str.replace(r'[\W_]', '', regex=True)

# Term counts, one row per term in alphabetical order. The Series and its index
# are named explicitly so the result does not depend on how value_counts()
# labels them, which differs between pandas versions.
VOCAB = (
    TOKEN.term_str.value_counts()
    .rename('n')
    .rename_axis('term_str')
    .sort_index()
    .reset_index()
)
VOCAB.index.name = 'term_id'

# Flag terms that start with a digit (str.match anchors at the start).
VOCAB['num'] = VOCAB.term_str.str.match(r"\d+").astype('int')

# Flag NLTK English stop words (1 = stop word, 0 = not).
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1
VOCAB['stop'] = VOCAB.term_str.map(sw.dummy).fillna(0).astype('int')

# Porter stem of each term.
stemmer = PorterStemmer()
VOCAB['p_stem'] = VOCAB.term_str.apply(stemmer.stem)

# Most frequent POS group (first two characters of the tag) for each term.
# Mapping by term aligns on the term itself instead of relying on both tables
# happening to be in the same row order.
TOKEN['pos_group'] = TOKEN.pos.str[:2]
pos_max = TOKEN.groupby(['term_str', 'pos_group']).pos.count().unstack().idxmax(axis=1)
VOCAB['pos_max'] = VOCAB.term_str.map(pos_max)
VOCAB['term_code'] = VOCAB.term_str + '/' + VOCAB.pos_max

N_vocab = VOCAB.shape[0]
U_vocab = 1 / N_vocab
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()      # Probability using MLE
VOCAB['s'] = 1 / VOCAB.p                  # Surprise
VOCAB['i'] = np.log2(VOCAB.s)             # Information
VOCAB['h'] = VOCAB.p * VOCAB.i            # Entropy
VOCAB['wlen'] = VOCAB.term_str.str.len()  # Word length feature

# Sort by frequency and record the rank (1 = most frequent term).
VOCAB = VOCAB.sort_values('n', ascending=False)
VOCAB['term_rank'] = list(range(1, VOCAB.shape[0] + 1))
## BOW
# Bag of words: one row per (document, term) with the raw count tf_n.
DOC = OHCO[:1]
BOW = TOKEN.groupby(DOC + ['term_str']).term_str.count().to_frame('tf_n')
D = BOW.groupby(DOC).tf_n  # per-document view of the counts

# Several term-frequency variants. Per-document ones use transform(), which
# always returns a result indexed like BOW; apply() can prepend the group key
# to the index depending on the pandas version.
# jp = P(w,d): divide by the corpus-wide total. The previous per-document sum
# made this column identical to tf_cp.
BOW['tf_jp'] = BOW.tf_n / BOW.tf_n.sum()
BOW['tf_cp'] = D.transform(lambda x: x / x.sum())  # cp = P(w|d)
BOW['tf_l2'] = D.transform(lambda x: x / np.sqrt((x**2).sum()))
BOW['tf_logn'] = np.log2(1 + BOW.tf_n)
BOW['tf_sub'] = 1 + np.log2(BOW.tf_n)  # Sublinear scaling; from Manning, et al.
BOW['tf_max'] = D.transform(lambda x: .4 + .6 * (x / x.max()))  # See Manning, et al. for choice of α
BOW['tf_bool'] = D.transform(lambda x: x.astype('bool') / x.astype('bool').sum())

# Document frequency: number of documents each term occurs in. It is mapped by
# term because VOCAB may be in a different row order than the alphabetical
# groupby result; a positional list() would attach counts to the wrong terms.
VOCAB['df'] = VOCAB['term_str'].map(BOW.groupby('term_str').tf_n.count())
N_docs = len(D.groups)
VOCAB['idf'] = np.log2(N_docs / VOCAB['df'])

# Suffixes of the tf_* columns created above (n, jp, cp, ...).
tf_types = [col.split('_')[1] for col in BOW.columns.to_list() if 'tf_' in col]
VOCAB = VOCAB.set_index('term_str')

# TF-IDF for every tf variant, broadcasting idf over BOW's term_str level.
for tf_type in tf_types:
    BOW[f'tfidf_{tf_type}'] = BOW[f'tf_{tf_type}'].mul(VOCAB['idf'], level='term_str')

# Per-term TF-IDF totals: z-scored, shifted so the minimum is 0, then divided
# by the number of documents.
for tf_type in tf_types:
    col = f"tfidf_{tf_type}"
    VOCAB[col + "_sum"] = BOW.groupby('term_str')[col].sum()
    VOCAB[col + "_sum"] = (VOCAB[col + "_sum"] - VOCAB[col + "_sum"].mean()) / VOCAB[col + "_sum"].std()
    VOCAB[col + "_sum"] = VOCAB[col + "_sum"] - VOCAB[col + "_sum"].min()
    VOCAB[col + "_sum"] = VOCAB[col + "_sum"] / N_docs
tfidf_sum_cols = [f"tfidf_{tf_type}_sum" for tf_type in tf_types]
## Making TFIDF
# Weighting column of BOW used to build the document-term matrix.
tfidf_sum = 'tfidf_bool'

# Pivot BOW into one row per document and one column per term; absent
# (document, term) pairs become 0.
doc_term_keys = OHCO[:1] + ['term_str']
TFIDF = (
    BOW.groupby(doc_term_keys)[[tfidf_sum]]
    .mean()
    .unstack(fill_value=0)
)
# Drop the outer column level (the weight name) so columns are plain terms.
TFIDF.columns = TFIDF.columns.droplevel(0)

# Rank terms by df * idf and keep the 4000 highest-scoring ones.
VOCAB['dfidf'] = VOCAB['idf'] * VOCAB['df']
VOCAB_sig = VOCAB.sort_values(by='dfidf', ascending=False).head(4000)
TFIDF = TFIDF[VOCAB_sig.index]

# Scale every document row to unit Euclidean length.
TFIDF = TFIDF.apply(lambda row: row / np.sqrt(np.square(row).sum()), axis=1)

  TOKEN['term_str']=TOKEN['token_str'].str.lower().str.replace('[\W_]', '')


## Export Data

In [12]:
# Write each table to the data_out folder under the project path as CSV,
# in the same order as before: LIB, BOW, VOCAB, TOKEN, TFIDF.
out_dir = path + '/data_out/'
for table, csv_name in (
    (LIB, 'lib_plato.csv'),
    (BOW, 'bow_plato.csv'),
    (VOCAB, 'vocab_plato.csv'),
    (TOKEN, 'token_plato.csv'),
    (TFIDF, 'tf_idf_plato.csv'),
):
    table.to_csv(out_dir + csv_name)