In [1]:
import os, pickle

import pandas as pd
import numpy as np

from lxml import etree
from tqdm.notebook import tqdm

# Modules developed by the author
from my_lda import LDA
import corpus_man as cm

pd.options.display.max_rows = 100

# Downloading tycho XML

In [2]:
!mkdir ./corpora
!wget --no-check-certificate http://www.tycho.iel.unicamp.br/corpus/texts/xml.zip -P ./corpora
!mkdir ./corpora/xml
!unzip ./corpora/xml.zip -d ./corpora/xml

--2022-03-22 17:43:26--  http://www.tycho.iel.unicamp.br/corpus/texts/xml.zip
Resolving www.tycho.iel.unicamp.br (www.tycho.iel.unicamp.br)... 143.106.176.17
Connecting to www.tycho.iel.unicamp.br (www.tycho.iel.unicamp.br)|143.106.176.17|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24196121 (23M) [application/zip]
Saving to: ‘./corpora/xml.zip’


2022-03-22 17:43:32 (8.47 MB/s) - ‘./corpora/xml.zip’ saved [24196121/24196121]

Archive:  ./corpora/xml.zip
  inflating: ./corpora/xml/a_001.xml  
  inflating: ./corpora/xml/a_002.xml  
  inflating: ./corpora/xml/a_003.xml  
  inflating: ./corpora/xml/a_004.xml  
  inflating: ./corpora/xml/a_005.xml  
  inflating: ./corpora/xml/a_006.xml  
  inflating: ./corpora/xml/a_007.xml  
  inflating: ./corpora/xml/a_008.xml  
  inflating: ./corpora/xml/a_009.xml  
  inflating: ./corpora/xml/b_001.xml  
  inflating: ./corpora/xml/b_002.xml  
  inflating: ./corpora/xml/b_003.xml  
  inflating: ./corpora/xml/b_005.xml  
  infl

# Loading CSV Classification

The tags present in the 'period' column indicate to which period the texts belong according to Bechara's classification:  
* cont: **contemporary**  
* mod: **modern**  
* arcmed: **archaic**  

In [3]:
# Load the period classification; the CSV has no header row, so the two
# columns are named explicitly.
df = pd.read_csv('periodos_tycho.csv', header=None)
df.columns = ['period', 'file_name']

# Placeholder columns that are filled in by later cells
for empty_column in ('genre', 'predicted_period'):
    df[empty_column] = None

# Merge the 'cont1' and 'cont2' tags into the single 'cont' label
df['period'] = df['period'].replace({'cont1': 'cont', 'cont2': 'cont'})

df['period'].value_counts()

cont      48
mod       28
arcmed    13
Name: period, dtype: int64

In [4]:
# Normalise every file name with the author's corpus_man helper.
# NOTE(review): judging by the ids shown afterwards (e.g. 'l_002'), the helper
# presumably strips the file extension — confirm against corpus_man.removeMime.
df['file_name'] = df['file_name'].apply(cm.removeMime)

In [5]:
df.head()

Unnamed: 0,period,file_name,genre,predicted_period
0,arcmed,l_002,,
1,arcmed,g_009,,
2,arcmed,p_002,,
3,arcmed,b_002,,
4,arcmed,m_007,,


In [6]:
# Sort the rows by file name and promote the file name to the row index
new_df = (
    df.sort_values(by='file_name')
      .reset_index(drop=True)
      .set_index('file_name')
)

In [7]:
new_df.tail(10)

Unnamed: 0_level_0,period,genre,predicted_period
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
va_002,cont,,
va_003,cont,,
va_004,cont,,
va_006,cont,,
va_007,cont,,
va_008,cont,,
va_009,cont,,
va_010,cont,,
va_011,cont,,
va_012,cont,,


In [8]:
# Every xml file found under ./corpora/xml is parsed and stored in the
# `forest` dict: the key is the file name (extension included) and the value
# is the root element returned by lxml for that document.
# Files that fail to parse are reported by name and left out of `forest`.
# NOTE(review): files are opened with the platform default text encoding —
# confirm the corpus encoding and pass it explicitly if needed.

forest={}
for root, dirs, files in os.walk("./corpora/xml", topdown=False):
    for file_name in sorted(files):
        with open(os.path.join(root,file_name),'r') as xml:
            # The first line of each file is dropped before parsing
            # (presumably the XML declaration, which lxml rejects when it is
            # handed an already-decoded str — TODO confirm).
            xml_plain = '\n'.join(xml.read().split('\n')[1:])
        try:
            forest[file_name] = etree.fromstring(xml_plain)
        except (etree.XMLSyntaxError, ValueError) as error:
            # Identify the offending file instead of dumping the whole
            # document into the cell output.
            print('could not parse', file_name, '->', error)

In [9]:
# id_list is the list of file ids which have an assigned period in the
# periodos_tycho.csv file, in the row order of new_df
id_list = list(new_df.index)

# Ids of the documents in the order their lines are written to the dump file
written_ids = []

# the tycho_to_lda file is a file where each line corresponds to a
# document from the corpus.
with open('./corpora/tycho_to_lda', 'w') as dump:
    for file_name, xml in forest.items():
        # File id = file name without its extension
        file_id = os.path.splitext(file_name)[0]

        # Skipping files which don't have a period classification
        # provided by periodos_tycho.csv
        if file_id not in id_list:
            print('not in id_lst= ',file_name)
            continue

        # get genre
        genre = cm.get_meta('Genre',xml).xpath('string()')
        new_df.loc[file_id,'genre'] = genre

        # get text
        text = ' '.join(xml.xpath("//body//text()"))
        # process text and write to file
        text = cm.pre_process(text)
        dump.write(text)
        dump.write('\n')
        written_ids.append(file_id)

# Later cells map line i of tycho_to_lda onto row i of new_df, so the two
# orderings must be identical.
assert written_ids == id_list, (
    'documents written to tycho_to_lda do not line up with the rows of new_df'
)

In [10]:
# Sanity check: tycho_to_lda must contain exactly one line per row of new_df.
# An assertion is used so that a mismatch stops the notebook instead of
# passing silently.
with open('./corpora/tycho_to_lda', 'r') as dump:
    n_lines = sum(1 for _ in dump)

assert n_lines == len(new_df), (
    'tycho_to_lda has {} lines but new_df has {} rows'.format(n_lines, len(new_df))
)
print('sanity checked!')

sanity checked!


In [11]:
new_df

Unnamed: 0_level_0,period,genre,predicted_period
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a_001,cont,Dissertation,
a_002,mod,Dissertation,
a_003,cont,Narrative,
a_004,cont,Letters,
a_005,mod,Grammar,
a_006,cont,Narrative,
a_007,mod,Narrative,
a_008,cont,Theatre,
a_009,cont,Theatre,
b_001,mod,Narrative,


In the DataFrame above we can see that the following fields are incorrect:
* "p_002": the genre is incorrect, as the field contains a text title instead.  
* "s_002": the "genre" field of the XML file is empty; inspecting the file showed that the genre is "Narrative; Chronicle". 
* The genre "ata" was not translated to English.

In [12]:
# Manual corrections to the genre column
for doc_id in ('p_002', 's_002'):
    new_df.loc[doc_id, 'genre'] = 'Narrative; Chronicle'

# Translate the one genre label that was left untranslated
new_df['genre'] = new_df['genre'].replace({'ata': 'Minute'})

In [13]:
new_df['genre'].value_counts()

Theatre                              26
Letters                              18
Narrative                            17
Dissertation                          7
News                                  6
Narrative; Chronicle                  5
Grammar                               3
Newspaper                             3
Minute                                2
Songs                                 1
Transcript of original manuscript     1
Name: genre, dtype: int64

# Experiments

Three experiments were run in order to analyse how the LDA would group the texts and to see whether grouping into 3, 4 or 5 topics would yield more expressive results.

# 3 Topics

In [14]:
# Make the comparison true if you want to run the experiment, or copy the command to your favorite terminal
# emulator and run it on this same folder
# lda_cli is a CLI LDA in this folder and runs LDA with Gibbs Sampling over our documents and saves the LDA
# experiment as a class to be loaded afterwards.

if 1 == 0:
    !python lda_cli.py -nTopics 3 -path ./corpora/tycho_to_lda -outPath ./corpora/tycho_to_lda_3
else:
    !unzip lda_out.zip -d ./corpora/

Archive:  lda_out.zip
  inflating: ./corpora/tycho_to_lda_3.out  
  inflating: ./corpora/tycho_to_lda_4.out  
  inflating: ./corpora/tycho_to_lda_5.out  


In [15]:
# Load the pickled 3-topic LDA object into the notebook.
# NOTE: unpickling executes code stored in the file, so only load .out files
# that come from a trusted source.
with open('./corpora/tycho_to_lda_3.out', 'rb') as pickled_lda:
    lda_3 = pickle.load(pickled_lda)

# Builds the dictionary holding the most prevalent topic of each document
lda_3.classifyDocuments()

# Work on a copy so that the original DataFrame stays untouched
df_3topics = new_df.copy()

# Predicted topic grouping of each file/document, in row order
n_documents = len(new_df)
df_3topics['predicted_period'] = [
    lda_3.getDocTopic(doc_idx)[0] for doc_idx in range(n_documents)
]


In [16]:
df_3topics

Unnamed: 0_level_0,period,genre,predicted_period
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a_001,cont,Dissertation,2
a_002,mod,Dissertation,2
a_003,cont,Narrative,2
a_004,cont,Letters,2
a_005,mod,Grammar,0
a_006,cont,Narrative,2
a_007,mod,Narrative,0
a_008,cont,Theatre,2
a_009,cont,Theatre,1
b_001,mod,Narrative,2


In [17]:
# Cross-tabulate the period labels against the topic assigned by the 3-topic
# LDA (the output has one row per period and one column per topic).
cm.confusionMatrix(df_3topics)

Unnamed: 0,0,1,2
arcmed,11,0,2
mod,12,1,15
cont,3,16,29


## Genres per topic grouping
Some groups were checked to see if the LDA tried to group certain genres of texts together

In [18]:
# Genre counts of the documents whose most prevalent topic is 0 (3-topic model)
genre_profile = df_3topics.loc[df_3topics['predicted_period'] == 0]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Theatre,9
Narrative,7
Narrative; Chronicle,4
Grammar,2
News,2
Transcript of original manuscript,1
Letters,1


In [19]:
# Genre counts of the documents whose most prevalent topic is 1 (3-topic model)
genre_profile = df_3topics.loc[df_3topics['predicted_period'] == 1]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Letters,5
News,4
Theatre,3
Newspaper,3
Minute,2


In [20]:
# Genre counts of the documents whose most prevalent topic is 2 (3-topic model)
genre_profile = df_3topics.loc[df_3topics['predicted_period'] == 2]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Theatre,14
Letters,12
Narrative,10
Dissertation,7
Songs,1
Narrative; Chronicle,1
Grammar,1


## Most frequent words per topic

In [21]:
# Most frequent words of the 3-topic model, transposed so that each topic is a column
pd.DataFrame(lda_3.mostFreqWordsTopic(30)).T

Unnamed: 0,0,1,2
0,que,que,que
1,por,para,não
2,com,não,com
3,não,com,para
4,rey,por,por
5,lhe,dos,mais
6,como,senhor,como
7,naõ,uma,uma
8,hum,sua,lhe
9,para,lhe,dos


# 4 Topics

In [22]:
# Disabled by default: make the comparison true to re-run the 4-topic LDA
# experiment from this cell instead of using an existing
# ./corpora/tycho_to_lda_4.out file.
if 1 == 0:
    !python lda_cli.py -nTopics 4 -path ./corpora/tycho_to_lda -outPath ./corpora/tycho_to_lda_4
    

In [23]:
# Load the pickled 4-topic LDA object.
# NOTE: unpickling executes code stored in the file, so only load .out files
# that come from a trusted source.
with open('./corpora/tycho_to_lda_4.out', 'rb') as pickled_lda:
    lda_4 = pickle.load(pickled_lda)

# Builds the dictionary holding the most prevalent topic of each document
lda_4.classifyDocuments()

# Copy of the labelled DataFrame with the 4-topic prediction filled in, in row order
df_4topics = new_df.copy()
n_documents = len(new_df)
df_4topics['predicted_period'] = [
    lda_4.getDocTopic(doc_idx)[0] for doc_idx in range(n_documents)
]


In [24]:
df_4topics.head()

Unnamed: 0_level_0,period,genre,predicted_period
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a_001,cont,Dissertation,1
a_002,mod,Dissertation,1
a_003,cont,Narrative,1
a_004,cont,Letters,1
a_005,mod,Grammar,2


In [25]:
# Cross-tabulate the period labels against the topic assigned by the 4-topic
# LDA (the output has one row per period and one column per topic).
cm.confusionMatrix(df_4topics)

Unnamed: 0,0,1,2,3
arcmed,7,1,5,0
mod,9,14,5,0
cont,2,26,3,17


## Genres per topic grouping

In [26]:
# Genre counts of the documents whose most prevalent topic is 0 (4-topic model)
genre_profile = df_4topics.loc[df_4topics['predicted_period'] == 0]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Narrative,5
Narrative; Chronicle,5
News,5
Letters,2
Transcript of original manuscript,1


In [27]:
# Genre counts of the documents whose most prevalent topic is 1 (4-topic model)
genre_profile = df_4topics.loc[df_4topics['predicted_period'] == 1]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Theatre,12
Narrative,10
Letters,10
Dissertation,7
Songs,1
Grammar,1


In [28]:
# Genre counts of the documents whose most prevalent topic is 2 (4-topic model)
genre_profile = df_4topics.loc[df_4topics['predicted_period'] == 2]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Theatre,9
Grammar,2
Narrative,2


In [29]:
# Genre counts of the documents whose most prevalent topic is 3 (4-topic model)
genre_profile = df_4topics.loc[df_4topics['predicted_period'] == 3]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Letters,6
Theatre,5
Newspaper,3
Minute,2
News,1


## Most frequent words per topic

In [30]:
# Most frequent words of the 4-topic model, transposed so that each topic is a column
pd.DataFrame(lda_4.mostFreqWordsTopic(30)).T

Unnamed: 0,0,1,2,3
0,que,que,que,que
1,com,não,não,não
2,por,com,por,para
3,rey,para,included,com
4,lhe,por,naõ,por
5,para,mais,punctuation,senhor
6,como,como,com,dos
7,rei,uma,como,uma
8,dos,lhe,nam,lhe
9,hum,mas,lhe,sua


# 5 Topics

In [31]:
# Disabled by default: make the comparison true to re-run the 5-topic LDA
# experiment from this cell instead of using an existing
# ./corpora/tycho_to_lda_5.out file.
if 1 == 0:
    !python lda_cli.py -nTopics 5 -path ./corpora/tycho_to_lda -outPath ./corpora/tycho_to_lda_5

In [32]:
# Load the pickled 5-topic LDA object.
# NOTE: unpickling executes code stored in the file, so only load .out files
# that come from a trusted source.
with open('./corpora/tycho_to_lda_5.out', 'rb') as pickled_lda:
    lda_5 = pickle.load(pickled_lda)

# Builds the dictionary holding the most prevalent topic of each document
lda_5.classifyDocuments()

# Copy of the labelled DataFrame with the 5-topic prediction filled in, in row order
df_5topics = new_df.copy()
n_documents = len(new_df)
df_5topics['predicted_period'] = [
    lda_5.getDocTopic(doc_idx)[0] for doc_idx in range(n_documents)
]


In [33]:
# Cross-tabulate the period labels against the topic assigned by the 5-topic
# LDA (the output has one row per period and one column per topic).
cm.confusionMatrix(df_5topics)

Unnamed: 0,0,1,2,3,4
arcmed,6,5,1,1,0
mod,8,4,15,1,0
cont,2,3,9,7,27


## Genres per topic grouping

In [34]:
# Genre counts of the documents whose most prevalent topic is 0 (5-topic model)
genre_profile = df_5topics.loc[df_5topics['predicted_period'] == 0]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Narrative,5
News,5
Narrative; Chronicle,4
Transcript of original manuscript,1
Letters,1


In [35]:
# Genre counts of the documents whose most prevalent topic is 1 (5-topic model)
genre_profile = df_5topics.loc[df_5topics['predicted_period'] == 1]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Theatre,9
Narrative,2
Narrative; Chronicle,1


In [36]:
# Genre counts of the documents whose most prevalent topic is 2 (5-topic model)
genre_profile = df_5topics.loc[df_5topics['predicted_period'] == 2]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Letters,10
Dissertation,7
Narrative,6
Songs,1
Grammar,1


In [37]:
# Genre counts of the documents whose most prevalent topic is 3 (5-topic model)
genre_profile = df_5topics.loc[df_5topics['predicted_period'] == 3]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Letters,5
Grammar,2
Minute,2


In [38]:
# Genre counts of the documents whose most prevalent topic is 4 (5-topic model)
genre_profile = df_5topics.loc[df_5topics['predicted_period'] == 4]
genre_profile['genre'].value_counts().to_frame()

Unnamed: 0,genre
Theatre,17
Narrative,4
Newspaper,3
Letters,2
News,1


## Most frequent words per topic

In [39]:
# Most frequent words of the 5-topic model, transposed so that each topic is a column
pd.DataFrame(lda_5.mostFreqWordsTopic(30)).T

Unnamed: 0,0,1,2,3,4
0,que,que,que,que,que
1,com,não,não,com,não
2,por,included,com,por,para
3,rey,por,por,não,uma
4,lhe,naõ,para,para,com
5,para,punctuation,mais,senhor,por
6,rei,com,como,vossa,mas
7,hum,nam,dos,lhe,senhor
8,dom,vos,lhe,dos,como
9,dos,como,vossa,amigo,meu
