In [1]:
import os, re, pickle
import pandas as pd
import numpy as np
from lxml import etree
from tqdm.notebook import tqdm


from my_lda import LDA
import corpus_man as cm

In [2]:
def confusionMatrix(df):
    """Cross-tabulate annotated periods against predicted labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``period`` column (values ``'arcmed'``, ``'mod'`` or
        ``'cont'``) and a ``predicted_period`` column holding the label
        assigned to each row. The frame may use any index.

    Returns
    -------
    pandas.DataFrame
        Counts with one row per period (ordered ``arcmed``, ``mod``, ``cont``)
        and one column per predicted label. Columns start as ``0 .. k-1``,
        where ``k`` is the number of distinct predicted labels; a label
        outside that range gets its own extra column.

    Raises
    ------
    KeyError
        If a ``period`` value is not one of the three known periods.
    """
    n_labels = len(df['predicted_period'].unique())
    confusion_matrix = pd.DataFrame(0,
                                    index=['arcmed', 'mod', 'cont'],
                                    columns=range(n_labels))

    # Walk both columns in lockstep rather than looking rows up by integer
    # label: callers pass frames indexed by file name, not by 0..n-1.
    for initial, predicted in zip(df['period'], df['predicted_period']):
        if predicted not in confusion_matrix.columns:
            # e.g. labels {0, 4}: two distinct values, but 4 is outside range(2).
            confusion_matrix[predicted] = 0
        # One .loc update; a chained ``matrix[col][row] += 1`` may write to a
        # temporary copy and leave the matrix untouched.
        confusion_matrix.loc[initial, predicted] += 1

    return confusion_matrix

def removeMime(to_slice):
    """Return ``to_slice`` without its last four characters.

    Used to cut a four-character suffix such as ``.xml`` from a file name.
    Inputs of four characters or fewer yield an empty result.
    """
    suffix_length = 4
    trimmed = to_slice[:-suffix_length]
    return trimmed

# Downloading the Tycho corpus XML files

In [None]:
!wget --no-check-certificate http://www.tycho.iel.unicamp.br/corpus/texts/xml.zip -P ./corpora
!mkdir ./corpora/xml
!unzip ./corpora/xml.zip -d ./corpora/xml

# Loading CSV Classification

In [3]:
# Period annotations: a headerless two-column CSV (period label, file name).
df = pd.read_csv('periodos_tycho.csv', header=None)
df.columns = ['period', 'file_name']

# Empty placeholder columns; later cells fill them in.
df = df.assign(genre=None, predicted_period=None)

# Mask of the rows labelled 'cont2', taken before the labels are merged below.
# (Not referenced by any other visible cell.)
cond1 = df['period'] == 'cont2'

# Fold both 'cont1' and 'cont2' into the single label 'cont'.
df['period'] = df['period'].replace(['cont1', 'cont2'], 'cont')

df['period'].value_counts()

cont      48
mod       28
arcmed    13
Name: period, dtype: int64

In [4]:
# Cut the last four characters from every file name (removeMime slices [:-4]).
# NOTE: not idempotent; re-running this cell trims four more characters.
df['file_name'] = df['file_name'].map(removeMime)

In [5]:
# Preview the first five rows of the annotation table.
df.head(n=5)

Unnamed: 0,period,file_name,genre,predicted_period
0,arcmed,l_002,,
1,arcmed,g_009,,
2,arcmed,p_002,,
3,arcmed,b_002,,
4,arcmed,m_007,,


In [6]:
# Order the rows by file name and use that name as the index.
new_df = (
    df.sort_values(by='file_name')
      .reset_index(drop=True)
      .set_index('file_name')
)

In [None]:
# Inspect the last ten rows of the sorted, file-name-indexed table.
new_df.tail(n=10)

In [7]:
# Parse every file found under ./corpora/xml and keep the results in `forest`,
# a dict whose keys are file names and whose values are the parsed XML roots.

forest = {}
for root, dirs, files in os.walk("./corpora/xml", topdown=False):
    for file_name in sorted(files):
        with open(os.path.join(root, file_name), 'r') as xml:
            xml_plain = xml.read()
        # lxml rejects str input that carries an encoding declaration, so the
        # leading <?xml ...?> is removed. Only the declaration itself is cut
        # (not the whole first line), so a file without a declaration, or with
        # markup following it on the same line, keeps all of its content.
        xml_plain = re.sub(r'^\ufeff?\s*<\?xml\s[^>]*\?>', '', xml_plain, count=1)
        try:
            forest[file_name] = etree.fromstring(xml_plain)
        except (etree.XMLSyntaxError, ValueError) as error:
            # Name the offending file and the reason instead of dumping the
            # whole document into the cell output.
            print(f'{file_name}: {error}')

In [20]:
# id_list holds the ids (file names without their four-character suffix) of
# the documents that have a period assigned in periodos_tycho.csv, in the row
# order of new_df.
id_list = list(new_df.index)

# Parsed files without a period classification are reported and left out.
classified_ids = set(id_list)
for file_name in forest:
    if file_name[:-4] not in classified_ids:
        print('not in id_lst= ', file_name)

# Later cells attach predictions to new_df by position, so the dump must hold
# exactly one line per new_df row, in the same order. Stop before writing
# anything if a classified document has no parsed XML.
missing_ids = [doc_id for doc_id in id_list if doc_id + '.xml' not in forest]
if missing_ids:
    raise LookupError(f'no parsed XML for classified documents: {missing_ids}')

# tycho_to_lda is a text file in which line i holds the pre-processed text of
# the document in row i of new_df.
with open('./corpora/tycho_to_lda', 'w') as dump:
    for doc_id in id_list:
        xml = forest[doc_id + '.xml']

        # get genre
        genre = cm.get_meta('Genre', xml).xpath('string()')
        new_df.loc[doc_id, 'genre'] = genre

        # get text
        text = ' '.join(xml.xpath("//body//text()"))
        # process text and write to file
        text = cm.pre_process(text)
        dump.write(text)
        dump.write('\n')

a_001.xml----a_001.xml
a_002.xml----a_002.xml
a_003.xml----a_003.xml
a_004.xml----a_004.xml
a_005.xml----a_005.xml
a_006.xml----a_006.xml
a_007.xml----a_007.xml
a_008.xml----a_008.xml
a_009.xml----a_009.xml
b_001.xml----b_001.xml
b_002.xml----b_002.xml
b_003.xml----b_003.xml
b_005.xml----b_005.xml
b_006.xml----b_006.xml
b_007.xml----b_007.xml
b_008.xml----b_008.xml
b_009.xml----b_009.xml
b_010.xml----b_010.xml
b_011.xml----b_011.xml
c_001.xml----c_001.xml
c_002.xml----c_002.xml
c_003.xml----c_003.xml
c_004.xml----c_004.xml
c_005.xml----c_005.xml
c_006.xml----c_006.xml
c_007.xml----c_007.xml
c_008.xml----c_008.xml
c_009.xml----morgadinha-valflor-final.xml
c_010.xml----c_010.xml
d_001.xml----d_001.xml
e_001.xml----e_001.xml
f_001.xml----f_001.xml
f_002.xml----f_002.xml
f_003.xml----f_003.xml
g_001.xml----g_001.xml
g_002.xml----g_002.xml
g_003.xml----g_003.xml
g_004.xml----g_004.xml
g_005.xml----g_005.xml
g_006.xml----g_006.xml
g_008.xml----g_008.xml
g_009.xml----g_009.xml
g_010.xml----g_

In [21]:
# Sanity check: count the lines written to the dump file.
with open('./corpora/tycho_to_lda', 'r') as dump:
    line_count = sum(1 for _line in dump)
print(line_count)

89


# 3 Topics

In [None]:
# Restore the pickled object stored in tycho_to_lda_3.out.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load dumps produced by a trusted run.
with open('./corpora/tycho_to_lda_3.out', 'rb') as file:
    lda_3 = pickle.load(file)

lda_3.classifyDocuments()

# Copy new_df and store the first element of getDocTopic(i) as the prediction
# for row i. Assumes document i of the model matches row i of new_df (TODO confirm).
df_3topics = new_df.assign(
    predicted_period=[lda_3.getDocTopic(doc_idx)[0] for doc_idx in range(len(new_df))]
)


In [None]:
# Preview the first five rows together with their predicted labels.
df_3topics.head(n=5)

In [None]:
# Annotated period (rows) versus predicted label (columns), as counts.
confusionMatrix(df=df_3topics)

# 4 Topics

In [None]:
# Restore the pickled object stored in tycho_to_lda_4.out.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load dumps produced by a trusted run.
with open('./corpora/tycho_to_lda_4.out', 'rb') as file:
    lda_4 = pickle.load(file)

lda_4.classifyDocuments()

# Copy new_df and store the first element of getDocTopic(i) as the prediction
# for row i. Assumes document i of the model matches row i of new_df (TODO confirm).
df_4topics = new_df.assign(
    predicted_period=[lda_4.getDocTopic(doc_idx)[0] for doc_idx in range(len(new_df))]
)


In [None]:
# Preview the first five rows together with their predicted labels.
df_4topics.head(n=5)

In [None]:
# Annotated period (rows) versus predicted label (columns), as counts.
confusionMatrix(df=df_4topics)

# 5 Topics

In [None]:
# Restore the pickled object stored in tycho_to_lda_5.out.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load dumps produced by a trusted run.
with open('./corpora/tycho_to_lda_5.out', 'rb') as file:
    lda_5 = pickle.load(file)

lda_5.classifyDocuments()

# Copy new_df and store the first element of getDocTopic(i) as the prediction
# for row i. Assumes document i of the model matches row i of new_df (TODO confirm).
df_5topics = new_df.assign(
    predicted_period=[lda_5.getDocTopic(doc_idx)[0] for doc_idx in range(len(new_df))]
)


In [None]:
# Annotated period (rows) versus predicted label (columns), as counts.
confusionMatrix(df=df_5topics)