# Laboratorio 6.1

Integrantes

- Fabrizzio Vilchez


- Jeffrey Monja

## 1- Clase BoolModel

Esta clase contiene todos los métodos necesarios para preprocesar y tokenizar los archivos, construir la matriz de incidencia y evaluar consultas booleanas.

In [31]:
import nltk
import numpy as np
import pandas as pd
# Download the 'punkt' tokenizer data up front; presumably required by the
# nltk.word_tokenize calls made inside BoolModel (no-op when already present).
nltk.download('punkt')

class BoolModel:
    """Boolean retrieval model over a small collection of text files.

    Typical usage is ``preprocesar()`` followed by
    ``create_incidence_matrix()``; afterwards ``M`` returns the incidence
    row of a term, ``AND``/``OR``/``NOT`` combine such rows, and ``query``
    maps the resulting boolean vector back to file names.

    Attributes:
        files: List of document paths. ``preprocesar`` replaces each entry
            in place with the path of its ``*_clean.txt`` copy, so the list
            object passed by the caller is updated as well.
        words: Set with every token found in the documents (the vocabulary).
        matrix: ``numpy`` integer array of shape (terms, documents); a ``1``
            means the term occurs in the document. ``None`` until
            ``create_incidence_matrix`` runs. Rows follow the sorted
            vocabulary.
        dataframe: ``pandas`` view of ``matrix`` labelled with terms and
            file names. ``None`` until ``create_incidence_matrix`` runs.
        stoplist: List of tokens discarded during preprocessing.
    """

    def __init__(self, files):
        """Store the document list and load the stop-word list.

        Args:
            files: Mutable list of paths to the documents of the collection.

        Raises:
            OSError: If ``docs/stoplist.txt`` cannot be opened.
        """
        self.files = files
        self.words = set()
        self.matrix = None
        self.dataframe = None
        # term -> row number in ``self.matrix``; filled together with it.
        self._word_index = {}

        with open('docs/stoplist.txt', 'r', encoding='utf-8') as f:
            stoplist = [line.strip() for line in f]
        # Punctuation tokens emitted by the tokenizer that carry no meaning.
        stoplist += ['?', '-', ':', ',', '.', '!', '¡', '¿', ';',
                     '(', ')', '«', '»', '—', '“', '”', '…', '°', 'º', '``']
        self.stoplist = stoplist

    def __convert_to_lowercase(self):
        """Rewrite every file in ``self.files`` in lowercase.

        NOTE: this overwrites the source files on disk.
        """
        for file in self.files:
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
            with open(file, 'w', encoding='utf-8') as f:
                f.write(text.lower())

    def __remove_stopwords(self):
        """Write a stop-word-free ``*_clean.txt`` copy of every file.

        Each entry of ``self.files`` is replaced by the path of its cleaned
        copy, so later steps read the cleaned text.
        """
        import os

        # Set membership is O(1); the list would be scanned once per token.
        stopwords = set(self.stoplist)

        for position, file in enumerate(self.files):
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
            kept = [word for word in nltk.word_tokenize(text)
                    if word not in stopwords]

            # splitext only strips the final extension, so dots elsewhere in
            # the path (e.g. in a directory name) are preserved.
            new_name = os.path.splitext(file)[0] + '_clean.txt'
            with open(new_name, 'w', encoding='utf-8') as f:
                f.write(' '.join(kept))

            # Update the instance's own list (not a module-level global).
            self.files[position] = new_name

    def __create_set_words(self):
        """Add every token of every file to the vocabulary ``self.words``."""
        for file in self.files:
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
            self.words.update(nltk.word_tokenize(text))

    def preprocesar(self):
        """Lowercase the documents and strip stop words from them.

        Side effects: the original files are rewritten in lowercase, a
        ``*_clean.txt`` file is created next to each one, and ``self.files``
        ends up pointing at the cleaned copies.
        """
        self.__convert_to_lowercase()
        self.__remove_stopwords()

    def create_incidence_matrix(self):
        """Build the term-document incidence matrix and save it as CSV.

        Populates ``self.words``, ``self.matrix`` and ``self.dataframe`` and
        writes ``incidence_matrix.csv`` to the current directory.
        """
        self.__create_set_words()

        # A sorted vocabulary gives deterministic rows and keeps
        # ``self.matrix`` aligned with the (sorted) dataframe.
        vocabulary = sorted(self.words)
        self._word_index = {word: row for row, word in enumerate(vocabulary)}
        self.matrix = np.zeros((len(vocabulary), len(self.files)), dtype=int)

        for column, file in enumerate(self.files):
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
            # Only presence matters, so each distinct token is handled once.
            for word in set(nltk.word_tokenize(text)):
                row = self._word_index.get(word)
                if row is not None:
                    self.matrix[row, column] = 1

        self.dataframe = pd.DataFrame(self.matrix, index=vocabulary, columns=self.files)
        self.dataframe = self.dataframe.sort_index()

        self.dataframe.to_csv('incidence_matrix.csv')

    def print_pretty(self):
        """Print the incidence matrix as a labelled table."""
        print(self.dataframe)

    def M(self, palabra):
        """Return (and print) the incidence row of a term.

        Args:
            palabra: Term to look up.

        Returns:
            The matrix row for ``palabra`` (one 0/1 entry per document). If
            the term is unknown or the matrix has not been built yet, the
            list ``['No existe la palabra', <exception>]`` is returned
            instead.
        """
        try:
            vector = self.matrix[self._word_index[palabra]]
        except (KeyError, TypeError) as e:
            # KeyError: unknown term. TypeError: matrix not built yet
            # (``None``) or an unhashable ``palabra``.
            return ['No existe la palabra', e]
        print(palabra + ': ', vector)
        return vector

    def AND(self, word1, word2):
        """Element-wise logical AND of two incidence vectors.

        Args:
            word1: First vector (array or sequence of 0/1 or booleans).
            word2: Second vector, same length as ``word1``.

        Returns:
            Boolean ``numpy`` array, or the string
            ``'No se pueden comparar'`` when the lengths differ.
        """
        if len(word1) != len(word2):
            return 'No se pueden comparar'
        # asarray lets plain lists be used as operands too.
        left, right = np.asarray(word1), np.asarray(word2)
        res = np.logical_and(left, right)
        print(f"{left.tolist()} AND {right.tolist()}: {res.astype(int).tolist()}")
        return res

    def OR(self, word1, word2):
        """Element-wise logical OR of two incidence vectors.

        Args:
            word1: First vector (array or sequence of 0/1 or booleans).
            word2: Second vector, same length as ``word1``.

        Returns:
            Boolean ``numpy`` array, or the string
            ``'No se pueden comparar'`` when the lengths differ.
        """
        if len(word1) != len(word2):
            return 'No se pueden comparar'
        left, right = np.asarray(word1), np.asarray(word2)
        res = np.logical_or(left, right)
        print(f"{left.tolist()} OR {right.tolist()}: {res.astype(int).tolist()}")
        return res

    def NOT(self, word1):
        """Element-wise logical negation of an incidence vector.

        Args:
            word1: Vector (array or sequence of 0/1 or booleans).

        Returns:
            Boolean ``numpy`` array with every entry inverted.
        """
        operand = np.asarray(word1)
        res = np.logical_not(operand)
        print(f"NOT {operand.tolist()}: {res.astype(int).tolist()}")
        return res

    def query(self, expresion):
        """Translate a boolean result vector into the matching file names.

        Args:
            expresion: Sequence with one truthy/falsy entry per document.

        Returns:
            List with the entries of ``self.files`` whose position holds a
            truthy value in ``expresion``.

        Raises:
            IndexError: If ``expresion`` has a truthy entry beyond the
                number of files.
        """
        return [self.files[i] for i, hit in enumerate(expresion) if hit]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 2- Tokenizar y eliminar stopwords

In [32]:
# Documents of the collection. BoolModel stores a reference to this same list
# and preprocesar() replaces each entry with the path of its "_clean.txt" copy.
files = ['docs/libro1.txt', 'docs/libro2.txt', 'docs/libro3.txt', 'docs/libro4.txt', 'docs/libro5.txt', 'docs/libro6.txt']

# The constructor also loads the stop-word list from docs/stoplist.txt.
f = BoolModel(files)
# Lowercases every file on disk, then writes a stop-word-free copy of each one.
f.preprocesar()

## 3- Construir la matriz de incidencia y guardar en csv

In [33]:
# Builds the term-document matrix from the cleaned files and also writes it
# to incidence_matrix.csv.
f.create_incidence_matrix()
# Prints the pandas DataFrame (terms as rows, files as columns).
f.print_pretty()

         docs/libro1_clean.txt  docs/libro2_clean.txt  docs/libro3_clean.txt  \
111º                         1                      0                      0   
``                           0                      0                      0   
abajo                        0                      0                      0   
abierto                      0                      0                      0   
abismo                       0                      1                      1   
...                        ...                    ...                    ...   
éomer                        0                      0                      1   
éored                        0                      0                      1   
éowyn                        0                      0                      0   
órdenes                      0                      0                      0   
único                        1                      0                      0   

         docs/libro4_clean.txt  docs/li

## 4- Aplicando consultas textuales

### Consulta 1: éomer AND éored OR abierto AND acaba

In [35]:
# Query 1: (éomer AND éored) OR (abierto AND acaba)
eomer_row = f.M('éomer')
eored_row = f.M('éored')
a = f.AND(eomer_row, eored_row)

abierto_row = f.M('abierto')
acaba_row = f.M('acaba')
b = f.AND(abierto_row, acaba_row)

c = f.OR(a, b)

# Map the resulting boolean vector back to the matching file names.
matching_files = f.query(c)
print(matching_files)

éomer:  [0 0 1 0 1 1]
éored:  [0 0 1 0 0 0]
[0, 0, 1, 0, 1, 1] AND [0, 0, 1, 0, 0, 0]: [0, 0, 1, 0, 0, 0]
abierto:  [0 0 0 0 1 0]
acaba:  [1 0 0 1 1 0]
[0, 0, 0, 0, 1, 0] AND [1, 0, 0, 1, 1, 0]: [0, 0, 0, 0, 1, 0]
[False, False, True, False, False, False] OR [False, False, False, False, True, False]: [0, 0, 1, 0, 1, 0]
['docs/libro3_clean.txt', 'docs/libro5_clean.txt']


### Consulta 2: NOT anillo AND NOT hobbit

In [10]:
# Query 2: (NOT anillo) AND (NOT hobbit)
anillo_row = f.M('anillo')
a = f.NOT(anillo_row)

hobbit_row = f.M('hobbit')
b = f.NOT(hobbit_row)

c = f.AND(a, b)

# Map the resulting boolean vector back to the matching file names.
matching_files = f.query(c)
print(matching_files)

['docs/libro5_clean.txt']


### Consulta 3: éowyn AND aparecer AND apoderarse OR NOT aragorn

In [13]:
# Query 3: (éowyn AND aparecer AND apoderarse) OR (NOT aragorn)
eowyn_row = f.M('éowyn')
aparecer_row = f.M('aparecer')
a = f.AND(eowyn_row, aparecer_row)

apoderarse_row = f.M('apoderarse')
b = f.AND(a, apoderarse_row)

aragorn_row = f.M('aragorn')
not_aragorn = f.NOT(aragorn_row)
c = f.OR(b, not_aragorn)

# Map the resulting boolean vector back to the matching file names.
matching_files = f.query(c)
print(matching_files)

['docs/libro4_clean.txt']


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=fd776e8a-6c64-4897-baf8-1d04b601adab' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>