# Laboratorio 7.2

In [1]:
import nltk
import numpy as np
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## P3- Matriz de similitudes
### Elabore una matriz de similitud de coseno entre los documentos de la colección "El Señor de los Anillos". Debe aplicar los pesos TF-IDF.
### 1. Preprocesamiento

In [2]:
def preprocesamiento(texto):
    words = nltk.word_tokenize(texto)

    words = [word for word in words if word.isalnum()]

    stopwords = nltk.corpus.stopwords.words('spanish')
    words = [word for word in words if word.lower() not in stopwords]
  
    stemmer = SnowballStemmer('spanish')
    words = [stemmer.stem(word) for word in words]

    return words

textos = ["libro1.txt","libro2.txt","libro3.txt","libro4.txt","libro5.txt","libro6.txt"]
textos_procesados = []
indice = {}
for file_name in textos:
  file = open(file_name, encoding="utf-8")
  texto = file.read().rstrip()
  texto = preprocesamiento(texto)  
  textos_procesados.append(texto)

### 2- Similitud de coseno

In [3]:
def compute_tfidf(collection):
    index = {}
    for i, doc in enumerate(collection):
        for word in doc:
            if word not in index:
                index[word] = []
            index[word].append(i)

    tfidf = []
    for doc in collection:
        doc_tfidf = []
        for word in index.keys():
            tf = doc.count(word)
            idf = np.log(len(collection) / len(index[word]))
            doc_tfidf.append(tf * idf)
        tfidf.append(doc_tfidf)
    return tfidf

def cosine_sim(Q, Doc):  
    Q = np.array(Q)
    Doc = np.array(Doc)
    return np.dot(Q, Doc) / (np.linalg.norm(Q) * np.linalg.norm(Doc))
  
textos_tfidf = compute_tfidf(textos_procesados)

def print_pretty_matrix(matrix):
  for row in matrix:
    for value in row:
      print("{:.2f}".format(value), end=" ")
    print()

matriz = []
for doc1 in textos_tfidf:
  row = []
  for doc2 in textos_tfidf:  
    row.append(cosine_sim(doc1, doc2))
  matriz.append(row)

print_pretty_matrix(matriz)

1.00 0.54 0.41 0.48 0.36 0.59 
0.54 1.00 0.39 0.44 0.37 0.54 
0.41 0.39 1.00 0.18 0.40 0.34 
0.48 0.44 0.18 1.00 0.30 0.59 
0.36 0.37 0.40 0.30 1.00 0.34 
0.59 0.54 0.34 0.59 0.34 1.00 


## P4- Indice invertido con similitud de coseno

### 1. Estructura del índice invertido en Python:

In [6]:
import json
import numpy as np
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
from sklearn.preprocessing import normalize

nltk.download('punkt')
nltk.download('stopwords')

class InvertIndex:
    
    def __init__(self, index_file):
        self.index_file = index_file
        self.index = {}
        self.idf = {}
        self.length = {}

    def __preprocesamiento(self, texto):
        words = nltk.word_tokenize(texto)
        words = [word for word in words if word.isalnum()]
        stopwords = nltk.corpus.stopwords.words('spanish')
        words = [word for word in words if word.lower() not in stopwords]
        stemmer = SnowballStemmer('spanish')
        words = [stemmer.stem(word) for word in words]
        return words
    
    def __compute_tf(self, collection):
        index = {}
        for i, doc in enumerate(collection):
            for word in doc:
                if word not in index:
                    index[word] = []
                index[word].append(i)
        return index
    
    def __compute_idf(self, collection):
        idf = {}
        for word in self.index.keys():
            idf[word] = np.log(len(collection) / len(self.index[word]))
        return idf
    
    def __load_index(self, index_file):
        self.index = json.load(open(index_file, 'r'))
        self.idf = json.load(open(index_file.replace("index", "idf"), 'r'))
        self.length = json.load(open(index_file.replace("index", "length"), 'r'))

    def __save_index(self, index_file):
        json.dump(self.index, open(index_file, 'w'))
        json.dump(self.idf, open(index_file.replace("index", "idf"), 'w'))
        json.dump(self.length, open(index_file.replace("index", "length"), 'w'))

    def building(self, collection_text):
        preprocesamiento = [self.__preprocesamiento(texto) for texto in collection_text]
        self.index = self.__compute_tf(preprocesamiento)
        self.idf = self.__compute_idf(preprocesamiento)
        
        tf_idf_matrix = []
        for doc in preprocesamiento:
            tf_idf_vector = []
            for word in self.index.keys():
                tf = doc.count(word)
                tf_idf_vector.append(tf * self.idf.get(word, 0))
            tf_idf_matrix.append(tf_idf_vector)
        
        self.length = np.linalg.norm(tf_idf_matrix, axis=1).tolist()
        
        self.__save_index(self.index_file)
        print("Index built and saved in", self.index_file)

    def retrieval(self, query, k):
        self.__load_index(self.index_file)
        score = {}
        query = self.__preprocesamiento(query)
        
        query_tf_idf = np.zeros(len(self.index))
        for i, word in enumerate(self.index.keys()):
            if word in query:
                tf = query.count(word)
                query_tf_idf[i] = tf * self.idf.get(word, 0)
        
        query_length = np.linalg.norm(query_tf_idf)
        
        for i in range(len(self.length)):
            doc_vector = np.zeros(len(self.index))
            for word in query:
                if word in self.index:
                    for doc_id in self.index[word]:
                        if doc_id == i:
                            tf = self.index[word].count(doc_id)
                            doc_vector[list(self.index.keys()).index(word)] = tf * self.idf[word]
            doc_length = self.length[i]
            cosine_similarity = np.dot(query_tf_idf, doc_vector) / (query_length * doc_length)
            score[i] = cosine_similarity
        
        result = sorted(score.items(), key=lambda tup: tup[1], reverse=True)
        return result[:k]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Paso 2:	Probar el Índice

In [7]:
dataton = pd.read_csv('df_total.csv')

collection_text = dataton['news']
index = InvertIndex("index.txt")
index.building(collection_text)

Query1 = "El regulador de valores de China"
result = index.retrieval(Query1, 10)
print([r[0] for r in result])


Index built and saved in index.txt


  cosine_similarity = np.dot(query_tf_idf, doc_vector) / (query_length * doc_length)


[1, 15, 409, 126, 748, 14, 34, 52, 91, 113]
