In [85]:
import pandas as pd
import numpy as np
import re

#NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words

#spaCy
import spacy

#SKlearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [86]:
books = pd.read_csv('/Users/javm/Desktop/Projects/Sistema-Recomendacion-Libros/books.csv')
books.dropna(inplace = True)
books['full_text'] = books['title'] + ' ' + books['publisher'] + ' ' + books['blurb']
books

Unnamed: 0,title,authors,publisher,blurb,full_text
0,Generación idiota: Una crítica al adolescentrismo,Agustin Laje,HarperCollins Mexico,El controversial escritor Agustín Laje adviert...,Generación idiota: Una crítica al adolescentri...
1,Spare: En la sombra,Prince Harry The Duke of Sussex,PRH Grupo Editorial,La controversial autobiografía del príncipe Ha...,Spare: En la sombra PRH Grupo Editorial La con...
2,Volver a empezar / It Starts with Us (Spanish ...,Colleen Hoover,Planeta Publishing Corporation,¡Colleen Hoover nos brinda su magia nuevamente...,Volver a empezar / It Starts with Us (Spanish ...
3,¡Vámonos a la Estufa! con Janet Jauja Cocina M...,Kushner Janet,Larousse,"Jauja, la popular chef estrella de YouTube, no...",¡Vámonos a la Estufa! con Janet Jauja Cocina M...
4,RUGE: O espera a ser devorado (Spanish Edition),Daniel Habif,Editorial Planeta Mexicana S.A. de C.V.,Con su clásico agresivo pero muy alentador est...,RUGE: O espera a ser devorado (Spanish Edition...
...,...,...,...,...,...
977,EL DR. SILKWORTH EN ALCOHÓLICOS ANÓNIMOS,Oslos Molina Oslos Molina,Experiencias AA,Bill decía frecuentemente que el programa de A...,EL DR. SILKWORTH EN ALCOHÓLICOS ANÓNIMOS Exper...
978,La Inspiración de Rumi: 100 Citas Para Elevar ...,David Smith,Independently published,"En este libro, te sumergirás en la sabiduría y...",La Inspiración de Rumi: 100 Citas Para Elevar ...
979,El Derecho como objeto de investigación: Enfoq...,Elmer Arce Ortíz,Palestra Editores,Este libro es una presentación de la investiga...,El Derecho como objeto de investigación: Enfoq...
980,Reflexiones jurídicas sobre la Jurisdicción Es...,Jaime Bernal Cuéllar,Universidad Externado,El libro Reflexiones jurídicas sobre la Jurisd...,Reflexiones jurídicas sobre la Jurisdicción Es...


In [87]:

def clean_and_lemmatize(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove scraping notes from text
    text = text.replace('spanish edition', '')
    text = text.replace('overview\nnotes from your bookseller\n', '')
    text = text.replace('overview\n', '')
    text = text.replace('\n', ' ')
    text = text.replace('@', '')

    # Remove non-word characters (including punctuation) from text
    pattern = r'[¡!¿?.,:;()\-—«»“”‘’\[\]{}\/\'\"\d]'
    text = re.sub(pattern, '', text)

    # Remove stop words from text
    stop_words = set(stopwords.words("spanish"))
    text = " ".join([word for word in nltk.word_tokenize(text) if word.lower() not in stop_words])

    # Lemmatize text
    nlp = spacy.load('es_core_news_sm')
    doc = nlp(text)
    text = ' '.join([token.lemma_ for token in doc])

    # Return the cleaned and lemmatized text as a string
    return text


In [88]:
books['full_text'] = books['full_text'].apply(clean_and_lemmatize)

In [89]:
books['authors'][2]

' Colleen Hoover'

In [90]:
tf_vec = TfidfVectorizer(min_df = 3)

In [91]:
text = books['full_text']
vectors = tf_vec.fit_transform(text)

In [92]:
vec_df = pd.DataFrame(vectors.toarray(),
                     columns=[k for k, v in sorted(tf_vec.vocabulary_.items(), 
                     key=lambda item: item[1])])

In [93]:
vec_df

Unnamed: 0,aar,abajo,abandonado,abandonar,abandoned,abandono,abarca,abarcar,abc,abierto,...,íntimo,ón,óptimo,órden,órgano,último,únicamente,único,ús,útil
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083708,...,0.000000,0.0,0.0,0.0,0.0,0.056179,0.0,0.0,0.0,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.050798,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.047501,0.0,0.0,0.0,0.000000
978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.054782
980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000


In [94]:
book_search = input('What are you looking for: ')

In [95]:
print(book_search)

amigos juegos aventuras


In [96]:
search = clean_and_lemmatize(book_search)
search_vec = tf_vec.transform([search])

In [97]:
test = cosine_similarity(search_vec, vectors)

In [98]:
best_book = np.argmax(test[0])

print('Title : {}'.format(books.loc[best_book, 'title']))
print('Author : {}'.format(books.loc[best_book, 'authors']))
print('Publisher : {}'.format(books.loc[best_book, 'publisher']))
print('Blurb : {}'.format(books.loc[best_book, 'blurb']))

Title : La Reina Druida: (El Sendero del Guardabosques, Libro 16)
Author :  Pedro Urvi
Publisher : Independently published
Blurb : Las Panteras tendrán que proteger a la futura reina de Norghana. Una labor nada agradable y muy peligrosa.  Hay intereses politicos en juego y algunos reinos y facciones no ven la nueva alianza entre Norghana e Irinel con buenos ojos. Hay quienes quieren que la reina druida no alcance el trono.  Por si eso no fuera suficiente, otro peligro más importante acecha a Norghana y a todo Tremia. Un peligro de enormes dimensiones que amenaza con descender y arrasar reinos enteros.  Una nueva aventura épica aguarda a nuestros amigos. Una de muy dificil solución.  ¡Disfruta de unas aventuras llenas de acción, aventura, magia y romance!  Fantasía épica para toda la familia


In [83]:
import numpy as np

# Assume `test` is a numpy array containing the test results
# and `books` is a pandas dataframe containing book data

# Get the top k indices of the highest values in `test`
k = 5
top_k = np.argpartition(test[0], -k)[-k:]

# Print information about each book
for i in top_k:
    print('Title : {}'.format(books.loc[i, 'title']))
    print('Author : {}'.format(books.loc[i, 'authors']))
    print('Publisher : {}'.format(books.loc[i, 'publisher']))
    print('Blurb : {}'.format(books.loc[i, 'blurb']))
    print()


Title : Diario de Junior, El
Author :               James Patterson    Steven Butler    Richard Watson  
Publisher : Duomo ediciones
Blurb : ¡Finalmente puedo contar mi historia! Ser el perro de Rafe no siempre es fácil... ¡pero nunca es aburrido! Tengo mucho que contaros sobre: Cómo protejo el patio de pájaros, mapaches, ardillas, mapaches... ¿y mencioné a los…mapaches? La mejor manera de olfatear el trasero de un perro para descubrir las últimas noticias caninas. El monstruo aterrador del armario del pasillo: ¡la aspiradora! Pero lo peor de todo, la malvada Señora Stricker me mandará de nuevo a la perrera si no aprendo a comportarme. ¿Qué debe hacer un chucho como yo? I can finally tell my story! Being Rafe's dog isn't always easy... but it's never boring! I have a lot to tell you about: how I protect my yard from birds, raccoons, squirrels, raccoons… and did I mention… raccoons? The best way to sniff a dog's butt to find out the latest canine news. The scary monster in the hall clos

In [None]:
# nlp = spacy.load('es_core_news_sm')

# def clean_and_lemmatize(df, column_name):
#     # Convert text to lowercase
#     df[column_name] = df[column_name].str.lower()

#     # Remove scraping notes from text
#     df[column_name] = df[column_name].str.replace('spanish edition', '')
#     df[column_name] = df[column_name].str.replace('overview\nnotes from your bookseller\n', '')
#     df[column_name] = df[column_name].str.replace('overview\n', '')
#     df[column_name] = df[column_name].str.replace('\n', ' ')
#     df[column_name] = df[column_name].str.replace('@', '')

#     # Remove non-word characters (including punctuation) from text
#     pattern = r'[¡!¿?.,:;()\-—«»“”‘’\[\]{}\/\'\"\d]'
#     df[column_name] = df[column_name].apply(lambda x: re.sub(pattern, '', str(x)))

#     # Remove stop words from text
#     stop_words = set(stopwords.words("spanish"))
#     df[column_name] = df[column_name].apply(lambda x: " ".join([word for word in nltk.word_tokenize(x) if word.lower() not in stop_words]))

#     # Lemmatize text
#     df[column_name] = df[column_name].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))

#     return df



# DIFFICULT TO PROCESS INPUT TEXT



In [None]:
# import spacy
# nlp = spacy.load('es_core_news_sm')

# def spacy_tag_and_lemmatize(text_column):
#     tagged_lemmatized = []
#     for text in text_column:
#         doc = nlp(text)
#         tagged_lemmas = ' '.join([token.lemma_ + '_' + token.pos_ for token in doc])
#         tagged_lemmatized.append(tagged_lemmas)
#     return tagged_lemmatized
