IMPORT, READ_CSV

In [41]:
#Standard imports
import pandas as pd
import string
import re
import numpy as np

#SKlearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


#NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words

In [42]:
# Location of the books-with-blurbs dataset.
# NOTE(review): this is an absolute path on one machine; adjust it when running elsewhere.
BOOKS_CSV_PATH = '/Users/javm/Desktop/Projects/Book-Recommendations/books_with_blurbs.csv'
df = pd.read_csv(BOOKS_CSV_PATH)

TEXT COMPILATION

In [43]:
# Build one free-text field per book from its descriptive columns, separated
# by single spaces.
# Missing values are replaced with '' first: with plain `+`, a single NaN in
# any of the columns would turn the whole row's text into NaN (a float) and
# discard the fields that are present.
text_columns = ['Title', 'Author', 'Publisher', 'Blurb']
text_parts = [df[column].fillna('').astype(str) for column in text_columns]
df['text'] = text_parts[0].str.cat(text_parts[1:], sep=' ')

In [44]:
# Preview the combined text column (pandas abbreviates the middle rows).
df['text']

0        Decision in Normandy Carlo D'Este HarperPerenn...
1        Flu: The Story of the Great Influenza Pandemic...
2        The Kitchen God's Wife Amy Tan Putnam Pub Grou...
3        What If?: The World's Foremost Military Histor...
4        Goodbye to the Buttermilk Sky Julia Oliver Riv...
                               ...                        
57505    Tainted Trail Wen Spencer Roc Ukiah Oregon, ha...
57506    Twelve Mile Limit Randy Wayne White Penguin Pu...
57507    The Man With the Red Tattoo (James Bond 007) R...
57508    Iron Fist (Star Wars: X-Wing Series, Book 6) A...
57509    The Adventures of Lando Calrissian: Lando Calr...
Name: text, Length: 57510, dtype: object

PREPROCESSING

In [45]:
# Cache for the English stop-word set; filled on first use so the corpus file
# is read once instead of once per call.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def remove_stop_words(sentence):
    """Return ``sentence`` with English stop words removed.

    The sentence is tokenized with ``nltk.word_tokenize``; a token is dropped
    when its lowercase form is in the NLTK English stop-word list. The
    remaining tokens keep their original casing and are joined with single
    spaces.

    Parameters
    ----------
    sentence : str
        Raw text to filter.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    stop_words = _english_stop_words()
    tokens = nltk.word_tokenize(sentence)
    return " ".join(token for token in tokens if token.lower() not in stop_words)


In [46]:
# Translation table that deletes every character in string.punctuation.
# Built once; str.translate then strips punctuation in a single pass instead
# of growing a new string one character at a time.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punkt(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    other symbols (for example a typographic apostrophe) are left in place.
    Nothing is inserted where a character is deleted.

    Parameters
    ----------
    text : str
        Text to strip.

    Returns
    -------
    str
        ``text`` without ASCII punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)


In [47]:
#function used to replace non-word characters with spaces
def remove_non_word_characters(text):
    r"""Collapse each run of non-word characters in ``text`` into one space.

    A non-word character is anything the regex class ``\W`` matches, so
    punctuation and whitespace are both affected, while letters, digits and
    underscores are kept. Runs are replaced by a single space, not deleted.
    """
    return re.sub(r'\W+', ' ', text)

In [48]:
#convert text to lowercase
def lower_all(text):
    """Return a lowercase copy of ``text``."""
    return text.lower()

In [49]:
#lemmatize each token of the text
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments, and the results are joined back together with single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(map(lemmatize, text.split()))


In [50]:
#replace everything except ASCII letters with spaces
def keep_alphabetical(text):
    """Replace each run of characters outside ``a-z``/``A-Z`` with one space.

    Digits, punctuation, whitespace and non-ASCII letters all count as
    non-alphabetic here; every consecutive run of them becomes a single space.
    """
    non_letters = re.compile(r'[^a-zA-Z]+')
    return non_letters.sub(' ', text)

In [51]:
#function to keep only English language words
english_words = set(words.words())

def extract_english_words(text):
    """Return ``text`` reduced to the tokens found in the NLTK word list.

    ``text`` is split on whitespace and a token is kept when its lowercase
    form is a member of ``english_words``. Kept tokens retain their original
    casing.

    The result is a space-joined string, matching the other preprocessing
    steps, which all take and return strings. Returning a lazy generator here
    would hand the next step an object that is not text and that can only be
    consumed once.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The kept tokens joined by single spaces ('' when none match).
    """
    # Named `tokens` so the local does not shadow the imported `words` corpus.
    tokens = text.split()
    return " ".join(token for token in tokens if token.lower() in english_words)


In [52]:
def preprocess_text(sentence):
    """Run ``sentence`` through the text-cleaning steps and return the result.

    The steps are applied in this fixed order, each receiving the previous
    step's output: stop-word removal, punctuation removal, non-word character
    replacement, lowercasing, lemmatization, and finally replacement of
    anything that is not an ASCII letter.
    """
    # extract_english_words is currently disabled and not part of the pipeline.
    pipeline = (
        remove_stop_words,
        remove_punkt,
        remove_non_word_characters,
        lower_all,
        lemmatize_text,
        keep_alphabetical,
    )
    for step in pipeline:
        sentence = step(sentence)
    return sentence

In [53]:
# Clean every book's combined text element by element.
# Note: this overwrites the raw 'text' column with its preprocessed form.
df['text'] = df['text'].map(preprocess_text)

In [54]:
# Spot-check one preprocessed entry (row label 35).
df.loc[35, 'text']

'haveli laurel leaf book suzanne fisher staple laurel leaf world newbery honor book shabanu vividly recreated novel young pakistani woman s heartbreaking struggle tyranny custom ancient law shabanu mother face daily challenge position husband s household even plan young daughter s education uncertain future visit haveli home city lahore shabanu fall love omar spite tradition forbid union'

In [55]:
# Alias for the preprocessed text column; this is the corpus the vectorizer is fitted on.
text = df['text']

In [56]:
# TF-IDF vectorizer; min_df = 3 ignores terms that occur in fewer than 3 documents.
tf_vec = TfidfVectorizer(min_df = 3)

In [57]:
# Learn the vocabulary and IDF weights from the corpus and vectorize it
# (one row per entry of `text`).
vectors = tf_vec.fit_transform(text)

In [58]:
# Tabular view of the TF-IDF matrix: one row per book, one column per term.
# The frame is built directly from the sparse matrix. vectors.toarray() would
# allocate a dense n_books x n_terms float array that is almost entirely
# zeros, which for a corpus of this size can exhaust memory.
# Sorting the vocabulary by its column index makes the labels line up with
# the matrix columns.
feature_names = sorted(tf_vec.vocabulary_, key=tf_vec.vocabulary_.get)
vec_df = pd.DataFrame.sparse.from_spmatrix(vectors, columns=feature_names)
                     

In [59]:
# Preview the term-weight table (pandas abbreviates rows and columns).
vec_df

Unnamed: 0,aa,aaa,aahz,aan,aardvark,aaron,ab,aba,aback,abacus,...,zweig,zweigniederlassung,zweimal,zweite,zweiten,zwinger,zwischen,zwischenmenschlichen,zyklus,zz
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57505,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57506,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57507,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57508,0.0,0.0,0.0,0.0,0.0,0.092546,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# Prompt for a free-text search query (interactive: blocks until the user answers).
book_search = input('What are you looking for: ')


In [66]:
# Echo the query that was entered.
print(book_search)

leopard vinyl essence thigh


In [67]:
# Clean the query with the same preprocessing applied to the corpus, then
# project it with the already-fitted vectorizer (transform only, no refit).
search = preprocess_text(book_search)
search_vec = tf_vec.transform([search])

In [68]:
# Cosine similarity between the query vector and every book vector; the
# result has a single row because exactly one query was transformed.
test = cosine_similarity(search_vec, vectors)

In [69]:
# Pick the book whose TF-IDF vector is most similar to the query and report it.
scores = test[0]
best_book = int(np.argmax(scores))

if scores[best_book] <= 0:
    # Every similarity is 0 when no query term is in the fitted vocabulary;
    # argmax would then return position 0 and present an unrelated book as
    # the best match.
    print('No matching book found for: {}'.format(book_search))
else:
    # argmax yields a row position in `vectors`, so look the book up by
    # position (iloc) rather than by index label.
    match = df.iloc[best_book]
    for field in ('Title', 'Author', 'Publisher', 'Blurb'):
        print('{} : {}'.format(field, match[field]))

Title : The Vinyl Cafe Unplugged
Author : Stuart McLean
Publisher : Penguin Books
Blurb : Why is Morley skulking around with a man named Frank on the eve of her 40th birthday? What grisly secret is Stephanie hiding in her father’s picnic cooler? And exactly what is Dave doing by himself in a Halifax hotel room with a duck? In the pages of the Vinyl Cafe Diaries, humorist Stuart McLean answers these questions and reveals more strange, shocking, and above all, entertaining truths about the seemingly ordinary folk of the Vinyl Cafe.
