In [1]:
#Importing Libraries
import re
import ast
import string
import pandas as pd
import numpy as np
import random
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from scipy import sparse
from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#Importing the descriptions into dataframe
df = pd.read_csv('csv/good_bid_good_desc.csv')
df_final = pd.read_csv('csv/Final_Dataset.csv')
df = df.replace('[nan]','')
df['good_title'] = df_final['good_title']
df['good_aname'] = df_final['good_aname']
df.head()

Unnamed: 0,good_bid,good_desc,good_title,good_aname
0,9542963,['John F. Kennedy\'s inaugural address of 1961...,John F. Kennedy: The Inaugural Address,John F. Kennedy
1,11842413,,Give Me Liberty Or Give Me Death,Patrick Henry
2,18996330,,Abraham Lincoln's Second Inaugural Address,Abraham Lincoln
3,836116,"['Source of legend and lyric, reference and co...",Alice's Adventures in Wonderland,Lewis Carroll
4,108014,"[""In 1865, English author Charles Lutwidge Dod...",Through the Looking Glass,Lewis Carroll


In [3]:
#Cleaning the description - Text preprocessing
# Function for removing NonAscii characters
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

# Function for converting into lower case
def make_lower_case(text):
    return text.lower()

# Function for removing stop words
def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

# Function for removing punctuation
def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

# Function for removing the html tags
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

# Function to clean
def clean(df, column_name):
    df['cleaned_'+column_name] = df[column_name].apply(_removeNonAscii)
    df['cleaned_'+column_name] = df['cleaned_'+column_name].apply(func = make_lower_case)
    df['cleaned_'+column_name] = df['cleaned_'+column_name].apply(func = remove_stop_words)
    df['cleaned_'+column_name] = df['cleaned_'+column_name].apply(func=remove_punctuation)
    df['cleaned_'+column_name] = df['cleaned_'+column_name].apply(func=remove_html)
    return df

In [4]:
df = clean(df, 'good_desc')
df = clean(df, 'good_aname')
df = clean(df, 'good_title')

In [5]:
df.head()

Unnamed: 0,good_bid,good_desc,good_title,good_aname,cleaned_good_desc,cleaned_good_aname,cleaned_good_title
0,9542963,['John F. Kennedy\'s inaugural address of 1961...,John F. Kennedy: The Inaugural Address,John F. Kennedy,john f kennedy s inaugural address 1961 unforg...,john f kennedy,john f kennedy inaugural address
1,11842413,,Give Me Liberty Or Give Me Death,Patrick Henry,,patrick henry,give liberty give death
2,18996330,,Abraham Lincoln's Second Inaugural Address,Abraham Lincoln,,abraham lincoln,abraham lincoln s second inaugural address
3,836116,"['Source of legend and lyric, reference and co...",Alice's Adventures in Wonderland,Lewis Carroll,source legend lyric reference conjecture alice...,lewis carroll,alice s adventures wonderland
4,108014,"[""In 1865, English author Charles Lutwidge Dod...",Through the Looking Glass,Lewis Carroll,in 1865 english author charles lutwidge dodgso...,lewis carroll,looking glass


## TF-IDF

In [6]:
desc_list=list(df['cleaned_good_desc'])
title_list=list(df['cleaned_good_title'])
auth_list=list(df['cleaned_good_aname'])

In [7]:
tf = TfidfVectorizer(analyzer='word', stop_words='english')

In [8]:
for i in range(len(desc_list)):
    desc_list[i] = desc_list[i] + ' '+ title_list[i] + ' ' + auth_list[i]
len(desc_list)
tfidf_matrix1 = tf.fit_transform(desc_list)

In [9]:

tfidf_desc = tf.fit_transform(desc_list)
tfidf_title = tf.fit_transform(title_list)
tfidf_auth = tf.fit_transform(auth_list)

In [10]:
tfidf_matrix = hstack([0.2*tfidf_desc, 0.7*tfidf_title, 0.1*tfidf_auth], format='csr')
tfidf_matrix

<5993x45926 sparse matrix of type '<class 'numpy.float64'>'
	with 281440 stored elements in Compressed Sparse Row format>

In [11]:
#Importing user books interactions
def load_data_into_matrix():
    #Importing dataset into dataframe
    data = pd.read_csv('csv/averaged_num_ratings.csv')

    #Converting column of string(dictionaries {user:rating}) into a list of dictionaries
    dict_list=[]
    for index, row in data.iterrows():
        temp_dict = ast.literal_eval(row['averaged_num_ratings'])
        dict_list.append(temp_dict)

    #Transforming list of dictionaries into numpy array using DictVectorizer
    v = DictVectorizer(sparse=True)
    D = dict_list
    X = v.fit_transform(D)
    #print('Shape of matrix:',X.shape)
    #print(X)
    #print('rows = books, columns = users')
    return X

#Function which removes self index from given lists
def remove_self(sim_len, sim_books, bids):
    books = []
    sim = []
    for i in range(0,len(sim_books)):
        if sim_books[i] not in bids:
            books.append(sim_books[i])
            sim.append(sim_len[i])
    df = pd.DataFrame({'books':books,'sim':sim})
    return df

#Function to get book meta data(id and title) from goodreads dataset
def get_books_details(index_list):
    df_temp = pd.read_csv('csv/Final_Dataset.csv')
    df_bid = pd.read_csv('csv/good_bid_index_bid.csv')
    good_bid_list=[]
    title_list=[]
    for i in index_list:
        good_bid = df_bid['good_bid'].loc[i]
        good_bid_list.append(good_bid)
        title_list.append(str(list(df_temp['good_title'].loc[df_temp['good_bid']==good_bid])))
    return good_bid_list, title_list

def predict(userid, matrix, tfidf_matrix, num_recommendations=10):

    #get most liked books
    bids = list(np.argsort(matrix[userid].toarray())[0])[::-1][:5] #np array to list & top 5 books
    #bids = bids[-5:] #top 5 books

    #for each book get similar books by using cosine similarity between tf idf vectors
    sim_books = []
    sim_len = []
    for bid in bids:
        cosine_sim = cosine_similarity(tfidf_matrix[bid],tfidf_matrix)
        sim = list(np.argsort(cosine_sim)[0])[::-1][:5]
        #sim = sim[-5:]
        for s in sim:
            sim_len.append(cosine_sim[0][s])
        sim_books.extend(sim)

    #remove self index and sim lengths and sort according to similarity
    books_sim_df = remove_self(sim_len, sim_books, bids)
    books_sim_df = books_sim_df.sort_values(by=['sim'], ascending=False)
    books_sim_df = books_sim_df[:num_recommendations]


    #get_book_details
    bid, title = get_books_details(list(books_sim_df['books']))
    recommendations = pd.DataFrame({'good_bid':bid, 'Title':title, 'Matrix_index':list(books_sim_df['books']), 'Similarity':list(books_sim_df['sim'])})
    return recommendations

def manipulate(user_id, matrix, max_rating=3, num_books=3):
    rated_books = list(np.where(matrix[user_id]>=max_rating)[0])
    print(rated_books)
    random_selection = random.sample(rated_books, num_books)
    print(random_selection)
    for book in random_selection:
        matrix[user_id][book]=0
    print(np.where(matrix[user_id]>=max_rating)[0])
    return matrix

In [12]:
matrix = load_data_into_matrix()
matrix = matrix.T
matrix.shape

(433249, 5993)

In [13]:
matrix

<433249x5993 sparse matrix of type '<class 'numpy.float64'>'
	with 3261272 stored elements in Compressed Sparse Column format>

In [14]:
predict(0, matrix, tfidf_matrix, 10)

Unnamed: 0,good_bid,Title,Matrix_index,Similarity
0,10356101,['The Memoirs of Sherlock Holmes'],412,0.760605
1,640380,['Return Of Sherlock Holmes'],62,0.736856
2,20574541,['The Treasure'],1560,0.690517
3,7479376,['Sea Garden'],4506,0.487149
4,21045297,['Stolen Treasure'],2538,0.475888
5,6412074,['The Secret House'],4271,0.475631
6,9920114,['For Treasure Bound'],3735,0.465557
7,15846147,['The Island Mystery'],4604,0.461125
8,95109,['My Summer in a Garden'],1330,0.441661
9,2032127,['The Secret of the Night'],838,0.437889


In [15]:
predict(0, matrix, tfidf_matrix1, 10)

Unnamed: 0,good_bid,Title,Matrix_index,Similarity
0,161820,['The Croxley Master: A Great Tale of the Priz...,5290,0.40692
1,19438473,['The Adventure of the Dying Detective'],1078,0.355522
2,10319646,['The Tale of Benjamin Bunny'],2995,0.344506
3,640380,['Return Of Sherlock Holmes'],62,0.335264
4,18420587,['A Study in Scarlet'],137,0.308498
5,345183,['Great Big Treasury of Beatrix Potter'],303,0.275434
6,332470,['Essays in the Art of Writing'],256,0.241691
7,2263773,"['20,000 Leagues Under the Sea']",1988,0.238249
8,8436543,"['Collection of Beatrix Potter Stories, a']",308,0.231493
9,10638384,['Underwoods'],234,0.229753


In [16]:
dense_matrix = matrix.toarray()
dense_matrix.shape

(433249, 5993)

In [17]:
dense_matrix = manipulate(0, dense_matrix)

[16, 38, 55, 66, 68, 98, 101, 124, 126, 475, 541, 542, 669, 696, 756, 828, 1041, 1251, 1340, 1990, 2805, 3086]
[66, 68, 98]
[  16   38   55  101  124  126  475  541  542  669  696  756  828 1041
 1251 1340 1990 2805 3086]


In [18]:
sp = sparse.csr_matrix(dense_matrix)

In [19]:
predict(0, sp, tfidf_matrix, 10)

Unnamed: 0,good_bid,Title,Matrix_index,Similarity
0,16050933,"[""The Adventures of Huckleberry Finn Tom Sawye...",4854,0.66297
1,18940497,"['Adventures of Huckleberry Finn, Chapters 21 ...",2083,0.648165
2,7277831,"[""Wild Bill's Last Trail""]",3743,0.643343
3,9386880,['Wild Apples'],1540,0.546821
4,2263698,['Jeremy'],1419,0.526914
5,9850427,['After London or Wild England'],2936,0.525224
6,6493511,['Wild Life on the Rockies'],4490,0.518569
7,375712,['The Tale of Mr. Tod'],3594,0.385907
8,20623490,"[""W. A. G.'s Tale""]",2463,0.315108
9,835800,['Phineas Finn: The Irish Member'],3396,0.274658


In [20]:
predict(0, sp, tfidf_matrix1, 10)

Unnamed: 0,good_bid,Title,Matrix_index,Similarity
0,375712,['The Tale of Mr. Tod'],3594,0.407511
1,8436543,"['Collection of Beatrix Potter Stories, a']",308,0.373218
2,18940497,"['Adventures of Huckleberry Finn, Chapters 21 ...",2083,0.34305
3,22891139,['The Tale of Mrs. Tittlemouse'],3296,0.306355
4,345183,['Great Big Treasury of Beatrix Potter'],303,0.306013
5,1146398,['Tom Sawyer Abroad'],48,0.287189
6,298648,['What Is Man? and Other Essays'],33,0.269436
7,556967,['Chamber Music'],1254,0.266516
8,11685279,['A Memoir of Jane Austen'],3370,0.251307
9,16050933,"[""The Adventures of Huckleberry Finn Tom Sawye...",4854,0.244607
