In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("books.csv")
df

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,genre
0,Famous Italian Opera Arias: A Dual-Language Book,Original Italian texts with English translatio...,['Ellen H. Bleiler'],http://books.google.com/books/content?id=h__kD...,http://books.google.nl/books?id=h__kDtEueOAC&p...,Courier Corporation,2012-12-11,http://books.google.nl/books?id=h__kDtEueOAC&d...,music
1,Coaching a Championship High School Track & Fi...,Guthrie presents coaches with a blueprint for ...,['Mark Guthrie'],http://books.google.com/books/content?id=CUKDR...,http://books.google.com/books?id=CUKDRH3_LNUC&...,Human Kinetics,2003,http://books.google.com/books?id=CUKDRH3_LNUC&...,sports & recreation
2,The Best Man To Die,Chief Inspector Wexford investigates the murde...,['Ruth Rendell'],http://books.google.com/books/content?id=0PMKw...,http://books.google.com/books?id=0PMKwQEACAAJ&...,Fawcett Books,1987-05,http://books.google.com/books?id=0PMKwQEACAAJ&...,fiction
3,BEYOND THE CORAL SEA. Travels in the old Empir...,A romantic and adventurous journey to the hidd...,['Michael Moran'],http://books.google.com/books/content?id=Yhtd0...,http://books.google.com/books?id=Yhtd0WXCJKQC&...,HarperCollins UK,2012-06-28,https://play.google.com/store/books/details?id...,travel
4,Full Scoop (Full Series),"Beaumont, South Carolina, pediatrician Maggie ...","['Janet Evanovich', 'Charlotte Hughes']",http://books.google.com/books/content?id=6onh6...,http://books.google.com/books?id=6onh6Q4Lif0C&...,Macmillan,2010-04-27,http://books.google.com/books?id=6onh6Q4Lif0C&...,fiction
...,...,...,...,...,...,...,...,...,...
16923,FFH - Voice From Home,11 tracks from FFH's 2005 release featuring so...,['FFH (CRT)'],http://books.google.com/books/content?id=eRAkA...,http://books.google.com/books?id=eRAkAAAACAAJ&...,Brentwood-Benson Music Pub,2006-01-01,http://books.google.com/books?id=eRAkAAAACAAJ&...,music
16924,Lesser Evil (Star Trek Deep Space Nine: Missio...,"After a shocking death, space station Deep Spa...",['Robert Simpson'],http://books.google.com/books/content?id=LpAzA...,http://books.google.com/books?id=LpAzAAAACAAJ&...,Pocket Books/Star Trek,2002,http://books.google.com/books?id=LpAzAAAACAAJ&...,fiction
16925,The Prodigal Wife (To Have & to Hold),"""Deservedly compared to her countrywomen, Binc...",['Marcia Willett'],http://books.google.com/books/content?id=oHBzv...,http://books.google.com/books?id=oHBzvuk9P5YC&...,Macmillan,2010-01-05,https://play.google.com/store/books/details?id...,fiction
16926,Soulmates Dissipate,In this sensual and bittersweet story of true ...,['Mary B. Morrison'],http://books.google.com/books/content?id=6GWOv...,http://books.google.com/books?id=6GWOvgEACAAJ&...,Booga Bear Poetry Group,2000,http://books.google.com/books?id=6GWOvgEACAAJ&...,fiction


### Data Exploration

In [3]:
for i in df.loc[0]:
    print(i)

Famous Italian Opera Arias: A Dual-Language Book
Original Italian texts with English translations of 145 arias from 50 operas, including Rigoletto, The Marriage of Figaro, Lucia di Lammermoor, Madama Butterfly, La Bohème, and more. Introduction.
['Ellen H. Bleiler']
http://books.google.com/books/content?id=h__kDtEueOAC&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api
http://books.google.nl/books?id=h__kDtEueOAC&printsec=frontcover&dq=Famous+Italian+Opera+Arias:+A+Dual-Language+Book&hl=&cd=1&source=gbs_api
Courier Corporation
2012-12-11
http://books.google.nl/books?id=h__kDtEueOAC&dq=Famous+Italian+Opera+Arias:+A+Dual-Language+Book&hl=&source=gbs_api
music


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16928 entries, 0 to 16927
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          16928 non-null  object
 1   description    15899 non-null  object
 2   authors        16755 non-null  object
 3   image          16244 non-null  object
 4   previewLink    16928 non-null  object
 5   publisher      15453 non-null  object
 6   publishedDate  16865 non-null  object
 7   infoLink       16928 non-null  object
 8   genre          16928 non-null  object
dtypes: object(9)
memory usage: 1.2+ MB


In [5]:
df.isnull().sum()

Title               0
description      1029
authors           173
image             684
previewLink         0
publisher        1475
publishedDate      63
infoLink            0
genre               0
dtype: int64

In [6]:
test = np.array([i for i in df.authors])
test = [i[2:len(i)-2] for i in test]

In [7]:
df.authors = test
df.authors

0                           Ellen H. Bleiler
1                               Mark Guthrie
2                               Ruth Rendell
3                              Michael Moran
4        Janet Evanovich', 'Charlotte Hughes
                        ...                 
16923                              FFH (CRT)
16924                         Robert Simpson
16925                         Marcia Willett
16926                       Mary B. Morrison
16927                           George Mikes
Name: authors, Length: 16928, dtype: object

### Data Imputation

#### The null values of Description can be replaced with "No description available!"
#### The null values of Authors can be replaced with "Anonymous"
#### The null values of Images can be replaced with a Default image in database.
#### The null values of publisher can be replaced with most frequent publisher in the column.

In [8]:
type(df.image[0])

str

In [9]:
df.publisher.value_counts().keys()[0]

'Penguin'

In [10]:
df = df.fillna({
    "description" : "No Description available!",
    "authors" : "Anonymous",
    "image" : "https://tse3.mm.bing.net/th?id=OIP.P-nIodv7WzkQ4wYYPsXWaQAAAA&pid=Api&P=0",
    "publisher" : df.publisher.value_counts().keys()[0],
})

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16928 entries, 0 to 16927
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          16928 non-null  object
 1   description    16928 non-null  object
 2   authors        16928 non-null  object
 3   image          16928 non-null  object
 4   previewLink    16928 non-null  object
 5   publisher      16928 non-null  object
 6   publishedDate  16865 non-null  object
 7   infoLink       16928 non-null  object
 8   genre          16928 non-null  object
dtypes: object(9)
memory usage: 1.2+ MB


In [12]:
df.publishedDate.describe()

count     16865
unique     4304
top        2000
freq        336
Name: publishedDate, dtype: object

#### There are 63 null values for date of publishing, we would simply drop these rows 

In [13]:
df.dropna(inplace=True)

In [14]:
df.isnull().sum()

Title            0
description      0
authors          0
image            0
previewLink      0
publisher        0
publishedDate    0
infoLink         0
genre            0
dtype: int64

In [15]:
df.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,genre
0,Famous Italian Opera Arias: A Dual-Language Book,Original Italian texts with English translatio...,Ellen H. Bleiler,http://books.google.com/books/content?id=h__kD...,http://books.google.nl/books?id=h__kDtEueOAC&p...,Courier Corporation,2012-12-11,http://books.google.nl/books?id=h__kDtEueOAC&d...,music
1,Coaching a Championship High School Track & Fi...,Guthrie presents coaches with a blueprint for ...,Mark Guthrie,http://books.google.com/books/content?id=CUKDR...,http://books.google.com/books?id=CUKDRH3_LNUC&...,Human Kinetics,2003,http://books.google.com/books?id=CUKDRH3_LNUC&...,sports & recreation
2,The Best Man To Die,Chief Inspector Wexford investigates the murde...,Ruth Rendell,http://books.google.com/books/content?id=0PMKw...,http://books.google.com/books?id=0PMKwQEACAAJ&...,Fawcett Books,1987-05,http://books.google.com/books?id=0PMKwQEACAAJ&...,fiction
3,BEYOND THE CORAL SEA. Travels in the old Empir...,A romantic and adventurous journey to the hidd...,Michael Moran,http://books.google.com/books/content?id=Yhtd0...,http://books.google.com/books?id=Yhtd0WXCJKQC&...,HarperCollins UK,2012-06-28,https://play.google.com/store/books/details?id...,travel
4,Full Scoop (Full Series),"Beaumont, South Carolina, pediatrician Maggie ...","Janet Evanovich', 'Charlotte Hughes",http://books.google.com/books/content?id=6onh6...,http://books.google.com/books?id=6onh6Q4Lif0C&...,Macmillan,2010-04-27,http://books.google.com/books?id=6onh6Q4Lif0C&...,fiction


In [16]:
df.to_csv("dataset.csv", index=False)