In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import urllib.request # Download images
from PIL import *   # Read image
from langdetect import detect # Books only in english

import git # To clone and download the csv files

from bs4 import BeautifulSoup # For web scraping
import requests # For web scraping
import json # For web scraping
import re # Regex

from time import time 
import shutil # To remove folders
import os
import warnings
warnings.filterwarnings('ignore')

from myfunk import *

%matplotlib inline

Using TensorFlow backend.


In [2]:
## ---- Variables
# Check Path

project_name = 'book_cover_recommender'

if not os.getcwd().endswith(project_name):
    try: 
        ind = os.getcwd().index(project_name)
        idx = len(os.getcwd()) - (ind+len(project_name))
        path = os.getcwd()[:-idx]
        print(path)
        os.chdir(path)

    except:
        print('{} not found in path'.format(project_name))
else: 
    path = os.getcwd()
    print(path)
    

/Users/iZbra1/Documents/K2DS/Projects/book_cover_recommender


In [3]:
## ----  Web scraping:  ~ 1 min

try: 
    # See if the 5 csv files are already in the external folder
    
    assert len([f for f in glob.glob(path+"/data/external/*.csv")]) == 5
    print('CSV files are already in place', pd.read_csv(path+"/data/external/books.csv").shape)

    
except:
    # Get the data from goodreads 10k
    # https://github.com/zygmuntz/goodbooks-10k.git
    # save the csv files in ~/data/external
    print('Getting files from the github repo...')
    start = time()
    download_data_from_goodbooks(path)
    print(time()-start,'seconds processed')

CSV files are already in place (10000, 23)


In [5]:
def preprocess_goodbooks(path):
    '''Preps and cleans the external csv files:
        - Selects the relevant columns
        - Remove books that dont have ISBN
        - Unify the language_code
        - Correct the missing language_code
        - Get the index with english titles only
        - Drop the language_code column
        - Assign a logical label name to the current feature
        - Checks the validity of data in feature: years
        - Casting to adequate types
        - Set authors and title to lower case and remove punctuation
        - Saves csv preprocessed files on the raw subfolder
    '''
    ## ---- books.csv
    # Load csv file from goodbooks-10k
    cols = ['book_id','isbn', 'authors', 'original_publication_year',
           'title', 'language_code', 'image_url', 'average_rating', 'ratings_count']
    df = pd.read_csv(path+'/data/external/books.csv', usecols=cols, index_col=0)
    print('Preprocessing {} books'.format(df.shape[0]))

    # Remove books that dont have ISBN
    df.dropna(subset=['isbn'], inplace=True)

    # Unify the language_code
    df.loc[df.language_code.str.startswith('en', na=False),'language_code'] = 'en'
    print('Found {} books officially in english.\nRunning the english detector for the rest'.\
    format(df.loc[df.language_code=='en'].shape[0]))

    # Correct the missing language_code:
    df_lang = df.loc[df.language_code.isnull()].title.apply(lambda x: detect(x))
    df.loc[df_lang.index,'language_code'] = df_lang
    print('Detector found {} more books in english'.format((df_lang=='en').sum()))

    # Get the index with english titles only
    df = df.loc[df.language_code=='en']

    # Drop the language_code column
    df.drop('language_code', axis=1, inplace=True)

    # Rename column original_publication_year to year
    df.rename(columns={'original_publication_year': 'year'}, inplace=True)

    # Set invalid years to 0
    df.loc[df.year<0,'year'] = 0

    # Set Nan's to 0
    df['year'] = df['year'].fillna(0)

    # Convert to int
    df.year = df.year.astype(int)


    # Set authors and title to lower case and remove punctuation
    punctuation_pattern = r"[^\w\s]"
    df['authors'] = df.authors.apply(lambda x: re.sub(punctuation_pattern, '', unidecode.unidecode(x).lower().strip()))
    df['title'] = df.title.apply(lambda x: re.sub(punctuation_pattern, '', unidecode.unidecode(x).lower().strip()))

    # Save file (will be saved as UTF-8 by default!)
    df.to_csv(path+'/data/raw/books.csv')
    print("Preprocessed", df.shape[0], 'books')

    # Get the book id's that remain
    books_available = df.index.values


## ---- book_tags.csv
    df = pd.read_csv(path+'/data/external/book_tags.csv')
    df.rename(columns={'goodreads_book_id':'book_id'}, inplace=True)
    df = df.loc[df.book_id.isin(books_available)]

    # Save file
    df.to_csv(path+'/data/raw/book_tags.csv')
    print("Preprocessed", df.shape[0], 'book_tags')

## --- tags.csv
    copyfile(path+'/data/external/tags.csv', path+'/data/raw/tags.csv')

## --- ratings.csv
    df = pd.read_csv(path + '/data/external/ratings.csv')
    df = df.loc[df.book_id.isin(books_available)]
    # Save file
    df.to_csv(path + '/data/raw/ratings.csv')

## --- to_read.csv
    df = pd.read_csv(path + '/data/external/to_read.csv')
    df = df.loc[df.book_id.isin(books_available)]

    # Save file
    df.to_csv(path+'/data/raw/to_read.csv')
    

In [6]:
## --- Preprocessing: ~ 20 s
# Preprocess the csv files from the ~data/external folder 
# and save them in ~data/raw folder

start = time()
preprocess_goodbooks(path)
print(time()-start,'seconds processed')

Preprocessing 10000 books
Found 8142 books officially in english.
Running the english detector for the rest
Detector found 870 more books in english
Preprocessed 9012 books
Preprocessed 73000 book_tags
56.8722608089447 seconds processed


In [None]:
## ----- Upload the csv files to the sql instance

# df = pd.read_csv(path+'data/interim/books.csv', index_col=0)
# insert_db(df, 'books', engine)
# print('Uploaded: books')
# df2 = pd.read_csv(path+'data/external/ratings.csv', index_col=0)
# insert_db(df2, 'ratings', engine)
# print('Uploaded: ratings')
# df3 = pd.read_csv(path+'data/external/to_read.csv', index_col=0)
# insert_db(df3, 'to_read', engine)
# print('Uploaded: to_read')
# df4 = pd.read_csv(path+'data/external/tags.csv', index_col=0)
# insert_db(df4, 'tags', engine)
# print('Uploaded: tags')
# df5 = pd.read_csv(path+'data/external/book_tags.csv', index_col=0)
# insert_db(df5, 'book_tags', engine)
# print('Uploaded: book tags')
# print('insert_db done')

In [7]:
df = pd.read_csv(path+'/data/raw/books.csv', index_col=0)
print(df.shape)
df.tail(2)

(9012, 7)


Unnamed: 0_level_0,isbn,authors,year,title,average_rating,ratings_count,image_url
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9999,61711527,peggy orenstein,2011,cinderella ate my daughter dispatches from the...,3.65,11279,https://images.gr-assets.com/books/1279214118m...
10000,375700455,john keegan,1998,the first world war,4.0,9162,https://images.gr-assets.com/books/1403194704m...


### Check book covers or download them

In [17]:
punctuation_pattern = r"[^\w\s]"

# Function that formats image filenames into the standard format
format_cover_file = lambda x: re.sub(punctuation_pattern, '', unidecode.unidecode(x).lower().strip())

# Function that returns True if the image file exists. 
sync = lambda x: os.path.isfile(os.getcwd()+'/'+x.title+" by "+x.authors+".jpg")

# Download Book Covers from image_url
def get_cover(row):
    filename = row.title+' by '+row.authors+'.jpg'
    try: 
        urllib.request.urlretrieve(row.image_url,filename)
    except: 
        print('image url not found: ', row.image_url)
        filename = 'Img not found'
    return filename

def format_image_filenames(cover_titles):
    for file in cover_titles:
    # Check if file exists
        if os.path.isfile(file+".jpg"):
#             print('Renaming..',file+".jpg")
            os.rename(file+".jpg", format_cover_file(file)+".jpg")
        else:
            print(file+".jpg doesnt exist/n")

def check_cover(x, df, pth, path):
    try: 
        os.chdir(pth)
        # Check if all covers have a reference in the dataframe
        cover_titles = [f[:-4] for f in glob.glob("*.jpg")]  
        number_of_covers = len(cover_titles)
        print('{} covers found'.format(number_of_covers))
        if number_of_covers == 0:
            return 0
        number_covers_reference = df.shape[0]
        # Apply format to cover files:  
        print('Checking filename Format')
        format_image_filenames(cover_titles)
    
    except Exception as e: 
        print('No covers found in {} drive:\n{}'.format(x,e))
        return 0 #'none'

    if number_of_covers > 0 and not number_of_covers == number_covers_reference:
        print('There are {} out of {} book covers that couldnt be downloaded'.\
              format(number_covers_reference-number_of_covers, number_covers_reference))
        # Remove from the dataset
        df['cover_exists'] = df.apply(sync, axis=1)
        print('Removing {} instances from the dataset'.format(number_covers_reference-number_of_covers))
        df = df[df.cover_exists]
        # Column holding filename
        df['cover'] = df.apply(lambda x: x.title+' by '+x.authors+".jpg", axis=1)

        # Sanity check
        # Remove cover files that dont have a reference in the dataframe. 
        img_files = [f[:-4] for f in glob.glob("*.jpg")] 
        df['filename'] = df.cover.apply(lambda x: x[:-4])
        if df.shape[0] < len(img_files):

            book_list = df.filename.values
            for img in img_files:
                if img not in book_list:
                    os.remove(img+'.jpg')
        img_files = [f[:-4] for f in glob.glob("*.jpg")]       
        if len(img_files) == df.shape[0]:
            print("Database consolidated")
        else: 
            print("Something went while in consolidating database: there are ", df.shape[0], " books in the dataframe and ", len(img_files), " cover images.")
        # Move covers folder to raw
        print("Copying images to data/raw/covers")
        src = os.getcwd()
        dst = path+"/data/raw/covers"

        try:
            #if path already exists, remove it before copying with copytree()
            if os.path.exists(dst):
                shutil.rmtree(dst)
            else: 
                os.mkdir(dst)
                
            shutil.copytree(src, dst)
            print('Image files copied!')
        except OSError as e:
            # If the error was caused because the source wasn't a directory
            if e.errno == errno.ENOTDIR:
                shutil.copy(source_dir_prompt, destination_dir_prompt)
            else:
                print('Directory not copied. Error: %s' % e)
        
        return df
    else: 
        print("Dataset is complete!",df.shape, number_of_covers)
        return df
        

In [18]:
print("Select Drive: [external/local/none]")
x = input()
if x == 'external':
    x = check_cover(x, df, '/Volumes/LEEGARE/Capstone', path)

    
elif x == 'local':
    # Local drive
    try: 
        os.mkdir(path+'/data/external/covers')
    except: 
        pass
    x = check_cover(x, df, path+"/data/external/covers/Capstone", path)


if not isinstance(x,pd.core.frame.DataFrame):
    print('Downloading book covers... into {} ETA: 68 min for 9011 books | 45s for 100 books'.format(path))
# if os.getcwd()
    start = time() # 68 min for 9011 books | 45s for 100 books
    df['cover'] = df.apply(lambda row: get_cover(row), axis=1)
    print(time()-start, 's downloading', df.shape[0], 'book covers')
    print("Covers saved in ", path+"/data/external/covers")
else:
    df = x.copy()

Select Drive: [external/local/none]
8988 covers found
Checking filename Format
There are 24 out of 9012 book covers that couldnt be downloaded
Removing 24 instances from the dataset
Database consolidated
Copying images to data/raw/covers
Path does exist!, removing
Image files copied!


In [21]:
os.chdir(path)

In [22]:
av_books = df.reset_index().book_id.unique()
assert len(av_books) == len([f[:-4] for f in glob.glob(path+"/data/raw/covers/*.jpg")])

In [25]:
# Save
df.drop(["cover_exists","filename"], axis=1, inplace=True)
df.to_csv(path+'/data/raw/books.csv')
print("Books saved")

Books saved


In [28]:
df.shape

(8988, 8)

In [26]:
# Get the RATINGS of the sample books: 
rat = pd.read_csv(path+"/data/raw/ratings.csv", index_col=0)
rat = rat.loc[rat.book_id.isin(av_books)]
av_users = rat.reset_index().user_id.unique()
rat.to_csv(path+'/data/raw/ratings.csv')
print("Ratings saved")

Ratings saved


In [13]:
# Get the USERS in RATINGS that want to read the BOOKS available
df_read = pd.read_csv(path+"/data/raw/to_read.csv", index_col=0)
df_read = df_read.loc[(df_read.user_id.isin(av_users))&(df_read.book_id.isin(av_books))]
df_read.to_csv(path+'/data/raw/to_read.csv')
print('Books to read saved!',df_read.shape,sep='\n')
df_read.head()

Books to read saved!
(861087, 2)


Unnamed: 0,user_id,book_id
0,9,8
1,15,398
2,15,275
3,37,7173
4,34,380
