# **Goodreads Neighborhood**


## **Imports**


In [1]:
import os
import sys
from warnings import filterwarnings
import matplotlib.pyplot as plt
from pathlib import Path
from dotenv import load_dotenv

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# The autotime extension prints the "time: ..." line seen after every cell.
%load_ext autotime

# Load variables from a local .env file into the environment (python-dotenv);
# CLOUD_STORAGE_BUCKET is read from the environment in the next cell.
load_dotenv()
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings from pandas/re -- consider narrowing the filter.
filterwarnings("ignore")

# Put the parent of the current working directory on the import path so
# modules that live one level above this notebook can be imported.
parent_dir = os.path.dirname(os.getcwd())
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

time: 9.42 ms (started: 2022-12-25 16:14:04 -05:00)


In [2]:
import pandas as pd
import ast
import networkx as nx

import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

# Bucket name comes from the environment (None when the variable is unset).
CLOUD_STORAGE_BUCKET = os.environ.get("CLOUD_STORAGE_BUCKET")

# Working directory of the kernel and, as a string, the directory above it.
PATH = os.getcwd()
PROJECT = str(Path(PATH).parents[0])

time: 3.12 s (started: 2022-12-25 16:14:04 -05:00)


## **Read**


In [3]:
# Install lxml into the interpreter that runs this kernel (sys.executable).
# NOTE(review): presumably required as the HTML parser behind pd.read_html in
# the cells below -- confirm. The install is unpinned and this cell's output is
# from an earlier session; prefer declaring lxml in the project requirements.
!{sys.executable} -m pip install lxml

You should consider upgrading via the '/media/starscream/wheeljack/projects/ego_networks/.venv/bin/python3 -m pip install --upgrade pip' command.[0m
time: 1.49 s (started: 2022-12-24 16:42:01 -05:00)


In [3]:
# Scrape the "read" shelf of the Goodreads profile and keep title, author and
# the date each book was read, newest first.
base_site = "https://www.goodreads.com/review/list/126744914?per_page=infinite&shelf=read"
# A timeout keeps the cell from hanging forever; raise_for_status surfaces an
# HTTP error directly instead of letting read_html fail on an error page.
r = requests.get(base_site, timeout=30)
r.raise_for_status()
df_list = pd.read_html(r.text)

# The shelf is the last table on the page. Each cell is prefixed with its
# column label ("title ", "author ", "date read "), and author names may carry
# a trailing " *". Patterns are raw strings so "\*" is a valid regex escape,
# and the vectorised .str accessor lets missing cells pass through as NaN.
cols = ["title", "author", "date"]
df_read = (
    df_list[-1]
    .assign(
        title=lambda d: d["title"].str.replace(r"^(title )", "", regex=True).str.lower(),
        author=lambda d: d["author"].str.replace(r"(^author )|( \*$)", "", regex=True).str.lower(),
        date=lambda d: pd.to_datetime(d["read"].str.replace(r"^(date read )", "", regex=True)),
    )
    .loc[:, cols]
    .sort_values(by="date", ascending=False)
)
df_read.head(10)

Unnamed: 0,title,author,date
3,"surface detail (culture, #9)","banks, iain m.",2022-11-13
6,statistical rethinking: a bayesian course with...,"mcelreath, richard",2022-09-24
2,novacene: the coming age of hyperintelligence,"lovelock, james e.",2022-09-10
0,introducing game theory: a graphic guide,"pastine, ivan",2022-09-03
1,the spy and the traitor: the greatest espionag...,"macintyre, ben",2022-08-28
25,the irrational ape: why flawed logic puts us a...,"grimes, david robert",2022-08-14
15,atomic habits: an easy & proven way to build g...,"clear, james",2022-07-24
4,the manager's path: a guide for tech leaders n...,"fournier, camille",2022-06-25
22,emotions revealed: recognizing faces and feeli...,"ekman, paul",2022-06-19
11,the unicorn project,"kim, gene",2022-05-05


time: 1.78 s (started: 2022-12-25 16:14:10 -05:00)


In [4]:
# Scrape the "currently-reading" shelf of the Goodreads profile and keep title,
# author and the date each book was added to the shelf, newest first.
base_site = "https://www.goodreads.com/review/list/126744914?shelf=currently-reading"
# A timeout keeps the cell from hanging forever; raise_for_status surfaces an
# HTTP error directly instead of letting read_html fail on an error page.
r = requests.get(base_site, timeout=30)
r.raise_for_status()
df_list = pd.read_html(r.text)

# The shelf is the last table on the page. Each cell is prefixed with its
# column label ("title ", "author ", "date added "), and author names may carry
# a trailing " *". Patterns are raw strings so "\*" is a valid regex escape,
# and the vectorised .str accessor lets missing cells pass through as NaN.
cols = ["title", "author", "date"]
df_current = (
    df_list[-1]
    .assign(
        title=lambda d: d["title"].str.replace(r"^(title )", "", regex=True),
        author=lambda d: d["author"].str.replace(r"(^author )|( \*$)", "", regex=True),
        date=lambda d: pd.to_datetime(d["added"].str.replace(r"^(date added )", "", regex=True)),
    )
    .loc[:, cols]
    .sort_values(by="date", ascending=False)
)
df_current.head(10)

Unnamed: 0,title,author,date
0,Dive Into Design Patterns,"Shvets, Alexander",2022-09-28
1,Deep Reinforcement Learning with Python: Maste...,"Ravichandiran, Sudharsan",2022-09-24
2,Information Retrieval: Implementing and Evalua...,"Büttcher, Stefan",2022-07-27
3,"Guns, Germs, and Steel: The Fates of Human Soc...","Diamond, Jared",2022-07-10


time: 881 ms (started: 2022-12-25 16:14:21 -05:00)


In [25]:
import requests
import json

# Prototype of the Google Books lookup for one known book. The query uses the
# API's `inauthor:` / `intitle:` keywords joined with "+".
uri = "https://www.googleapis.com/books/v1/volumes?q=inauthor:gene,kim+intitle:the,unicorn,project"
r = requests.get(uri, timeout=30)
r.raise_for_status()

# NOTE: the result is deliberately kept in `_` because the next cell reads
# `_.description`. The .copy() makes the column assignments below write to an
# independent frame rather than to a slice of the raw response frame.
cols = ["kind", "volumeInfo"]
_ = pd.DataFrame(json.loads(r.text).get("items"))[cols].copy()
_["title"] = _["volumeInfo"].apply(lambda x: x.get("title", "none").lower().strip())
_["rating"] = _["volumeInfo"].apply(lambda x: int(x.get("ratingsCount", 0)))
_["description"] = _["volumeInfo"].apply(lambda x: x.get("description", "none").lower().strip())

# Keep only volumes that have at least one rating, most-rated first.
_ = _[_.rating > 0].sort_values(by="rating", ascending=False)
cols = ["title", "description"]
_ = _[cols]
_.head()

Unnamed: 0,title,description
2,the phoenix project,"***over a half-million sold! the sequel, the u..."
0,the unicorn project,the phoenix project wowed over a half-million ...
1,the unicorn project,the phoenix project wowed over a half-million ...


time: 1.09 s (started: 2022-12-25 16:25:13 -05:00)


In [46]:
# Merge the distinct descriptions held in `_` (the lookup result from the
# previous cell) into one string, then replace every character that is not a
# letter, digit, space, newline or dot with a space.
unique_descriptions = _["description"].unique()
r_text = re.sub(r"[^a-zA-Z0-9 \n\.]", " ", " ".join(unique_descriptions))

time: 3.63 ms (started: 2022-12-25 16:33:55 -05:00)


In [58]:
def get_book_description(title, author):
    """Return a cleaned description of a book from the Google Books API.

    The API is searched with ``inauthor:<author> intitle:<title>`` (both
    lower-cased). Among the returned volumes that have at least one rating,
    the one with the highest ``ratingsCount`` is used.

    Args:
        title: Book title (str).
        author: Author name (str).

    Returns:
        The chosen volume's description, lower-cased and stripped, with every
        character other than letters, digits, spaces, newlines and dots
        replaced by a space. Returns ``""`` when the request fails, the
        response is not valid JSON, no rated volume is found, or the chosen
        volume has no description.
    """
    # Let requests URL-encode the query. Titles often contain "#" or "&"
    # (e.g. "surface detail (culture, #9)"), which in a hand-built URL start a
    # fragment or a new parameter and silently cut the search short.
    query = f"inauthor:{author.lower()} intitle:{title.lower()}"
    try:
        r = requests.get(
            "https://www.googleapis.com/books/v1/volumes",
            params={"q": query},
            timeout=10,  # never hang a whole DataFrame.apply on one request
        )
        r.raise_for_status()
        payload = json.loads(r.text)
    except (requests.RequestException, ValueError):
        # Best-effort lookup: a network/HTTP/JSON failure means "no description".
        return ""

    items = payload.get("items") if isinstance(payload, dict) else None

    # Pick the volume with the most ratings; unrated volumes are ignored and
    # the first volume wins a tie.
    best_info, best_count = None, 0
    for item in items or []:
        info = item.get("volumeInfo") if isinstance(item, dict) else None
        if not isinstance(info, dict):
            continue
        try:
            count = int(info.get("ratingsCount") or 0)
        except (TypeError, ValueError):
            count = 0
        if count > best_count:
            best_info, best_count = info, count

    if best_info is None:
        return ""

    # A missing description yields "" rather than a placeholder word, so no
    # fake text ends up in downstream columns.
    desc = str(best_info.get("description") or "").lower().strip()
    return re.sub(r"[^a-zA-Z0-9 \n\.]", " ", desc)

time: 2.2 ms (started: 2022-12-25 16:48:26 -05:00)


In [62]:
# Fetch a description for every book on the currently-reading shelf (one
# get_book_description call per row), then build a combined "title + description"
# text column.
df_current["desc"] = df_current[["title", "author"]].apply(
    lambda row: get_book_description(row["title"], row["author"]),
    axis=1,
)
df_current["book"] = df_current["title"] + " " + df_current["desc"]
df_current.head()

Unnamed: 0,title,author,date,desc,book
0,Dive Into Design Patterns,"Shvets, Alexander",2022-09-28,,Dive Into Design Patterns
1,Deep Reinforcement Learning with Python: Maste...,"Ravichandiran, Sudharsan",2022-09-24,an example rich guide for beginners to start t...,Deep Reinforcement Learning with Python: Maste...
2,Information Retrieval: Implementing and Evalua...,"Büttcher, Stefan",2022-07-27,,Information Retrieval: Implementing and Evalua...
3,"Guns, Germs, and Steel: The Fates of Human Soc...","Diamond, Jared",2022-07-10,patterned planet special editions of groundbr...,"Guns, Germs, and Steel: The Fates of Human Soc..."


time: 1.63 s (started: 2022-12-25 16:54:34 -05:00)


In [60]:
# Fetch a description for every book on the read shelf; this makes one
# get_book_description call per row, so the cell takes a while to run.
df_read["desc"] = df_read[["title", "author"]].apply(
    lambda row: get_book_description(row["title"], row["author"]),
    axis=1,
)
df_read.head(10)

Unnamed: 0,title,author,date,desc
3,"surface detail (culture, #9)","banks, iain m.",2022-11-13,it begins in the realm of the real where matt...
6,statistical rethinking: a bayesian course with...,"mcelreath, richard",2022-09-24,
2,novacene: the coming age of hyperintelligence,"lovelock, james e.",2022-09-10,
0,introducing game theory: a graphic guide,"pastine, ivan",2022-09-03,
1,the spy and the traitor: the greatest espionag...,"macintyre, ben",2022-08-28,new york times bestseller now a netflix film...
25,the irrational ape: why flawed logic puts us a...,"grimes, david robert",2022-08-14,
15,atomic habits: an easy & proven way to build g...,"clear, james",2022-07-24,a leading expert on habit formation reveals pr...
4,the manager's path: a guide for tech leaders n...,"fournier, camille",2022-06-25,managing people is difficult wherever you work...
22,emotions revealed: recognizing faces and feeli...,"ekman, paul",2022-06-19,an expert on nonverbal communication traces th...
11,the unicorn project,"kim, gene",2022-05-05,over a half million sold the sequel the u...


time: 15.1 s (started: 2022-12-25 16:48:58 -05:00)


In [61]:
# df_read is sorted newest-first, so the tail shows the ten oldest entries and
# whether a description was found for them.
df_read.tail(10)

Unnamed: 0,title,author,date,desc
29,graph representation learning,"hamilton, william l.",2021-10-30,
19,recommender systems: the textbook,"aggarwal, charu c.",2019-12-31,
21,effective python: 59 specific ways to write be...,"slatkin, brett",2019-12-31,
17,applied predictive modeling,"kuhn, max",2018-12-31,applied predictive modeling covers the overall...
7,the 48 laws of power,"greene, robert",2017-12-31,cunning instructive and amoral this controv...
13,deep learning with r,"chollet, francois",2017-12-31,
18,applied predictive analytics: principles and t...,"abbott, dean",2016-12-31,
20,a place called freedom,"follett, ken",2014-12-31,the worldwide phenomenon from the bestselling ...
16,power and love: a theory and practice of socia...,"kahane, adam",2011-12-31,
26,deception point,"brown, dan",2011-06-12,a boxed set containing two best selling thrill...


time: 29.2 ms (started: 2022-12-25 16:49:39 -05:00)
