# Task: Pick three of your favorite books on one of your favorite subjects. At least one of the books should have more than one author. For each book, include the title, authors, and two or three other attributes that you find interesting. Take the information that you’ve selected about these three books, and separately create three files which store the book’s information in HTML (using an html table), XML, and JSON formats (e.g. “books.html”, “books.xml”, and “books.json”).

In [1]:
import json
import xml.etree.ElementTree as ET
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## HTML

In [21]:
# Download books.html twice: once through the GitHub "blob" page and once
# through the raw-file URL, then parse each response with BeautifulSoup.
html_url = 'https://github.com/javernw/JWCUNYAssignments/blob/master/books.html'
html_url_raw = 'https://raw.githubusercontent.com/javernw/JWCUNYAssignments/master/books.html'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as book data.
html_file = requests.get(html_url, timeout=30)
html_file.raise_for_status()
html_file_raw = requests.get(html_url_raw, timeout=30)
html_file_raw.raise_for_status()

books_raw = BeautifulSoup(html_file_raw.content, 'html.parser')
books = BeautifulSoup(html_file.content, 'html.parser')

# Show the parsed raw file so the table markup can be inspected.
print(books_raw)


<!DOCTYPE html>

<html>
<head>
<title>Subject Books</title>
</head>
<style>
table {
  border-collapse: collapse;
}
table, th, td {
   border: 1px solid black;
   text-align: left;
}
td {
  height: 50px;
}
</style>
<body>
<table>
<tr>
<th width="30%">Title</th>
<th width="30%">Author(s)</th>
<th width="10%">Edition</th>
<th width="15%">Publisher</th>
<th width="15%">Date Published</th>
</tr>
<tr>
<td>OpenIntro Statistics</td>
<td>David M Diez, Christopher D Barr, Mine Çetinkaya-Rundel</td>
<td>Third</td>
<td>OpenIntro, Inc.</td>
<td>07/02/2015<td>
</td></td></tr>
<tr>
<td>R for Everyone: Advanced Analytics and Graphics</td>
<td>Jared P. Lander</td>
<td>Second</td>
<td> Addison-Wesley Professional</td>
<td>06/18/2017</td>
</tr>
<tr>
<td>Automated Data Collection with R A Practical Guide to Web Scraping and Text Mining</td>
<td>Simon Munzert, Christian Rubba, Peter Meißner, Dominic Nyhuis</td>
<td>First</td>
<td>Wiley</td>
<td>01/20/2015</td>
</tr>
</table></body>
</html>


In [22]:
# Read the first HTML table found in the text of the parsed GitHub page into a
# DataFrame. The string is wrapped in StringIO because passing literal HTML
# directly to pd.read_html is deprecated in recent pandas releases.
books_html = pd.read_html(StringIO(books.text))[0]

# 'Unnamed: 5' is an extra, header-less column in the parsed table --
# presumably caused by the stray <td> in the first data row of books.html
# (TODO confirm). errors='ignore' keeps the cell working if that column is
# not produced, e.g. once the source markup is corrected.
books_html = books_html.drop(columns='Unnamed: 5', errors='ignore')
books_html

Unnamed: 0,Title,Author(s),Edition,Publisher,Date Published
0,OpenIntro Statistics,"David M Diez, Christopher D Barr, Mine Çetinka...",Third,"OpenIntro, Inc.",07/02/2015
1,R for Everyone: Advanced Analytics and Graphics,Jared P. Lander,Second,Addison-Wesley Professional,06/18/2017
2,Automated Data Collection with R A Practical G...,"Simon Munzert, Christian Rubba, Peter Meißner,...",First,Wiley,01/20/2015


## XML

In [23]:
# ElementTree reminder:
#   - use ET.parse when reading a file from disk
#   - use ET.fromstring when reading from a string

xml_url = 'https://raw.githubusercontent.com/javernw/JWCUNYAssignments/master/books.xml'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as book data.
xml_file = requests.get(xml_url, timeout=30)
xml_file.raise_for_status()

# This soup is only used to display the document; with the 'lxml' parser the
# printed result is wrapped in <html><body> tags.
xml_data = BeautifulSoup(xml_file.content, 'lxml')
print(xml_data)

<?xml version="1.0" encoding="UTF-8"?><html><body><fav_books>
<books>
<title>OpenIntro Statistics</title>
<authors>David M Diez, Christopher D Barr, Mine Çetinkaya-Rundel</authors>
<edition>Third</edition>
<publisher>OpenIntro, Inc.</publisher>
<date_published>07/02/2015</date_published>
</books>
<books>
<title>R for Everyone: Advanced Analytics and Graphics</title>
<authors>Jared P. Lander</authors>
<edition>Second</edition>
<publisher>Addison-Wesley Professional</publisher>
<date_published>06/18/2017</date_published>
</books>
<books>
<title>Automated Data Collection with R A Practical Guide to Web Scraping and Text Mining</title>
<authors>Simon Munzert, Christian Rubba, Peter Meißner, Dominic Nyhuis</authors>
<edition>First</edition>
<publisher>Wiley</publisher>
<date_published>01/20/2015</date_published>
</books>
</fav_books>
</body></html>


In [24]:
def _child_text(node, tag):
    """Return the text of the first ``tag`` child of ``node``, or None if there is no such child."""
    child = node.find(tag)
    return None if child is None else child.text


# Parse the downloaded XML bytes; the root element's <books> children are the
# individual book records.
root = ET.fromstring(xml_file.content)
book_nodes = root.findall('books')

# Collect every column book by book. Extracting per <books> element keeps the
# five lists the same length and aligned by row: a book that lacks one of the
# child tags contributes None for that field instead of shifting the values of
# later books into the wrong row.
title = [_child_text(book, 'title') for book in book_nodes]
authors = [_child_text(book, 'authors') for book in book_nodes]
edition = [_child_text(book, 'edition') for book in book_nodes]
pub = [_child_text(book, 'publisher') for book in book_nodes]
date_pub = [_child_text(book, 'date_published') for book in book_nodes]

# put together table and display content
books_xml = pd.DataFrame(
    list(zip(title, authors, edition, pub, date_pub)),
    columns=['Title', 'Author(s)', 'Edition', 'Publisher', 'Date Published'],
)
books_xml

Unnamed: 0,Title,Author(s),Edition,Publisher,Date Published
0,OpenIntro Statistics,"David M Diez, Christopher D Barr, Mine Çetinka...",Third,"OpenIntro, Inc.",07/02/2015
1,R for Everyone: Advanced Analytics and Graphics,Jared P. Lander,Second,Addison-Wesley Professional,06/18/2017
2,Automated Data Collection with R A Practical G...,"Simon Munzert, Christian Rubba, Peter Meißner,...",First,Wiley,01/20/2015


## JSON

In [25]:
# Download books.json from the raw GitHub URL and decode the response body.
json_url = 'https://raw.githubusercontent.com/javernw/JWCUNYAssignments/master/books.json'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error directly instead of letting it
# show up later as a confusing JSON decoding failure.
json_file = requests.get(json_url, timeout=30)
json_file.raise_for_status()

json_data = json_file.json()
json_data

{'favorite books': [{'title': 'OpenIntro Statistics',
   'authors': 'David M Diez, Christopher D Barr, Mine Cetinkaya-Rundel',
   'edition': 'Third',
   'publisher': 'OpenIntro, Inc.',
   'date_published': '07/02/2015'},
  {'title': 'R for Everyone: Advanced Analytics and Graphics',
   'authors': 'Jared P. Lander',
   'edition': 'Second',
   'publisher': 'Addison-Wesley Professional',
   'date_published': '06/18/2017'},
  {'title': 'Automated Data Collection with R A Practical Guide to Web Scraping and Text Mining',
   'authors': 'Simon Munzert, Christian Rubba, Peter Meibner, Dominic Nyhuis',
   'edition': 'First',
   'publisher': 'Wiley',
   'date_published': '01/20/2015'}]}

In [26]:
# Map each JSON field name to the header it gets in the final table; the
# mapping's order is also the column order of the result.
json_field_headers = {
    'title': 'Title',
    'authors': 'Author(s)',
    'edition': 'Edition',
    'publisher': 'Publisher',
    'date_published': 'Date Published',
}

# One record per entry under 'favorite books': select the fields in display
# order, then relabel them with the display headers.
books_json = (
    pd.DataFrame(json_data['favorite books'])
    .loc[:, list(json_field_headers)]
    .rename(columns=json_field_headers)
)
books_json

Unnamed: 0,Title,Author(s),Edition,Publisher,Date Published
0,OpenIntro Statistics,"David M Diez, Christopher D Barr, Mine Cetinka...",Third,"OpenIntro, Inc.",07/02/2015
1,R for Everyone: Advanced Analytics and Graphics,Jared P. Lander,Second,Addison-Wesley Professional,06/18/2017
2,Automated Data Collection with R A Practical G...,"Simon Munzert, Christian Rubba, Peter Meibner,...",First,Wiley,01/20/2015
