# 1-Webscraping

## Basics 

### Responses

In [None]:
URL = 'https://en.wikipedia.org/wiki/Questrom_School_of_Business'

In [None]:
resp = requests.get(URL)
resp

<Response [200]>

In [None]:
resp.status_code

200

In [None]:
resp.text[:500]
# resp.text holds the page's raw HTML as one long string; preview the first 500 characters

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Questrom School of Business - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"f08774ec-ae3c-4187-98cc-0be7'

### Parsing with BeautifulSoup

In [None]:
soup = BeautifulSoup(resp.text, 'html.parser')
type(soup)

bs4.BeautifulSoup

In [None]:
#title
title = soup.find('title')
title

<title>Questrom School of Business - Wikipedia</title>

In [None]:
type(title)

bs4.element.Tag

In [None]:
title.text

'Questrom School of Business - Wikipedia'

In [None]:
#body
soup.body.text[:500]

'\n\n\n\n\n\n\nQuestrom School of Business\n\nFrom Wikipedia, the free encyclopedia\n\n\n\nJump to navigation\nJump to search\nBoston UniversityQuestrom School of BusinessThe Rafik B. Hariri BuildingTypePrivateEstablished1913DeanSusan FournierAcademic staff255Undergraduates1,748Postgraduates881LocationBoston, Massachusetts, United StatesCampusUrbanWebsiteQuestrom Website\nThe Questrom School of Business (formerly, the Boston University School of Management) is the business school at Boston University in Boston, '

In [None]:
body = soup.body.text
body.strip().replace("\n", " ")[:500]

'Questrom School of Business  From Wikipedia, the free encyclopedia    Jump to navigation Jump to search Boston UniversityQuestrom School of BusinessThe Rafik B. Hariri BuildingTypePrivateEstablished1913DeanSusan FournierAcademic staff255Undergraduates1,748Postgraduates881LocationBoston, Massachusetts, United StatesCampusUrbanWebsiteQuestrom Website The Questrom School of Business (formerly, the Boston University School of Management) is the business school at Boston University in Boston, MA, USA'

In [None]:
#look for the paragraphs
pars = soup.find_all("p")

In [None]:
type(pars)

bs4.element.ResultSet

In [None]:
# plain text of every <p> element, in the order find_all returned them
paragraphs = [par.text for par in pars]

In [None]:
paragraphs[0]

'The Questrom School of Business (formerly, the Boston University School of Management) is the business school at Boston University in Boston, MA, USA. Founded in 1913 as the College of Business Administration, the school offers undergraduate and graduate programs.\n'

In [None]:
paragraphs[1]

'The BU Questrom School of Business offers a Bachelor of Science in Business Administration (BSBA), Master of Business Administration (MBA) degree (full- and part-time programs), a Master of Science (MS) in Mathematical Finance, a Master of Science in Management Studies (MSMS), executive education programs, and two Ph.D. programs. Both the undergraduate and graduate programs offer dual degree options with other schools and colleges at Boston University.\n'

## Scraping Links and URLs

In [None]:
links = soup.find_all("a")
# collect every <a> (anchor) tag; a hyperlink keeps its target in the tag's href attribute

In [None]:
len(links)
# counts every <a> tag on the page, including ones with no href (e.g. <a id="top"></a>)

346

In [None]:
links[:10]

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a class="image" href="/wiki/File:BU_School_of_Management.JPG"><img alt="BU School of Management.JPG" data-file-height="1000" data-file-width="1474" decoding="async" height="170" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/74/BU_School_of_Management.JPG/250px-BU_School_of_Management.JPG" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/74/BU_School_of_Management.JPG/375px-BU_School_of_Management.JPG 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/7/74/BU_School_of_Management.JPG/500px-BU_School_of_Management.JPG 2x" width="250"/></a>,
 <a class="mw-redirect" href="/wiki/Private_school" title="Private school">Private</a>,
 <a href="/wiki/Dean_(education)" title="Dean (education)">Dean</a>,
 <a href="/wiki/Undergraduate_education" title="Undergraduate education">Undergraduates</a>,
 <a href="/wiki/Postgraduate_educati

In [None]:
# pull the href out of each anchor that has one; anchors without an href are skipped
links_parsed = [anchor['href'] for anchor in links if anchor.has_attr("href")]
len(links_parsed)

344

In [None]:
links_parsed[:10]

['#mw-head',
 '#searchInput',
 '/wiki/File:BU_School_of_Management.JPG',
 '/wiki/Private_school',
 '/wiki/Dean_(education)',
 '/wiki/Undergraduate_education',
 '/wiki/Postgraduate_education',
 '/wiki/Boston',
 '/wiki/Massachusetts',
 '/wiki/United_States']

In [None]:
# pair each link's visible text with its target; anchors without an href are skipped
link_data = [
    (anchor.text, anchor['href'])
    for anchor in links
    if anchor.has_attr("href")
]
link_data[:10]

[('Jump to navigation', '#mw-head'),
 ('Jump to search', '#searchInput'),
 ('', '/wiki/File:BU_School_of_Management.JPG'),
 ('Private', '/wiki/Private_school'),
 ('Dean', '/wiki/Dean_(education)'),
 ('Undergraduates', '/wiki/Undergraduate_education'),
 ('Postgraduates', '/wiki/Postgraduate_education'),
 ('Boston', '/wiki/Boston'),
 ('Massachusetts', '/wiki/Massachusetts'),
 ('United States', '/wiki/United_States')]

In [None]:
# let's only pull links that are http*
import re

http_links = soup.find_all("a", attrs = {'href': re.compile("^http.*")})
http_links[:5]

[<a class="external text" href="http://www.bu.edu/questrom/" rel="nofollow">Questrom Website</a>,
 <a class="external text" href="http://management.bu.edu/about/facts/index.html" rel="nofollow">Boston University School of Management Students Facts and Figures</a>,
 <a class="external text" href="https://web.archive.org/web/20060515211324/http://management.bu.edu/about/facts/index.html" rel="nofollow">Archived</a>,
 <a class="external text" href="https://web.archive.org/web/20160304025610/http://www.bu.edu/interactive-design/2013/09/22/school-of-management-centennial/" rel="nofollow">"School of Management Centennial"</a>,
 <a class="external text" href="http://www.bu.edu/interactive-design/2013/09/22/school-of-management-centennial/" rel="nofollow">the original</a>]

In [None]:
http_links[0]['href']

'http://www.bu.edu/questrom/'

In [None]:
# keep just the target URL of each link whose href starts with "http" (matched above)
http_urls = [anchor['href'] for anchor in http_links]

In [None]:
http_urls[:5]

['http://www.bu.edu/questrom/',
 'http://management.bu.edu/about/facts/index.html',
 'https://web.archive.org/web/20060515211324/http://management.bu.edu/about/facts/index.html',
 'https://web.archive.org/web/20160304025610/http://www.bu.edu/interactive-design/2013/09/22/school-of-management-centennial/',
 'http://www.bu.edu/interactive-design/2013/09/22/school-of-management-centennial/']

## Parsing Web Tables

In [None]:
import pandas as pd
URL2 = 'https://en.wikipedia.org/wiki/List_of_Super_Bowl_champions'
tables = pd.read_html(URL2)
type(tables)

list

In [None]:
len(tables)

16

In [None]:
tables[1].head()

Unnamed: 0,Game,Date/Season,Winning team,Score,Losing team,Venue,City,Attendance,Ref
0,I[sb 1],"January 15, 1967 (1966 AFL/1966 NFL)","Green Bay Packersn(1, 1–0)",35–10,"Kansas City Chiefsa(1, 0–1)",Los Angeles Memorial Coliseum,"Los Angeles, California[sb 2]",61946,[12][13]
1,II[sb 1],"January 14, 1968 (1967 AFL/1967 NFL)","Green Bay Packersn(2, 2–0)",33–14,"Oakland Raidersa(1, 0–1)",Miami Orange Bowl,"Miami, Florida[sb 3]",75546,[14][13]
2,III[sb 1],"January 12, 1969 (1968 AFL/1968 NFL)","New York Jetsa(1, 1–0)",16–7,"Baltimore Coltsn(1, 0–1)",Miami Orange Bowl (2),"Miami, Florida (2)[sb 3]",75389,[15][13]
3,IV[sb 1],"January 11, 1970 (1969 AFL/1969 NFL)","Kansas City Chiefsa(2, 1–1)",23–7,"Minnesota Vikingsn(1, 0–1)",Tulane Stadium,"New Orleans, Louisiana",80562,[16][13]
4,V,"January 17, 1971 (1970)","Baltimore ColtsA(2, 1–1)",16–13,"Dallas CowboysN(1, 0–1)",Miami Orange Bowl (3),"Miami, Florida (3)[sb 3]",79204,[17][13]


## Downloading Files from Web

In [None]:
URL3 = 'https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv'

> wget

In [None]:
! wget https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv

--2020-11-05 01:41:13--  https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv
Resolving vincentarelbundock.github.io (vincentarelbundock.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to vincentarelbundock.github.io (vincentarelbundock.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3192560 (3.0M) [text/csv]
Saving to: ‘diamonds.csv’


2020-11-05 01:41:13 (11.1 MB/s) - ‘diamonds.csv’ saved [3192560/3192560]



# 2-APIs

## StackExchange API

![](https://i.stack.imgur.com/sByum.png)

https://api.stackexchange.com/docs

https://stackexchange.com/sites

In [None]:
BASE_URL = "https://api.stackexchange.com/2.2/"

### Answers Endpoint

In [None]:
# build a url for the answers endpoint
URL = BASE_URL + 'answers'
URL

'https://api.stackexchange.com/2.2/answers'

In [None]:
resp = requests.get(URL)
resp.status_code

400

In [None]:
resp.reason

'Bad Request'

In [None]:
resp.json()

{'error_id': 400,
 'error_message': 'site is required',
 'error_name': 'bad_parameter'}

In [None]:
# let's add site as vars
SITE = '?site=stackoverflow'
EP = 'answers'       #endpoint

In [None]:
URL1 = BASE_URL + EP + SITE
URL1

'https://api.stackexchange.com/2.2/answers?site=stackoverflow'

In [None]:
resp1 = requests.get(URL1) # the earlier 400 was 'site is required'; URL1 now carries ?site=stackoverflow
resp1.status_code

200

In [None]:
# parse the json into answers
answers = resp1.json()
answers['items'][:3]

[{'answer_id': 64690312,
  'content_license': 'CC BY-SA 4.0',
  'creation_date': 1604542702,
  'is_accepted': False,
  'last_activity_date': 1604542702,
  'owner': {'display_name': 'Sudaraka',
   'link': 'https://stackoverflow.com/users/3237589/sudaraka',
   'profile_image': 'https://graph.facebook.com/850714398/picture?type=large',
   'reputation': 15,
   'user_id': 3237589,
   'user_type': 'registered'},
  'question_id': 64690131,
  'score': 0},
 {'answer_id': 64690247,
  'content_license': 'CC BY-SA 4.0',
  'creation_date': 1604542251,
  'is_accepted': False,
  'last_activity_date': 1604542699,
  'last_edit_date': 1604542699,
  'owner': {'accept_rate': 90,
   'display_name': 'andrew_reece',
   'link': 'https://stackoverflow.com/users/2799941/andrew-reece',
   'profile_image': 'https://www.gravatar.com/avatar/260db848f0eb7e3cd55998969a7a43a4?s=128&d=identicon&r=PG&f=1',
   'reputation': 14782,
   'user_id': 2799941,
   'user_type': 'registered'},
  'question_id': 64690197,
  'score':

In [None]:
# get the answer with body text via the filter
aid = answers['items'][0]['answer_id']
EP = f"answers/{aid}"
FILTER = "&filter=withbody"
URL = BASE_URL + EP + SITE + FILTER
URL

'https://api.stackexchange.com/2.2/answers/64690312?site=stackoverflow&filter=withbody'

In [None]:
resp = requests.get(URL)
resp.status_code

200

In [None]:
resp.json()['items'][0]['body']

'<p>It is fairly straightforward to do this, instead of passing the whole data frame to <code>radachart()</code> function, just pass the required data for one site.</p>\n<pre><code>df = data.frame(ambient_air_temp = c(85, -40, 9.176667, 8.492500,  8.477500, 8.475000),\n                rel_hum = c(100.0000,0.0000,71.0700, 80.9600, 76.9875,76.5525 ),\n                bar_pre = c(2000.000, 10.000, 1013.167, 1014.000, 1012.675, 1013.775),\n                ave_ws = c(160,0,4.043333,2.035000,6.842500,6.335000),\n                pd2.5 = c(999.9, 0, 5.133333, 5.600000, 6.275000, 5.175000),\n                pd10 = c(1999.9, 0, 25.16667, 25.10000, 28.15000, 30.20000))\n\nrow.names(df) = c(&quot;1&quot;,&quot;2&quot;,&quot;arc1045&quot;,&quot;arc1047&quot;,&quot;arc1048&quot;,&quot;arc1050&quot;)\n\nlibrary(fmsb)\npltrd = function(idx) radarchart(df[c(1,2,idx),], title = row.names(df)[idx])\n</code></pre>\n<p>Since your data is in idx = {3,4,5}, you can call this function using those values</p>\n

### Questions Endpoint

In [None]:
SITE = '?site=stackoverflow'
EP = 'questions'

In [None]:
URL = BASE_URL + EP + SITE + FILTER
URL

'https://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=withbody'

In [None]:
resp = requests.get(URL)

In [None]:
resp.status_code

200

In [None]:
entries = resp.json()
entries['items'][0]

{'answer_count': 1,
 'body': '<p>Is it possible to do something like the following?</p>\n<pre class="lang-scala prettyprint-override"><code>@FunctionHint(\n  output = new DataTypeHint(s&quot;is_$role BOOLEAN&quot;)\n)\nclass Func(role: String) extends TableFunction[Boolean] {\n  ...\n}\n</code></pre>\n<p>Notice the use of <code>role</code> in the Annotation which is a field on the Object being annotated.</p>\n',
 'content_license': 'CC BY-SA 4.0',
 'creation_date': 1604449925,
 'is_answered': False,
 'last_activity_date': 1604543169,
 'last_edit_date': 1604453864,
 'link': 'https://stackoverflow.com/questions/64672716/scala-access-objects-field-within-class-annotation',
 'owner': {'accept_rate': 76,
  'display_name': 'Awesome-o',
  'link': 'https://stackoverflow.com/users/1979235/awesome-o',
  'profile_image': 'https://www.gravatar.com/avatar/a0c6779dea8d581a86ee5a0dbf1eb5b9?s=128&d=identicon&r=PG',
  'reputation': 1790,
  'user_id': 1979235,
  'user_type': 'registered'},
 'question_id

### Questions by tags

You might need to look through the API docs listed above.

1. Write a function to get the questions that have a tag input by the user
1. Return a __list of dictionaries__, one per question in the response, each holding the __question id, title, and tags__ as key:value pairs

In [None]:
def fun(tagz):
    """Return recent Stack Overflow questions tagged with ``tagz``.

    The tag is sent to the ``questions`` endpoint through the API's ``tagged``
    query parameter, so the filtering is done by the server rather than by
    scanning whichever page of recent questions an unfiltered request returns.

    Parameters
    ----------
    tagz : str
        Tag to search for, e.g. ``'python'``. Surrounding whitespace is
        ignored and the tag is lower-cased before it is sent (the tags in the
        API responses are lower-case, so an exact match on ``'R'`` never hits).

    Returns
    -------
    list of dict
        One ``{'question_id', 'title', 'tags'}`` dictionary per question in
        the response. Empty when ``tagz`` is blank or when the request does
        not come back with status 200.

    Notes
    -----
    Reads the notebook-level constants ``BASE_URL``, ``SITE`` and ``FILTER``.
    """
    tag = tagz.strip().lower()
    if not tag:
        return []

    # The endpoint is spelled out instead of read from the global ``EP``:
    # other cells reassign it (e.g. to 'answers/<id>'), which would silently
    # point this function at the wrong endpoint.
    url = BASE_URL + 'questions' + SITE + FILTER
    # requests URL-encodes the value and appends it to the existing query string.
    resp = requests.get(url, params={'tagged': tag})
    if resp.status_code != 200:
        return []

    return [
        {'question_id': q['question_id'], 'title': q['title'], 'tags': q['tags']}
        for q in resp.json().get('items', [])
    ]

In [None]:
def fun(tagz):
    """Return recent Stack Overflow questions tagged with ``tagz``.

    Queries the Stack Exchange ``questions`` endpoint and lets the API do the
    tag filtering through its ``tagged`` query parameter.

    Parameters
    ----------
    tagz : str
        Tag to search for, e.g. ``'python'``. Surrounding whitespace is
        ignored and the tag is lower-cased before it is sent (the tags in the
        API responses are lower-case).

    Returns
    -------
    list of dict
        One ``{'question_id', 'title', 'tags'}`` dictionary per question in
        the response. Empty when ``tagz`` is blank or when the request does
        not come back with status 200.
    """
    BASE_URL = "https://api.stackexchange.com/2.2/"
    EP = 'questions'

    tag = tagz.strip().lower()
    if not tag:
        return []

    # Passing the query as a dict lets requests URL-encode the values, so a
    # tag such as 'c#' is not cut off as a URL fragment.
    query = {'site': 'stackoverflow', 'filter': 'withbody', 'tagged': tag}
    resp = requests.get(BASE_URL + EP, params=query)
    if resp.status_code != 200:
        return []

    return [
        {'question_id': q['question_id'], 'title': q['title'], 'tags': q['tags']}
        for q in resp.json().get('items', [])
    ]
 

In [None]:
fun('python')

https://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=withbody&tags=python


[{'question_id': 64690523,
  'tags': ['python'],
  'title': 'How to make a bot in python?'},
 {'question_id': 64690519,
  'tags': ['python', 'tensorflow', 'masking'],
  'title': 'Pad Tensorflow training data without affecting results'},
 {'question_id': 64690509,
  'tags': ['python', 'python-3.x', 'pygame'],
  'title': 'How do I make my orange&#39;s randomizefall pthon?'},
 {'question_id': 64690508,
  'tags': ['python', 'django'],
  'title': 'Limit Foreign Key Options in Django Select'}]

In [None]:
fun('R')

https://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=withbody&tags=R


[]

In [None]:
fun('html')

https://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=withbody&tags=html


[{'question_id': 60394733,
  'tags': ['javascript', 'html', 'reactjs'],
  'title': 'React - KeyBoard Navigation on ul li list'},
 {'question_id': 64690514,
  'tags': ['javascript', 'html', 'math', 'coordinates'],
  'title': 'Creating a resizable/draggable/rotate view in javascript'},
 {'question_id': 64689351,
  'tags': ['javascript', 'html', 'regex', 'function'],
  'title': 'checking each function while using onsubmit button javascript'}]