In [None]:
import numpy as np
import scipy.linalg as la
import sympy as sym
sym.init_printing(use_unicode=False, wrap_line=False, no_global=True)


%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
#mpl.rcParams['savefig.dpi'] = 80
mpl.rcParams['figure.dpi'] = 80
# from IPython.display import set_matplotlib_formats
# set_matplotlib_formats('png', 'pdf')
%config InlineBackend.figure_format = 'retina'
#https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/



import seaborn as sns
sns.set()
#sns.set_style(style= "whitegrid")
#plt.style.available
plt.style.use('fivethirtyeight')



# 0. Monte Carlo integration in Python

ref: 

http://barnesanalytics.com/monte-carlo-integration-in-python

http://people.duke.edu/~ccc14/sta-663-2016/15C_MonteCarloIntegration.html

https://people.duke.edu/~ccc14/sta-663/MonteCarlo.html

## 0.1 Monte Carlo integration

Suppose we want to find the value of

$$\int_a^b f(x) dx$$

in some region with volume $V$. 

**Monte Carlo integration estimates this integral by estimating the fraction of random points (drawn uniformly from the region) that fall below $f(x)$ and multiplying that fraction by $V$.**

## 0.2 Monte Carlo integration to estimate the expectation

In a statistical context, we use Monte Carlo integration to estimate the expectation

$$E[h(X)] = \int_X h(x) f(x) dx$$

with


$$\bar{h_n} = \frac{1}{n} \sum_{i=1}^n h(x_i)$$

where $x_i \sim f$ is a draw from the density $f$.

We can estimate the Monte Carlo variance of the approximation as

$$v_n = \frac{1}{n^2} \sum_{i=1}^n (h(x_i) - \bar{h_n})^2$$

Also, from the Central Limit Theorem,

$$\frac{\bar{h_n} - E[h(X)]}{\sqrt{v_n}} \sim \mathcal{N}(0, 1)$$


The error of Monte Carlo integration decreases as $\mathcal{O}(n^{-1/2})$, independent of the dimensionality. Hence Monte Carlo integration generally beats numerical integration for moderate- and high-dimensional problems, since the error of grid-based numerical integration (quadrature) with $n$ points in $d$ dimensions decreases only as $\mathcal{O}(n^{-k/d})$ for a rule of order $k$, which deteriorates rapidly as $d$ grows. 

Even for low dimensional problems, Monte Carlo integration may have an advantage when the volume to be integrated is concentrated in a very small region and we can use information from the distribution to draw samples more often in the region of importance.

##### Example
We want to estimate the integral $\int_0^1 e^x dx$. On $[0, 1]$ the integrand takes its minimum value $1$ at $x=0$ and its maximum value $e$ at $x=1$.

In [None]:
# Hit-or-miss picture: the curve y = exp(x) on [a, b] together with uniform
# random points scattered over its bounding box [a, b] x [0, exp(b)].
a,b = 0,1
y_max = np.exp(b)  # exp is increasing, so its maximum on [a, b] is at x = b

x = np.linspace(a, b, 100)
fig = plt.figure( figsize = (8,8) )
ax = fig.add_subplot(111)
ax.plot(x, np.exp(x));
# Sample x and y separately so the box is correct for any [a, b]; scaling a
# uniform(a, b) draw by e only yields y in [0, e] when a = 0 and b = 1.
pts = np.column_stack((np.random.uniform(a, b, 500),
                       np.random.uniform(0, y_max, 500)))
ax.scatter(pts[:, 0], pts[:, 1])
ax.set_xlim([a, b])
ax.set_ylim([0, y_max]);

In [None]:
# sympy: integrate exp(x) symbolically over [0, 1], then evaluate numerically
import sympy
x = sympy.Symbol('x')
exact_value = sympy.integrate(sympy.exp(x), (x, 0, 1))
exact_value.evalf()

In [None]:
# scipy: numerical quadrature of exp over [0, 1];
# quad returns the pair (integral estimate, estimate of the absolute error)
from scipy import integrate
integrate.quad(np.exp, a=0, b=1)

We can rewrite it as $\int_0^1 e^x \cdot 1 \, dx$, i.e. as an expectation of $h(x) = e^x$ where the density $f(x) = 1$ is that of the uniform distribution on $[0, 1]$.

$$E[h(X)] = \int_0^1 e^x 1 dx$$

with


$$\bar{h_n} = \frac{1}{n} \sum_{i=1}^n e^{x_i}$$

In [None]:
# Monte Carlo approximation
# since we can transform it to E[h(X)] with h(x) = exp(x) and X ~ Uniform(0, 1),
# the sample mean of exp(x_i) over n uniform draws estimates the integral.
# NOTE: the largest run draws 10**8 samples in a single array, which needs
# several hundred MB of memory.
for n in 10**np.array([1,2,3,4,5,6,7,8]):
    x = np.random.uniform(0, 1, n)
    sol = np.mean(np.exp(x))
    print('%10d %.6f' % (n, sol))

In [None]:
# basic idea: hit-or-miss Monte Carlo integration.
# Scatter uniform random points over a box that encloses the curve and
# multiply the fraction that lands under the curve by the area of the box.
#
# This cell used to define `my_integrate` twice, so the slow loop version was
# silently shadowed by the vectorized one. The loop version is kept under its
# own name as a readable reference; `my_integrate` is the vectorized version.
def my_integrate_loop(a,b,func=np.exp,n=100000):
    """Hit-or-miss estimate of the integral of ``func`` over [a, b], one point at a time.

    Slow reference implementation (pure Python loop). ``func`` is assumed to be
    non-negative on [a, b]; the box spans [a, b] x [0, max(func) + 1], where the
    maximum is taken over 1000 evenly spaced grid points.

    Parameters
    ----------
    a, b : float
        Integration bounds.
    func : callable
        Integrand; must accept both a NumPy array and a scalar.
    n : int
        Number of random points (must be at least 1).

    Returns
    -------
    tuple
        ``(estimate, xs, ys, check)`` where ``xs``/``ys`` are lists of the
        sampled coordinates and ``check`` is a list of 0/1 hit flags.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    X=np.linspace(a,b,1000)
    y1=0
    y2=np.max(func(X))+1  # +1 leaves a safety margin above the sampled maximum
    # %s (not %d) so that non-integer bounds are not truncated in the log line
    print('a:%s b:%s y1:%.6f y2:%.6f' % (a,b,y1,y2))
    area=(b-a)*(y2-y1)
    check=[]
    xs=[]
    ys=[]
    for _ in range(n):
        # scalar draws (no size argument) instead of length-1 arrays
        x=np.random.uniform(a,b)
        y=np.random.uniform(y1,y2)
        xs.append(x)
        ys.append(y)
        # a point is a hit when it lies between the x-axis and the curve
        if abs(y)<=abs(func(x)) and y>=0:
            check.append(1)
        else:
            check.append(0)
    print(np.mean(check))
    return(np.mean(check)*area,xs,ys,check)


# vectorization by using numpy (the loop above is too slow for large n)
def my_integrate(a,b,func=np.exp,n=100):
    """Hit-or-miss estimate of the integral of ``func`` over [a, b] (vectorized).

    ``func`` is assumed to be non-negative on [a, b]; the box spans
    [a, b] x [0, max(func)], where the maximum is taken over 1000 evenly spaced
    grid points. Prints the box and the observed hit fraction.

    Parameters
    ----------
    a, b : float
        Integration bounds.
    func : callable
        Integrand; must accept a NumPy array and return an array.
    n : int
        Number of random points (must be at least 1).

    Returns
    -------
    tuple
        ``(estimate, xs, ys, check)`` where ``xs``/``ys`` are arrays of the
        sampled coordinates and ``check`` is a boolean array that is True for
        points under the curve.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    X=np.linspace(a,b,1000)
    y1=0
    y2=np.max(func(X))
    # %s (not %d) so that non-integer bounds are not truncated in the log line
    print('a:%s b:%s y1:%.6f y2:%.6f' % (a,b,y1,y2))
    area=(b-a)*(y2-y1)
    xs=np.random.uniform(a,b,n)
    ys=np.random.uniform(y1,y2,n)
    # a point is a hit when it lies between the x-axis and the curve
    check=(np.abs(ys)<=np.abs(func(xs))) & (ys>=0)
    print(np.mean(check))
    return(np.mean(check)*area,xs,ys,check)

In [None]:
# Repeat the hit-or-miss estimate with growing sample sizes; only the first
# element of the returned tuple (the integral estimate) is needed here.
for n in 10**np.array([2,4,6,7]):
    estimate, *_samples = my_integrate(a,b,np.exp,n)
    print('n:%10d solution:%.6f' % (n, estimate))

In [None]:
import pandas as pd
# Draw 1000 points and collect coordinates plus the hit flag in one frame.
_,x,y,c=my_integrate(a,b,n=1000)
df=pd.DataFrame({'x': x, 'y': y, 'c': c})

fig, ax = plt.subplots(figsize=(8,8))

# the curve being integrated
X=np.linspace(a,b,1000)
ax.plot(X,np.exp(X))

# misses (above the curve) in red, hits (under the curve) in blue
hits = df['c']
misses = ~hits
ax.scatter(df.loc[misses,'x'],df.loc[misses,'y'],color='red')
ax.scatter(df.loc[hits,'x'],df.loc[hits,'y'],color='blue')
plt.show()

##  1. basics of HTML 

https://blog.hartleybrody.com/web-scraping-cheat-sheet/

https://data-lessons.github.io/library-webscraping/

https://medium.freecodecamp.org/how-to-scrape-websites-with-python-and-beautifulsoup-5946935d93fe

https://www.dataquest.io/blog/web-scraping-beautifulsoup/

https://realpython.com/python-web-scraping-practical-introduction/

http://docs.python-guide.org/en/latest/scenarios/scrape/

### 1.0.1 One example

In [None]:
%%html
<!DOCTYPE html>
<html>
    <head>
        <!-- Minimal page: an empty head and a body with one heading and one paragraph. -->
    </head>
    <body>
        <h1> First Scraping </h1>
        <p> Hello World </p>
    </body>
</html>

 ## 1.1 The basic syntax of an HTML webpage
 
    1. <!DOCTYPE html>: HTML documents must start with a type declaration.

    2. The HTML document is contained between <html> and </html>.

    3. The meta and script declaration of the HTML document is between <head> and </head>.

    4. The visible part of the HTML document is between <body> and </body> tags.

    5. Title headings are defined with the <h1> through <h6> tags.

    6. Paragraphs are defined with the <p> tag.

    7. Other useful tags include <a> for hyperlinks, <table> for tables, <tr> for table rows, and <td> for table cells.

    8. Also, HTML tags sometimes come with `id` or `class` attributes. The id attribute specifies a unique id for an HTML tag and the value must be unique within the HTML document. 
    
    9. The class attribute is used to define equal styles for HTML tags with the same class. We can make use of these ids and classes to help us locate the data we want.
    
    
more on [html](https://www.w3schools.com/html/html_intro.asp)    

### 1.0.2 One example

In [None]:
%%html
<!DOCTYPE html>
<html>
<head>
    <!-- Inline CSS: rules select elements by tag name (a, h1) or by class (.item). -->
    <style>
        a { color: purple; }

        h1 {
            font-size: 2em;
            color: white;
            background-color: black;
        }

        .item { color: red; }
    </style>
</head>
<body>
    <h1>My Watch</h1>
    <!-- A container with a unique id, holding one class="item" block per product. -->
    <div id="listings_prices">
        <div class="item">
            <li class="item_name">Watch</li>
            <div class="main_price">Price: $66.68</div>
            <div class="discounted_price">Discounted price: $46.68</div>
        </div>
        <div class="item">
            <li class="item_name">Watch2</li>
            <div class="main_price">Price: $56.68</div>
        </div>
    </div>
</body>
</html>

## 1.1 Python script using 'class' attributes

    from bs4 import BeautifulSoup
    import requests
    page_link ='test.html'
    # test.html is a local file, so read it from disk: requests.get() only
    # accepts URLs with a scheme (http://, https://) and raises MissingSchema
    # for a bare file name. For a remote page use
    # requests.get(url, timeout=5).content instead.
    # Binary mode lets BeautifulSoup detect the encoding itself.
    with open(page_link, 'rb') as page_file:
        # parse html
        page_content = BeautifulSoup(page_file, "html.parser")

    # extract all html elements where price is stored
    prices = page_content.find_all(class_='main_price')
    # prices has a form:
    #[<div class="main_price">Price: $66.68</div>,
    # <div class="main_price">Price: $56.68</div>]

    # you can also access the main_price class by specifying the tag of the class
    prices = page_content.find_all('div', attrs={'class':'main_price'})

## 1.2 One real example


https://www.bloomberg.com/quote/SPX:IND

From the result, we can see that the price is inside a few levels of HTML tags, which is `<div class="basic-quote">` → `<div class="price-container up">` → `<div class="price">`.



Similarly, if you hover and click the name “S&P 500 Index”, it is inside `<div class="basic-quote">` and `<h1 class="name">`.

In [None]:
from bs4 import BeautifulSoup
import requests
page_link ='https://www.bloomberg.com/quote/SPX:IND'
# fetch the content from url
page_response = requests.get(page_link, timeout=5)
# fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would only surface later as empty find_all() results
page_response.raise_for_status()
# parse html
page_content = BeautifulSoup(page_response.content, "html.parser")




### Price info

In [None]:
# extract all html elements where price is stored (any tag with class "price")
prices = page_content.find_all(class_='price')

prices

In [None]:
# you can also access the price class by specifying the tag of the class
# (here: only <div> elements whose class is "price")
prices1 = page_content.find_all('div', attrs={'class':'price'})
prices1

BeautifulSoup returns its results in its own bs4 data structures, which behave like the built-in Python ones. 

So `find_all` yields a `bs4.element.ResultSet` (a subclass of `list`) and each individual entry is a `bs4.element.Tag`.

In [None]:
type(prices1)

In [None]:
type(prices1[0])

In [None]:
prices1[0].text

### Ticker info

In [None]:
# extract all html elements where the ticker is stored (any tag with class "ticker")
ticker = page_content.find_all(class_='ticker')
ticker

In [None]:
# you can also access the ticker class by specifying the tag of the class
# (here: only <div> elements whose class is "ticker")
ticker1 = page_content.find_all('div', attrs={'class':'ticker'})
ticker1

In [None]:
ticker1[0].text.strip() # strip() removes leading and trailing whitespace

## 2 The Watch brand, price, and information

https://www.thebay.com/webapp/wcs/stores/servlet/en/thebay/brand/komono?sre=MHP_WL_WHATSNEW3_L3

Extract the brand, price and information for the watch on the webpage.

In [None]:
page_link ='https://www.thebay.com/webapp/wcs/stores/servlet/en/thebay/brand/komono?sre=MHP_WL_WHATSNEW3_L3'
# fetch the content from url
page_response = requests.get(page_link, timeout=5)
# fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would only surface later as empty find_all() results
page_response.raise_for_status()
# parse html
page_content = BeautifulSoup(page_response.content, "html.parser")



In [None]:
print(page_content)

In [None]:
# extract all html elements where price is stored (any tag with class "pro_price_black")
prices = page_content.find_all(class_='pro_price_black')

# you can also access the pro_price_black class by specifying the tag of the class
# (this second call overwrites `prices` with the <div>-only result)
prices = page_content.find_all('div', attrs={'class':'pro_price_black'})

In [None]:
%%html
<div class="pro_price_black" id="WC_CatalogEntryDBThumbnailDisplayJSPF_6555148_div_10">	
	$295.00
	</div>

In [None]:
prices

In [None]:
[i.text.strip() for i in prices ]

In [None]:
%%html
<a class="tit" onmouseover="javascript:this.href = catEntryDisplayUrl_6555148" href="http://www.thebay.com/webapp/wcs/stores/servlet/en/thebay/walther-retrograde-chronograph-rose-goldtone-metal-mesh-bracelet-watch-0600089731631--24" escapexml="false">KOMONO</a>

In [None]:
# extract all html elements where brand name is stored
brands = page_content.find_all(class_='tit')

brands

In [None]:
[i.text.strip() for i in brands ]

In [None]:
%%html
<div class="info" id="WC_CatalogEntryDBThumbnailDisplayJSPF_6555148_div_9b">
		<a id="WC_CatalogEntryDBThumbnailDisplayJSPF_6555148_link_9b" onmouseover="javascript:this.href = catEntryDisplayUrl_6555148" href="http://www.thebay.com/webapp/wcs/stores/servlet/en/thebay/walther-retrograde-chronograph-rose-goldtone-metal-mesh-bracelet-watch-0600089731631--24" escapexml="false">
				Walther Retrograde Chronograph Rose Goldtone Metal Mesh Bracelet Watch
		</a>
	</div>


In [None]:
# extract all html elements where the product description is stored (any tag with class "info")
infos = page_content.find_all(class_='info')

infos

In [None]:
infos[0].text.strip() # strip() removes leading and trailing whitespace

In [None]:
[i.text.strip() for i in infos ]

In [None]:
type([i.text.strip() for i in infos ])

In [None]:
import pandas as pd
# DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0.
# A plain dict preserves insertion order, so the columns still come out as
# price, brand, info. All three lists must have the same length.
mydataframe = pd.DataFrame({
    'price': [i.text.strip() for i in prices],
    'brand': [i.text.strip() for i in brands],
    'info': [i.text.strip() for i in infos],
})
#http://pbpython.com/pandas-list-dict.html

In [None]:
mydataframe 

## 3 A set of procedures

Putting several of these concepts together, here’s a common idiom: iterate over a bunch of container tags and pull out content from each of them.

    # one container tag per product: pull title, price and link out of each
    for product in page_content.find_all("div", "products"):
        product_title = product.find("h3").text
        product_price = product.find("span", "price").text
        product_url = product.find("a")["href"]
        # print() is a function in Python 3 (the bare print statement is Python 2 only)
        print("{} is selling for {} at {}".format(product_title, product_price, product_url))

In [None]:
from bs4 import BeautifulSoup
import requests
page_link ='https://www.bestbuy.ca/en-CA/Search/SearchResults.aspx?&filter=category%3aComputers+%26+Tablets%3bcategory%3aTablets+%26+iPads%3bcategory%3aApple+iPads%3bcustom0ipadmodelseries%3aiPad+Pro&lang=en-ca'
# fetch the content from url
page_response = requests.get(page_link, timeout=5)
# fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would only surface later as empty find_all() results
page_response.raise_for_status()
# parse html
page_content = BeautifulSoup(page_response.content, "html.parser")


In [None]:
#page_content

In [None]:

# select every element whose class is 'prod-info', regardless of tag
# (presumably one per product listing -- confirm against the page source)
stats = page_content.find_all(class_='prod-info')
stats

In [None]:
stats[0].find("h4").text

In [None]:
stats[0].find(class_='prodprice').text.strip()

In [None]:
stats[0].find("a")["href"]

In [None]:
# Report title, price and link for every 'prod-info' element on the page.
listing_line = "{} is selling for {} at {}"
for card in page_content.find_all(class_='prod-info'):
    print(listing_line.format(
        card.find("h4").text,
        card.find(class_='prodprice').text.strip(),
        card.find("a")["href"],
    ))

## Pandas for tables

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# switches the plotting style to 'ggplot' from this cell onwards
matplotlib.style.use('ggplot')

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'

# read_html parses the page's <table> elements into a list of DataFrames;
# header=0 uses the first row as column names, index_col=0 the first column as index.
# NOTE(review): thousands=' ' replaces the default ',' separator -- confirm that
# this matches how numbers are formatted on the page.
tables = pd.read_html(url,thousands=' ', header=0, index_col=0)


In [None]:
tables[2]