# Read csv

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame.
df = pd.read_csv('results.csv')
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
df.info()
print(df.head())

# Read xlsx

In [None]:
import pandas as pd
# Open the workbook once; the `xlsx` handle is reused by the following cell to parse a sheet.
xlsx = pd.ExcelFile('results.xlsx')
# List the sheet names available in the workbook.
print(xlsx.sheet_names)

In [None]:
# Load the sheet named 'results' from the already-open workbook into a DataFrame.
df = xlsx.parse('results')
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() would add a stray "None" line to the output.
df.info()
print(df.head())

# Read from HTML

In [None]:
# Import packages: requests performs the HTTP call, BeautifulSoup parses the returned HTML
import requests
from bs4 import BeautifulSoup

# Address of the page to fetch, kept in `url` for the cells below
url = 'https://en.wikipedia.org/wiki/2018_Pacific_typhoon_season'

In [None]:
# Package the request, send the request and catch the response: r
# A timeout keeps the cell from hanging forever when the server never answers
# (requests waits indefinitely unless one is given).
r = requests.get(url, timeout=30)
print(r) # HTTP status codes: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
# Show the attributes and methods available on the response object
print(dir(r))

In [None]:
# Extracts the response as html: html_doc
# r.text is the response body decoded to a str.
html_doc = r.text
# NOTE(review): this prints the entire page source, which can be very long.
print(html_doc)

In [None]:
# Create a BeautifulSoup object from the HTML: soup
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(html_doc, 'html.parser')

In [None]:
# Get the title of webpage: web_title
# soup.title is the <title> element itself (markup included), not only its text.
web_title = soup.title
print(web_title)

In [None]:
# Prettify the BeautifulSoup object: pretty_soup
# prettify() returns the parsed document as an indented string.
pretty_soup = soup.prettify()

# Print the prettified document
print(pretty_soup)

In [None]:
# Get web text: web_text
# get_text() returns the document's text content with the markup removed.
web_text = soup.get_text()

# Print web text to the shell
print(web_text)

In [None]:
# Collect every <a> element (hyperlink) in the document: a_tags
# Reference list of HTML tags: https://www.w3schools.com/tags/ref_byfunc.asp
a_tags = soup.find_all('a')

# Show each link target, one per line (None for anchors without an href)
for anchor in a_tags:
    target = anchor.get('href')
    print(target)

# Exercise 1

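Try to collect the address of every image on the page, and keep the list of image tags in a variable named `img_tags` (the download cell below iterates over it). <br>
Hint: each image is an `img` tag whose `src` attribute holds the address.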

In [None]:
# ...

In [None]:
import os
import urllib.request as ur
from urllib.parse import urljoin

# Download every PNG referenced by the image tags in `img_tags` (built in
# Exercise 1) into a local 'png' directory.
os.makedirs('png', exist_ok=True)  # create the target folder once, up front

for link in img_tags:
    imgurl = link.get('src')
    # Tags without a src attribute yield None; skip those and non-PNG images.
    if not imgurl or 'png' not in imgurl:
        continue
    filename = os.path.basename(imgurl)
    try:
        # `url` is the page address set in the "Read from HTML" section.
        # urljoin resolves protocol-relative ('//host/x.png') and relative
        # sources against it while leaving absolute URLs untouched.
        ur.urlretrieve(urljoin(url, imgurl), os.path.join('png', filename))
    except (OSError, ValueError):
        # Best effort: report the failed image and carry on with the rest.
        # urllib's URLError/HTTPError are OSError subclasses; ValueError
        # covers malformed addresses.
        print(filename + ' Get Error.')
        continue


# Read from API

In [None]:
# Data Dictionary: http://resource.data.one.gov.hk/ogcio/carpark/Parking_Vacancy_Data_Specification.pdf
# Import packages
from urllib.request import urlopen, Request

# Specify the url
# The commented-out address is the data=vacancy variant of the same endpoint (used in Exercise 2):
# url = "https://api.data.gov.hk/v1/carpark-info-vacancy?data=vacancy&vehicleTypes=privateCar"
url = "https://api.data.gov.hk/v1/carpark-info-vacancy?data=info&vehicleTypes=privateCar&lang=zh_TW"

# This packages the request: request
# Only the Request object is built here; it is sent by urlopen() in the next cell.
request = Request(url)
print(type(request))

In [None]:
# Sends the request and catches the response: response
# The body is not read here; the following cell calls response.read().
response = urlopen(request)
print(response)
# Print the datatype of response
print(type(response))

In [None]:
# Read the whole response body: output
output = response.read()
# Print the raw body (the next cell parses it as JSON)
print(output)

In [None]:
import json
# Parse the JSON response body into Python objects: outjson
outjson = json.loads(output)
print(outjson)

In [None]:
import pandas as pd
# Build a DataFrame from the value stored under the 'results' key of the parsed JSON
df = pd.DataFrame(outjson['results'])
# Preview the first rows, then list the column labels
print(df.head())
print(df.columns)

In [None]:
import pandas as pd

# json_normalize flattens nested records into a flat table.
# It is exposed as pandas.json_normalize; importing it from pandas.io.json
# is the deprecated path.
df = pd.json_normalize(outjson['results'])
print(df.head())
print(df.columns)

# Exercise 2

Try to read the data from the following URL: <br>
url = "https://api.data.gov.hk/v1/carpark-info-vacancy?data=vacancy&vehicleTypes=privateCar"

In [None]:
#...

# 5-Number Summary

In [None]:
import numpy as np

# 1000 draws from a normal distribution centred at 0.75 (scale left at its default)
x = np.random.normal(0.75, size=1000)

# Note: these are min, mean, median, standard deviation and max, which is not
# the quartile-based five-number summary.
summary = (np.min(x), np.mean(x), np.median(x), np.std(x), np.max(x))
print(*summary)

In [None]:
import pandas as pd
# Wrap the 1-D sample in a single-column DataFrame
df = pd.DataFrame(x)
# describe() reports count, mean, std, min, the quartiles and max (shown as the cell output)
df.describe()

# Moments

In [None]:
# Effect of one missing value on the mean
x_with_nan = np.append(x, np.nan)

print(np.mean(x))              # mean of the clean sample
print(np.mean(x_with_nan))     # a single NaN turns the result into NaN
print(np.nanmean(x_with_nan))  # nanmean ignores NaN entries

In [None]:
# Effect of one missing value on the variance
x_with_nan = np.append(x, np.nan)

print(np.var(x))              # variance of the clean sample
print(np.var(x_with_nan))     # a single NaN turns the result into NaN
print(np.nanvar(x_with_nan))  # nanvar ignores NaN entries

In [None]:
import scipy.stats as stats
# Reference: https://docs.scipy.org/doc/scipy/reference/stats.html

# Sample skewness of x
print(stats.skew(x))
# stats.kurtosis uses Fisher's definition by default (excess kurtosis: 0 for a normal distribution)
print(stats.kurtosis(x))

In [None]:
# 10,000 draws from a chi-squared distribution with 2 degrees of freedom
chi_squared_df2 = np.random.chisquare(df=2, size=10000)
# Sample skewness (displayed as the cell output)
stats.skew(chi_squared_df2)

In [None]:
# 10,000 draws from a chi-squared distribution with 5 degrees of freedom
chi_squared_df5 = np.random.chisquare(df=5, size=10000)
# Sample skewness (displayed as the cell output)
stats.skew(chi_squared_df5)

In [None]:
# Backend selection; the commented-out line is the static alternative.
# %matplotlib inline
# NOTE(review): the `notebook` backend may not be available in every Jupyter front end - confirm, or switch to inline.
%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt

# Draw both chi-squared samples as step (outline) histograms with 50 bins.
# NOTE(review): `output` is also the name given to the API response body earlier in this notebook; this assignment replaces it.
output = plt.hist([chi_squared_df2,chi_squared_df5], bins=50, histtype='step', 
                  label=['2 degrees of freedom','5 degrees of freedom'])
plt.legend(loc='upper right')

# Exercise 3

Repeat the steps above, this time using `np.random.standard_t(5, size=10000)` and `np.random.standard_t(15, size=10000)`. <br>
Calculate the kurtosis of each sample and draw their histograms.

In [None]:
# ...

# Probability and Distribution

In [None]:
%matplotlib notebook
import numpy as np
from scipy.stats import binom
# List of available distributions - https://docs.scipy.org/doc/scipy/reference/stats.html

# Binomial distribution with n=10 trials and success probability 0.5.
# pmf and cdf evaluated at k = 0..19
bino_pmf = binom.pmf(np.arange(20), 10, 0.5)
bino_cdf = binom.cdf(np.arange(20), 10, 0.5)
# ppf is the inverse cdf: the 5% quantile
bino_q005 = binom.ppf(0.05, 10, 0.5)
# 1000 random draws from the same distribution
bino_x = np.random.binomial(10, 0.5, 1000)

In [None]:
import matplotlib.pyplot as plt

# Print the 5% quantile of Binomial(10, 0.5). The variable computed earlier is
# named bino_q005; the previous name bino_p005 was never defined and raised a
# NameError.
print(bino_q005)

# Overlay the pmf and the cdf on the same axes for k = 0..19.
plt.plot(np.arange(20), bino_pmf, '-', np.arange(20), bino_cdf, '-')
plt.show()

In [None]:
# Histogram of the 1000 simulated binomial draws (default binning)
plt.hist(bino_x)

In [None]:
# Five standard-normal draws taken before the seed is set in this cell
print(np.random.normal(0,1,5))
# Fix the global NumPy seed so that the next draw is reproducible
np.random.seed(0)
print(np.random.normal(0,1,5))

In [None]:
# Re-seeding with the same value before each draw reproduces the same five numbers
np.random.seed(9527)
print(np.random.normal(0,1,5))
np.random.seed(9527)
print(np.random.normal(0,1,5))

# Exercise 4

For the standard normal distribution $N(0,1)$, compute the pdf and the cdf, and find the quantile $q(0.025)$ (approximately $-1.96$). <br>
Evaluate the pdf and cdf over the grid `np.arange(-3.0, 3.0, 0.1)`.

In [None]:
# ...