# Week 3

<img src="IllustrationCentralTheorem.png" width="600">

This week we started out heavy with a lot of web scraping. We learned about MongoDB, a database built on the NoSQL paradigm, which works much better for storing data scraped from the web. I'm not too familiar with HTML or web development, so there were a ton of new concepts and terminology which I am still absorbing. MongoDB is a lot more flexible than SQL given its lack of constraints on collections (tables in SQL).

On day three we started getting into the mathematics with a day on Probability Theory. It was foundational, to make sure everyone is up to speed. I was more comfortable with the math going into the program, so it was a nice change.

**Topics Covered:**

* Web Scraping
 * BeautifulSoup
* MongoDB
* High-performance Python
 * AWS (EC2, S3)
 * Threading
 * Multiprocessing
* Probability
 * Chain rule
 * Bayes' Theorem
 * Law of Total Probability
* Probability Distributions
 * Bernoulli
 * Binomial
 * Poisson
 * Exponential
 * Geometric
 * Gaussian
 * Joint Distributions
 * Covariance
 * Pearson and Spearman Correlation Coefficients
* Random Variables
* Sampling and Estimation
 * Model fitting
 * Method of Moments
 * Maximum Likelihood Function
 * Confidence Intervals
* Central Limit Theorem
 * Sampling Distributions
 * Bootstrap
* Hypothesis Testing
 * z-tests and t-tests

**Some code examples:**

```python
import requests
import os 
from pymongo import MongoClient
from bs4 import BeautifulSoup
import time

def single_query(link, payload):
    """Issue one GET request and return the decoded JSON body.

    Args:
        link: URL to request.
        payload: Mapping sent as the query-string parameters.

    Returns:
        The parsed JSON response when the status code is 200. For any other
        status, the code is printed after 'WARNING' and ``None`` is returned.
    """
    reply = requests.get(link, params=payload)
    if reply.status_code == 200:
        return reply.json()
    print('WARNING', reply.status_code)
    return None

if __name__ == '__main__':
    # Pull the newest articles from the NYT Article Search API, fetch each
    # article's web page, and store the API record together with the raw HTML
    # and the extracted text in the 'articles' collection of the 'NYT' database.
    client = MongoClient()
    db = client['NYT']
    tab = db['articles']
    link = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
    # Raises KeyError if NYT_API_KEY is not set in the environment.
    payload = {'api-key': os.environ['NYT_API_KEY'], 'sort': 'newest'}
    num_pages = 1  # number of result pages to request
    print('Starting!')
    for i in range(num_pages):
        payload['page'] = str(i)
        data = single_query(link, payload)
        if data is None:
            # single_query returns None (after printing a warning) on a
            # non-200 status; skip the page instead of crashing on indexing.
            continue
        data = data['response']['docs']
        for article in data:
            r = requests.get(article['web_url'])
            html_string = r.content
            soup = BeautifulSoup(html_string, 'html.parser')
            # NOTE(review): '.e2kc3sl0' looks like a generated CSS class for
            # article paragraphs and may change on the site -- confirm.
            text_rows = soup.select('.e2kc3sl0')
            # get_text() collects text from nested tags too; .string is None
            # for such elements, which previously stored the literal 'None'.
            article_text = ''.join('\n' + row.get_text() for row in text_rows)
            print(article_text)
            article['html'] = html_string
            article['full_text'] = article_text
        if data:
            # insert_many rejects an empty document list.
            tab.insert_many(data)
        print('{} pages parsed!'.format(i + 1))
```

```python

import numpy as np 
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import boto3

# Download a CSV from one S3 bucket, compute per-row cancer rates, and upload
# a histogram image plus the rates as CSV to a second bucket.
s3 = boto3.client('s3')

# The return value is discarded here; the call still has to succeed.
s3.list_buckets()

response = s3.list_objects_v2(Bucket='jeremy-its-my-bucket')

# S3 leaves the 'Contents' key out of the response when the listing is empty,
# so fall back to an empty list rather than raising KeyError.
for item in response.get('Contents', []):
    print(item)

df = pd.read_csv('https://s3.amazonaws.com/jeremy-its-my-bucket/cancer.csv')

df.info()

df['rate'] = df['cancer'] / df['population']

# Quick-look histogram; the figure written to disk is the one created below.
df['rate'].hist(bins=20)

plt.figure(figsize=(12, 8))
plt.hist(df['rate'], bins=20, color='r')
plt.title('Rates of Cancer')
plt.savefig(fname="rate_hist.png")
df['rate'].to_csv('cancer_rates.csv')

s3.upload_file(Filename='rate_hist.png', Bucket='jeremys-next-bucket', Key='rate_hist.png')

s3.upload_file(Filename='cancer_rates.csv', Bucket='jeremys-next-bucket', Key='cancer_rates.csv')

```

```python

import numpy as np
import scipy.stats as scs
import matplotlib.pyplot as plt
%matplotlib inline

def plot_pmf(dist, x):
    """Draw the probability mass function of ``dist`` as a stem-style plot.

    Args:
        dist: Object exposing a ``pmf`` method (e.g. a frozen scipy
            discrete distribution).
        x: Points at which the pmf is evaluated and plotted.
    """
    mass = dist.pmf(x)
    plt.scatter(x, mass)
    # Vertical lines from the axis up to each marker.
    plt.vlines(x, 0, mass)
    plt.ylim(bottom=0)
    plt.ylabel('P(x)')
    plt.xlabel('x')

def plot_pdf(dist, x):
    """Draw the probability density function of ``dist`` as a line plot.

    Args:
        dist: Object exposing a ``pdf`` method (e.g. a frozen scipy
            continuous distribution).
        x: Points at which the pdf is evaluated and plotted.
    """
    density = dist.pdf(x)
    plt.plot(x, density)
    plt.ylim(bottom=0)
    plt.ylabel('pdf')
    plt.xlabel('x')
    
# --- Poisson -----------------------------------------------------------
# scipy names the rate parameter ``mu``; here it is called lambda.
lam = 2
pois = scs.poisson(mu=lam)
x = np.arange(10)
plot_pmf(pois, x)
plt.title('Poisson distribution, lambda={}'.format(lam))

# Probability of observing zero events.
print(pois.pmf(0))

# --- Binomial ----------------------------------------------------------
n, p = 20, 0.1
binom = scs.binom(n=n, p=p)
x = np.arange(10)
plot_pmf(binom, x)
plt.title('Binomial distribution, n={}, p={}'.format(n, p))
plt.xlabel('x (number of successes)')

# Probability of exactly two successes.
print(binom.pmf(2))

# --- Exponential -------------------------------------------------------
# scipy takes ``scale``, which is the reciprocal of the rate lambda.
lam_e = 0.5
expo = scs.expon(scale=1 / lam_e)
x = np.linspace(0, 10, 1000)
plot_pdf(expo, x)
plt.title('Exponential distribution, lambda={}'.format(lam_e))

# Probability of a value greater than 3.
print(1 - expo.cdf(3))

# --- Geometric ---------------------------------------------------------
p = 0.08
geom = scs.geom(p=p)
x = np.arange(40)
plot_pmf(geom, x)
plt.title('Geometric distribution, p={}'.format(p))
# Mean of the geometric distribution is 1/p.
expected_value = 1 / p
print(expected_value)

```

```python
from scipy.stats.kde import gaussian_kde
from scipy.stats import norm
from numpy import linspace,hstack
from pylab import plot,show,hist

# Fit one kernel density estimate per income group. Each gaussian_kde object
# is callable: calling it on an array returns the estimated density there.
# (A histogram of the combined sample, built with hstack + hist, was tried
# and left disabled.)
min_pdf = gaussian_kde(low_income)
med_pdf = gaussian_kde(medium_income)
high_pdf = gaussian_kde(high_income)

# Overlay the three density curves on a single figure.
plt.figure(figsize=(12, 8))
plt.xlabel('GPA')
plt.ylabel('Probability Density')
x = linspace(1.5, 4, 1000)
for density, fmt, name in (
    (min_pdf, 'r', 'Low Income'),
    (med_pdf, 'b', 'Med Income'),
    (high_pdf, 'g', 'High Income'),
):
    plot(x, density(x), fmt, label=name)
plt.legend()
```