# Jonathan Halverson
# Thursday, August 3, 2017
# Chapter 5 of Bruce and Bruce

In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use('halverson')

### Naive Bayes

Here we try to predict the nature of Wikipedia biographies using only three records per class to try the model.

In [59]:
import re
import requests
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [60]:
def scrape_and_tokenize(person):
     # download and parse the biography
     base_url = 'https://en.wikipedia.org/wiki/'
     r = requests.get(base_url + person)
     soup = BeautifulSoup(r.content, 'lxml')

     # extract the text of each paragraph
     raw_text = ''
     for paragraph in soup.find_all('p'):
          raw_text += paragraph.get_text()
    
     # keep only alphabetical characters and split on whitespace
     letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
     words = letters_only.lower().split()

     # count the words and filter based on count and stopwords, apply stemming
     count = Counter(words)
     porter = PorterStemmer()
     stops = stopwords.words("english")
     words = [porter.stem(word) for word in words if (word not in stops) and (count[word] > 1) and (len(word) > 1)]
     return words

In [61]:
einstein = scrape_and_tokenize('Albert_Einstein')
newton = scrape_and_tokenize('Isaac_Newton')
darwin = scrape_and_tokenize('Charles_Darwin')
spielberg = scrape_and_tokenize('Steven_Spielberg')
allen = scrape_and_tokenize('Woody_Allen')
cameron = scrape_and_tokenize('James_Cameron')
jordan = scrape_and_tokenize('Michael_Jordan')
brady = scrape_and_tokenize('Tom_Brady')
williams = scrape_and_tokenize('Serena_Williams')

In [62]:
einstein[:10]

[u'albert',
 u'einstein',
 u'german',
 u'march',
 u'april',
 u'german',
 u'born',
 u'theoret',
 u'physicist',
 u'einstein']

The idea of Naive Bayes is to calculate $P(Y|X_j)$ based on $P(X_j|Y)$. That is, knowin the probability of all the features being associated with a given class, given the features of a new record what is the most likely class? In this example, we have three records for each type of person. For each class we will compute the probably of each word being found. Then when a new biography is presented we will figure out which class it falls into based on the words in that record.

$P(Y_i|X_1, X_2, ..., X_N)$

In [63]:
from collections import Counter

c = Counter(einstein + newton + darwin + spielberg + allen + cameron + jordan + brady + williams)
c_scientist = Counter(einstein + newton + darwin)
c_filmmaker = Counter(spielberg + allen + cameron)
c_athlete = Counter(jordan + brady + williams)

In [64]:
k = 0.5
p_scientist = {}
for word, count in c_scientist.items():
     p_scientist[word] = (count + k) / float(c[word] + 2 * k)
p_filmmaker = {}
for word, count in c_filmmaker.items():
     p_filmmaker[word] = (count + k) / float(c[word] + 2 * k)
p_athlete = {}
for word, count in c_athlete.items():
     p_athlete[word] = (count + k) / float(c[word] + 2 * k)

In [65]:
p_scientist.items()[:10]

[(u'four', 0.13218390804597702),
 (u'captain', 0.8333333333333334),
 (u'whose', 0.6428571428571429),
 (u'deviat', 0.8333333333333334),
 (u'hermann', 0.875),
 (u'everi', 0.40476190476190477),
 (u'rise', 0.9166666666666666),
 (u'quantiz', 0.9),
 (u'govern', 0.6428571428571429),
 (u'disturb', 0.8333333333333334)]

When we consider new records we must keep in mind that there will be words that did not appear in the training set. These words must be ignored.

In [66]:
kubrick = scrape_and_tokenize('Stanley_Kubrick')
kubrick = [word for word in kubrick if word in c.keys()]

In [73]:
import operator
reduce(operator.mul, np.log(p_scientist.values()))

0.0

For a continuous feature, Gaussian Naive Bayes fit the feature to a Gaussian for each class and uses the pdf to generate the required conditional probabilities.

### Linear discriminant analysis

Find the weights that maximize class separability. The idea is to maximize the ratio of distance between the class centriods over the variance within each class, weighted by the covariance matrix. Bruce & Bruce treat it as a model whereas other sources treat it as a dimensionality reduction technique.