In [1]:
import os
from enum import IntEnum
from pathlib import Path

import pandas as pd

# Load Data

In [2]:
# Resolve the data directory relative to the kernel's working directory
# (two levels up, then data/think-bayes) and create it on first use.
cwd = Path(os.getcwd())
data_dir = cwd.parents[1] / "data/think-bayes"
if not data_dir.exists():
    data_dir.mkdir(parents=True, exist_ok=True)
data_dir

PosixPath('/home/nlibertini/Repositories/bayesian-stats/data/think-bayes')

## Download and save

In [None]:
# Download the GSS extract once and cache it under data_dir, so that a full
# "Restart & Run All" does not hit the network (or overwrite the file) again.
# Delete the local file to force a fresh download.
gss_csv_path = data_dir.joinpath("gss_bayes.csv")
if not gss_csv_path.exists():
    (
        pd
        .read_csv("https://github.com/AllenDowney/ThinkBayes2/raw/master/data/gss_bayes.csv")
        # The row index is written as the first CSV column; the load cell
        # reads it back with index_col=0.
        .to_csv(gss_csv_path)
    )

## Load from local dir

In [3]:
# Load the locally cached survey extract; index_col=0 uses the first CSV
# column as the row index.
gss_csv = data_dir / "gss_bayes.csv"
sample_population = pd.read_csv(gss_csv, index_col=0)
sample_population.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49290 entries, 0 to 49289
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   caseid    49290 non-null  int64  
 1   year      49290 non-null  int64  
 2   age       49290 non-null  float64
 3   sex       49290 non-null  int64  
 4   polviews  49290 non-null  float64
 5   partyid   49290 non-null  float64
 6   indus10   49290 non-null  float64
dtypes: float64(4), int64(3)
memory usage: 3.0 MB


## Column Encodings

In [4]:
class Sex(IntEnum):
    """Integer codes compared against the `sex` column of the survey data."""

    Male = 1    # code 1
    Female = 2  # code 2

class PolViews(IntEnum):
    """Integer codes compared against the `polviews` column of the survey data.

    The codes form a 7-point scale running from most liberal (1) to most
    conservative (7), so range comparisons such as
    `polviews <= PolViews.SlightlyLiberal` select every liberal-leaning level.
    """

    # Liberal side of the scale.
    ExtremelyLiberal = 1
    Liberal = 2
    SlightlyLiberal = 3
    # Midpoint.
    Moderate = 4
    # Conservative side of the scale.
    SlightlyConservative = 5
    Conservative = 6
    ExtremelyConservative = 7

class Party(IntEnum):
    """Integer codes compared against the `partyid` column of the survey data.

    Codes 0-6 run from StrongDemocrat to StrongRepublican.
    NOTE(review): code 7 (OtherParty) presumably sits outside that ordering,
    so range comparisons on this enum should be used with care - confirm
    against the dataset's codebook.
    """

    # Democratic side.
    StrongDemocrat = 0
    NotStrongDemocrat = 1
    IndependentNearDemocrat = 2
    # Unaffiliated.
    Independent = 3
    # Republican side.
    IndependentNearRepublican = 4
    NotStrongRepublican = 5
    StrongRepublican = 6
    # Anything else.
    OtherParty = 7

# What is Probability?
- From a Bayesian perspective, probability is a measure of one's uncertainty or confidence about a particular inference question.
    - *How likely is it to rain tomorrow?*
    - *What's the chance I left the stove on when I rushed out the door to work?*
    - *What is our confidence that a particular candidate will win an election?*
    - *Given we observed three heads in a row, what is the chance the coin being flipped has heads on both sides?*
- One's uncertainty can encompass:
    - Observed frequencies of similar events.
    - Expert insight into the factors that influence the question at hand.
    - All other types of prior knowledge. 
- It is not limited to the narrow definition of *probability = long run frequency*.
- This definition of probability reveals that **all probabilities are conditional probabilities.**
    - This is because all probability questions are conditioned on what we already know about the question: `P(X | what we know)`

# Conditional Probability
- A conditional probability is the uncertainty we have in an inference question given what we already know.
- It could be our prior probability before looking at any data: `P(female | prior knowledge) ~ 50%`.
- Or it could be the probability conditioned on some new data: 

What is the probability a respondent is a `StrongDemocrat`, given we know they are a `Female`?

In [5]:
# Condition on sex first: keep only the female respondents, then measure the
# share of that subset whose party code is StrongDemocrat.
females = sample_population.loc[sample_population.sex.eq(Sex.Female)]
p = females.partyid.eq(Party.StrongDemocrat).mean()

print(f"P(StrongDemocrat|female) = {p:.2%}")

P(StrongDemocrat|female) = 17.47%


## Conditional Probability Is Not Commutative
- The order of conditioning matters: in general, `P(A|B) != P(B|A)`.
- The subset of respondents who are female is different from the subset who are strong Democrats, so the two conditional probabilities are computed over different groups.

What is the probability that a respondent is a `Female`, given they are a `StrongDemocrat`?

In [6]:
# Reverse the conditioning: keep only StrongDemocrat respondents, then measure
# the share of that subset who are female.
strong_dems = sample_population.loc[lambda df: df.partyid == Party.StrongDemocrat]
p = strong_dems.sex.eq(Sex.Female).mean()

print(f"P(female|StrongDemocrat) = {p:.2%}")

P(female|StrongDemocrat) = 58.75%


## Condition and Conjunction
- We can condition on more than one variable at a time.
- The multiple conditioning variables are combined via conjunction.

What is the probability that a respondent is female, given that they are a `StrongDemocrat` with `liberal` political views?

In [7]:
# Build one boolean mask per condition, then combine them with AND:
# "liberal" here means any polviews code up to and including SlightlyLiberal.
is_liberal = sample_population.polviews.le(PolViews.SlightlyLiberal)
is_strong_dem = sample_population.partyid.eq(Party.StrongDemocrat)
liberal_strong_dems = sample_population.loc[is_liberal & is_strong_dem]
p = liberal_strong_dems.sex.eq(Sex.Female).mean()

# Show the subset size and the unconditioned P(female) next to the
# conditional value for comparison.
print(f"{len(liberal_strong_dems):,} liberal + StrongDemocrat samples")
print(f"P(female) = {sample_population.sex.eq(Sex.Female).mean():.2%}")
print(f"P(female|liberal, StrongDemocrat) = {p:.2%}")

3,599 liberal + StrongDemocrat samples
P(female) = 53.79%
P(female|liberal, StrongDemocrat) = 58.54%


# Joint Probability
- Joint probability is the probability of two (or more) events happening together or being observed together
- e.g. Probability of a person being a Female AND being a Democrat
- The AND operation means we are combining two probabilities via conjunction
    - The calculation of the joint probability is sometimes called the "product rule" or "chain rule" of probability.
- For independent events: `P(A,B) = P(A) * P(B)`
- For dependent events/variables (e.g. with observation overlap in a dataset), and in general: `P(A,B) = P(A) * P(B | A)`. Independence is the special case where `P(B | A) = P(B)`.
- Conjunction is commutative (i.e. order doesn't matter); `P(A,B) = P(B,A)` 

In [8]:
# Joint probability: the share of ALL respondents (not a subset) for whom both
# conditions hold at once.
both_female_and_strong_dem = (
    sample_population.sex.eq(Sex.Female)
    & sample_population.partyid.eq(Party.StrongDemocrat)
)
p = both_female_and_strong_dem.mean()

print(f"P(StrongDemocrat & female) = {p:.2%}")

P(StrongDemocrat & female) = 9.40%


## Conditional probability from joint probability
- The conditional probability is the joint probability normalized by the (marginal) probability of one (or more) variables in the joint distribution
- `P(A|B) = P(A,B) / P(B)`

In [10]:
# Boolean masks over the full sample, reused by the steps below.
female_mask = sample_population.sex.eq(Sex.Female)
strong_dem_mask = sample_population.partyid.eq(Party.StrongDemocrat)

# Marginal P(StrongDemocrat): the normalizing constant.
p_strong_dem = strong_dem_mask.mean()
print(f"P(StrongDemocrat) = {p_strong_dem:.2%}")

# Joint P(female, StrongDemocrat): both masks true at once.
p_female_and_strong_dem = (female_mask & strong_dem_mask).mean()
print(f"P(female, StrongDemocrat) = {p_female_and_strong_dem:.2%}")

# Conditional P(female | StrongDemocrat) = joint / marginal.
p_female_given_strong_dem = p_female_and_strong_dem / p_strong_dem
print(f"P(female|StrongDemocrat) = {p_female_given_strong_dem:.2%}")

P(StrongDemocrat) = 16.00%
P(female, StrongDemocrat) = 9.40%
P(female|StrongDemocrat) = 58.75%


## Joint probability from conditional
- The probability of two events occurring together can be reframed as probability of the first event, multiplied by the probability of the second event GIVEN the first event has occurred.
- `P(A,B) = P(B) * P(A|B)`

In [11]:
# Marginal P(StrongDemocrat), measured over the full sample.
p_strong_dem = sample_population.partyid.eq(Party.StrongDemocrat).mean()
print(f"P(StrongDemocrat) = {p_strong_dem:.2%}")

# Conditional P(female | StrongDemocrat), measured within the StrongDemocrat subset.
strong_dems = sample_population.loc[sample_population.partyid.eq(Party.StrongDemocrat)]
p_female_given_strong_dem = strong_dems.sex.eq(Sex.Female).mean()
print(f"P(female | StrongDemocrat) = {p_female_given_strong_dem:.2%}")

# Joint P(female, StrongDemocrat) = marginal * conditional (product rule).
p_female_and_strong_dem = p_strong_dem * p_female_given_strong_dem
print(f"P(female, StrongDemocrat) = {p_female_and_strong_dem:.2%}")

P(StrongDemocrat) = 16.00%
P(female | StrongDemocrat) = 58.75%
P(female, StrongDemocrat) = 9.40%


# Marginal Probability
A marginal probability is the probability of one of the variables, e.g. `P(female)`, in a joint probability distribution, e.g. `P(female, banker, Democrat)`, with the other variables removed (marginalized away).
- The marginal variables (e.g. `female`) are the subset of variables being retained for the probability calculation.
- They are called "marginal" because they're calculated by summing values in a table along rows or columns, and writing the sum in the margins of the table.
- The discarded variables (e.g. `banker, Democrat`) are said to have been marginalized out.

```
                            Male  Female  | PartyTotal
==========================================|===========
StrongDemocrat              3252    4632  |       7884
NotStrongDemocrat           4184    5985  |      10169
IndependentNearDemocrat     3064    3113  |       6177
Independent                 3264    3665  |       6929
IndependentNearRepublican   2491    2071  |       4562
NotStrongRepublican         3678    4258  |       7936
StrongRepublican            2438    2491  |       4929
OtherParty                   408     296  |        704
------------------------------------------------------
SexTotal                   22779   26511  |      49290
```
- Marginal probability of `Female` = `26511 / 49290` = `53.79%`
- Marginal distribution of `Sex` variable: `[22779, 26511]`
- Marginal probability of `StrongDemocrat` = `7884 / 49290` = `16.00%`
- Marginal distribution of `PoliticalParty`:
```
StrongDemocrat                7884
NotStrongDemocrat            10169
IndependentNearDemocrat       6177
Independent                   6929
IndependentNearRepublican     4562
NotStrongRepublican           7936
StrongRepublican              4929
OtherParty                     704
```

In [12]:
# Marginal P(female): ignore every other column and average the sex indicator.
p = sample_population.sex.eq(Sex.Female).mean()

print(f"P(female) = {p:.2%}")

P(female) = 53.79%


In [13]:
# Marginal distribution of `sex`: relative frequency of each code.
sex_p_dist = sample_population["sex"].value_counts(normalize=True)
sex_p_dist

sex
2    0.537858
1    0.462142
Name: proportion, dtype: float64

In [14]:
# Marginal P(StrongDemocrat): ignore every other column and average the party indicator.
p = sample_population.partyid.eq(Party.StrongDemocrat).mean()

print(f"P(StrongDemocrat) = {p:.2%}")

P(StrongDemocrat) = 16.00%


# Bayes's Theorem
- Joint probabilities are commutative: `P(A,B) = P(B,A)`
- Joint probabilities are related to conditional probabilities (which are NOT commutative):
    - `P(A,B) = P(A) * P(B|A)`
    - `P(B,A) = P(B) * P(A|B)`
- Thus, `P(A) * P(B|A) = P(B) * P(A|B)`
- Bayes's Theorem: `P(A|B) = P(A) * P(B|A) / P(B)`

In [15]:
# Prior: marginal P(female) over the full sample.
p_female = sample_population.sex.eq(Sex.Female).mean()
print(f"P(female) = {p_female:.2%}")

# Likelihood: P(StrongDemocrat | female), measured within the female subset.
females = sample_population.loc[sample_population.sex.eq(Sex.Female)]
p_strong_dem_given_female = females.partyid.eq(Party.StrongDemocrat).mean()
print(f"P(StrongDemocrat|female) = {p_strong_dem_given_female:.2%}")

# Normalizing constant: marginal P(StrongDemocrat) over the full sample.
p_strong_dem = sample_population.partyid.eq(Party.StrongDemocrat).mean()
print(f"P(StrongDemocrat) = {p_strong_dem:.2%}")

# Posterior via Bayes's theorem: P(female | StrongDemocrat) = prior * likelihood / normalizer.
p_female_given_strong_dem = p_female * p_strong_dem_given_female / p_strong_dem
print(f"P(female | StrongDemocrat) = {p_female_given_strong_dem:.2%}")

P(female) = 53.79%
P(StrongDemocrat|female) = 17.47%
P(StrongDemocrat) = 16.00%
P(female | StrongDemocrat) = 58.75%
