### 1. Sample Space

In [5]:
import pandas as pd

In [6]:
# import data
df = pd.read_csv('./SFBAdrivers.csv')
df # 차를 몇 대 소유하고 있는지, 운전자가 몇 명인지, 전체 가구원이 몇 명인지에 대한 데이터임  

Unnamed: 0,HHVEHCNT,DRVRCNT,HHSIZE
0,2,2,6
1,1,1,1
2,3,4,5
3,2,2,2
4,3,3,3
...,...,...,...
582,2,2,2
583,6,2,2
584,1,2,2
585,3,2,2


In [7]:
df.head()

Unnamed: 0,HHVEHCNT,DRVRCNT,HHSIZE
0,2,2,6
1,1,1,1
2,3,4,5
3,2,2,2
4,3,3,3


In [8]:
# 해당 표본 공간에는 음수가 존재할 수 없음
# a-1. Empirical sample space
min(df.HHVEHCNT), max(df.HHVEHCNT)
# a-2. Theoretical sample space
# HHVEHCNT >= 0

(0, 7)

In [9]:
# b-1. Empirical sample space
min(df.DRVRCNT), max(df.DRVRCNT)
# b-2. Theoretical sample space
# DRVRCNT >= 0, DRVRCNT <= HHSIZE

(0, 5)

In [10]:
# c-1. Empirical sample space
min(df.HHSIZE), max(df.HHSIZE)
# c-2. Theoretical sample space
# HHSIZE >= 1

(1, 8)

`d.` What  is  the  size  of the total  empirical  sample  space,  considering  all three  variables?  

Note 
that  certain  combinations  of  values  are  logically  impossible.  

Big  Sample  space  –  can’t logically have more drivers than household members, but other than that, nothing else is 
logically impossible. 

### 2. Events

In [11]:
# a. Household size is 2
# Event that HHVEHCNT== 2
print('count:', sum(df.HHSIZE == 2))
print('count:', len(df[df.HHSIZE == 2]))

count: 222
count: 222


In [12]:
# b. Number of drivers in the household is 2 and household size is 2.
# Event that (DRVRCNT == 2) & (HHSIZE == 2)
print('count:', len(df[(df.HHSIZE == 2) & (df.DRVRCNT == 2)]))

count: 178


In [13]:
# c. Number of vehicles in the household is 2 and number of household drivers is 2. 
# Event that (HHVEHCNT == 2) & (DRVRCNT == 2)
print('count:', len(df[(df.HHVEHCNT == 2) & (df.DRVRCNT == 2)]))

count: 196


In [14]:
# d. Household size is larger than 5. 
# Event that HHSIZE > 5
print('count:', len(df[(df.HHSIZE > 5)]))

count: 11


In [15]:
# e. Household size is less than 3 and number of household vehicles is 3 or more.
# Event that (HHSIZE < 3) & (HHVEHCNT >= 3)
print('count:', len(df[(df.HHSIZE < 3) & (df.HHVEHCNT >= 3)]))

count: 40


In [16]:
# f. Number of household drivers is 0 OR household vehicles is 0. 
# Event that (DRVRCNT == 3) | (HHVEHCNT == 3)
print('count:', len(df[(df.DRVRCNT == 0) | (df.HHVEHCNT == 0)]))

count: 38


### 3. Set Operations

In [17]:
# Event A: number of persons per household is 2 or fewer
A = df[(df.HHSIZE <= 2)]

# Event Ac: number of persons per household is more than 2
Ac = df[(df.HHSIZE > 2)]

In [18]:
# Event B: household vehicles is 0
B = df[(df.HHVEHCNT == 0)]

# Event Bc: household vehicles is not 0
Bc = df[(df.HHVEHCNT != 0)]

In [28]:
# Find Ac ∩ Bc numerically using Python.
# Create  the  vector  of  observation  numbers  which  correspond  to  Ac∩Bc.  
# Also,  name this vector p. 
# Hint: p = df.query(event_Ac + ' & ' + event_Bc).index

event_A = 'HHSIZE <= 2'
event_Ac = 'HHSIZE > 2'
event_B = 'HHVEHCNT == 0'
event_Bc = 'HHVEHCNT > 0'

# e. Find Ac ∩ Bc numerically using MATLAB
p = df.query(event_Ac + ' & ' + event_Bc)
print(p)

p = p.index
print(len(p))

     HHVEHCNT  DRVRCNT  HHSIZE
0           2        2       6
2           3        4       5
4           3        3       3
7           3        2       4
8           2        2       3
..        ...      ...     ...
571         2        2       4
574         2        2       4
577         3        3       3
578         3        3       3
581         3        3       4

[233 rows x 3 columns]
233


In [29]:
# f. Find A ∪ B numerically using Python
# f.1
r = df.query(event_A + ' | ' + event_B).index
print(len(r))

# f.2 Visually inspect p and r
print(p)
print(r)
# Unique index ... 
# No common elements because the vectors (events) are mutually explusive of one another

354
Int64Index([  0,   2,   4,   7,   8,   9,  11,  12,  13,  15,
            ...
            559, 560, 562, 564, 569, 571, 574, 577, 578, 581],
           dtype='int64', length=233)
Int64Index([  1,   3,   5,   6,  10,  14,  16,  18,  20,  21,
            ...
            573, 575, 576, 579, 580, 582, 583, 584, 585, 586],
           dtype='int64', length=354)


In [30]:
# f.3 Check f.2 with python loop
B = 0
for n in range(0, len(r)-1, 1):
    for m in range(0, len(p)-1, 1):
        if r[n] == p[m]:
            B = B+1
        else: B = B+0

print(B)

0


In [31]:
# g. (A ∪ B)c numerically
len(df.HHVEHCNT) - (len(p)+len(r))

# from g, p and r include the entire sample space

0

### 4. Probability 
`Challenge Question (Extra pts.)`

Define each of the events in (a) through (f) in  Problem 3 as its corresponding letter. 

For  example, event 𝑎 is  number  of  persons  in  household  is  2.  

Let 𝑃(𝐸) be  the  fraction  of  observations  associated with event E.  

In [35]:
# From problem 3
event_A = 'HHSIZE <= 2'
event_B = 'HHSIZE > 2'
event_C = 'HHVEHCNT == 0'
event_D = 'HHVEHCNT > 0'

# a. Write code to evaluate 𝑃(𝑎) , 𝑃(𝑏), 𝑃(𝑐), 𝑃(𝑑)
prob_a = len(df.query(event_A)) / len(df)
prob_b = len(df.query(event_B)) / len(df)
prob_c = len(df.query(event_C)) / len(df)
prob_d = len(df.query(event_D)) / len(df)

print('P(A):', prob_a, 'P(B):', prob_b, 'P(C):', prob_c, 'P(D):', prob_d)

P(A): 0.5945485519591142 P(B): 0.40545144804088584 P(C): 0.06303236797274275 P(D): 0.9369676320272572


In [37]:

# b. For each of the pairs of events drawn from a,b,c, and d—a&b, a&c, b&c 
# —use Python code to demonstrate that:

# b.1 𝑃(𝐸1 ∪ 𝐸2) = 𝑃(𝐸1)+𝑃(𝐸2)−𝑃(𝐸1 ∩𝐸2) 
event_a_in_b = (event_A) + ' & ' + (event_B) 
prob_a_in_b = len(df.query(event_a_in_b)) / len(df)
prob_a_u_b = prob_a + prob_b - prob_a_in_b

print([prob_a_u_b])

# b.2 𝑃(𝐸1 ∩𝐸2) ≥ 𝑃(𝐸1)+𝑃(𝐸2)−1
print(prob_a_in_b >= prob_a + prob_b - 1)

[1.0]
True
