## Data Preparation

In [3]:
# Download the file containing data for seasonal rainfall data 
# in SJ City from 1960-61 to 2002-2003 from LMS. (Filename: ‘SJrainfall.dat’). 
# Load the data into Python with Pandas. 
# The data contains the number of rainy days in season (column 1) 
# and cumulative rainfall in inches (column 2)

import pandas as pd

In [9]:

# load data
df = pd.read_csv('./SJrain.dat', header=None)
print(df.shape)
df.head()

(43, 2)


Unnamed: 0,0,1
0,68,13.87
1,52,17.65
2,63,22.15
3,55,12.32
4,76,22.29


In [10]:
# Generate variables: 
# ‘days’ for number of rainy days in season 
# ‘rain’ for cumulative rainfall (inches)

# assign headers
df.columns = ['days', 'rain']
df.head()

Unnamed: 0,days,rain
0,68,13.87
1,52,17.65
2,63,22.15
3,55,12.32
4,76,22.29


## Problem 01

`E1` = the number of rainy days in SJ in a given `season is > 65 days`

`E2` = amount of cumulative rainfall in SJ in a given `season is > 22in`

We know that if E1 or E2 happens, i.e. `P(E1 U E2)`, we have had a rainy season.

In [14]:
# Write a Python code to compute the following probabilities by use of the data. 
# For each probability, write out in words what it means

# P(E1) : the number of rainy days in SJ in given season > 65 days
E1_cond = 'days > 65'
E1 = df.query(E1_cond).index
Prob_E1 = len(E1)/len(df)
print(Prob_E1)

# P(E2) : amount of cumulative rainfall in SJ in a given season > 22in
E2_cond = 'rain > 22'
E2 = df.query(E2_cond).index
Prob_E2 = len(E2)/len(df)
print(Prob_E2)

# P(E1⋂E2) : 
E1_in_E2_cond = E1_cond + ' & ' + E2_cond
E1_in_E2 = df.query(E1_in_E2_cond).index
Prob_E1_in_E2 = len(E1_in_E2)/len(df)
print(Prob_E1_in_E2)

# P(E1⋃E2) :
E1_u_E2_cond = E1_cond + ' | ' + E2_cond
E1_u_E2 = df.query(E1_u_E2_cond).index
Prob_E1_u_E2 = len(E1_u_E2)/len(df)
print(Prob_E1_u_E2)

# P(E1|E2) :
Prob_E1_given_E2 = Prob_E1_in_E2/Prob_E2
print(Prob_E1_given_E2)

# P(E2|E1) :
Prob_E2_given_E1 = Prob_E1_in_E2/Prob_E1
print(Prob_E2_given_E1)

0.5348837209302325
0.4883720930232558
0.3953488372093023
0.627906976744186
0.8095238095238095
0.7391304347826088


In [60]:
# Are  E1  and  E2 (approximately)  statistically  independent? 
print(Prob_E1_given_E2 == Prob_E1)
print(Prob_E2_given_E1 == Prob_E2)

#  Mutually exclusive?
B = 0
for n in range(0, len(E1)-1, 1):
    for m in range(0, len(E2)-1, 1):
        if E1[n] == E2[m]:
            B = B+1
        else: B = B+0

print(B==0) 

# another answer 
print(E1_in_E2)

False
False
False
Int64Index([4, 6, 8, 12, 13, 17, 21, 22, 23, 25, 32, 34, 37, 38, 39, 41, 42], dtype='int64')


In [21]:
# Verify Bayes’ rule by showing that 𝑃(𝐸2|𝐸1) = 𝑃(𝐸1|𝐸2)∙𝑃(𝐸2) / 𝑃(𝐸1) .
print(Prob_E2_given_E1)
print((Prob_E1_given_E2 * Prob_E2) / Prob_E1)

0.7391304347826088
0.7391304347826088


## Problem 02 

Investigate the proposition that the different seasons represent independent trials with respect to E2.

다른 계절이 E2( rain > 22 )와 관련하여 독립적인 시행을 나타낸다는 명제를 조사하라.

In [24]:
rain_curr = df.rain
dummy = pd.Series([0]) # 1칸 밀기 위한 dummy data
rain_curr = pd.concat([rain_curr, dummy]).reset_index(drop = True) 

In [25]:
rain_prior = df.rain
rain_prior = pd.concat([dummy, rain_prior]).reset_index(drop = True)

In [27]:
rain_both = pd.concat([rain_curr, rain_prior], axis=1)
rain_both.columns = ['current', 'prior']
rain_both.head()

Unnamed: 0,current,prior
0,13.87,0.0
1,17.65,13.87
2,22.15,17.65
3,12.32,22.15
4,22.29,12.32


In [41]:
# Let E2-1 be the event E2 for the prior season. 
# In other words, in the season 2001-2002, E2-1 indicates if seasonal rainfall in 2002-2003 exceeded 22 in.
E2_1 = rain_both.query('prior > 22')
E2_1.head()

Unnamed: 0,current,prior
2,22.15,17.65
4,22.29,12.32
6,29.41,16.32
8,25.09,14.46
12,34.36,11.06


In [69]:
# Estimate  P(E2 | E2-1)  from  the  historical  data  
E2_1_in_E2_cond = 'current > 22 & prior > 22' # 9 elements
Prob_E2_1_in_E2 = len(rain_both.query(E2_1_in_E2_cond)) / (len(rain_both)-1)
Prob_E2 = len(E2_1) / (len(rain_both)-1)
print(Prob_E2, Prob_E2_1_in_E2)

Prob_E2_given_E2_1 = Prob_E2_1_in_E2 / Prob_E2
Prob_E2_given_E2_1 # P(E2)가 0.48로 거의 차이나지 않음 -> 독립 

0.4883720930232558 0.20930232558139536


0.4285714285714286

In [None]:
# and comment on what the result implies about indepence between a given season and the previous one.
# 연도에 관계 없이 계절만이 강수량에 영향을 어느정도 끼친다

## Problem 03

Investigate  the  proposition  that  the  probability  of  having  a  rainy  season  was  different  in  the  1960s-1970s. 

In [70]:
# Let E3 = the event that the season is 1980-81 or before.  
# (Note  that  observations  1-21  inclusive  in  your  vector  correspond  to  the  1960-81 set,  
# and  observations 22-43 inclusive correspond to the 1981-2003 set.)

E3 = df.iloc[0:21] # index location을 통해 접근
Prob_E3 = len(E3)/len(df)
Prob_E3 # 절반 가량의 obserbation

0.4883720930232558

In [72]:
# Calculate P(E1|E3) and P(E2|E3) and comment on this finding.
E1_in_E3_cond = 'days > 65'
E1_in_E3 = E3.query(E1_in_E3_cond).index
Prob_E1_in_E3 = len(E1_in_E3)/len(df)
print(Prob_E1_in_E3)

E2_in_E3_cond = 'rain > 22'
E2_in_E3 = E3.query(E2_in_E3_cond).index
Prob_E2_in_E3 = len(E2_in_E3)/len(df)
print(Prob_E2_in_E3)

# Interprete the result
print(Prob_E1_in_E3/Prob_E3, Prob_E1)
print(Prob_E2_in_E3/Prob_E3, Prob_E2)
# 1981년도 이전의 경우 많은 양의 강수 확률이나 강수 확률 자체가 높지 않았음

0.20930232558139536
0.18604651162790697
0.4285714285714286 0.5348837209302325
0.380952380952381 0.4883720930232558


In [37]:
# Could you also investigate this proposition by calculating P(E3|E2) or P(E3|E1)?  
# P(E3|E2), P(E3|E1)
Prob_E3_given_E2 = Prob_E2_in_E3/Prob_E2
print(Prob_E3_given_E2)

Prob_E3_given_E1 = Prob_E1_in_E3/Prob_E1
print(Prob_E3_given_E1)

# Interprete the result
# 1981년도 이전의 경우 많은 양의 강수 확률이나 강수 확률 자체가 높지 않았음

0.380952380952381
0.39130434782608703
