# 21-344 Final Project Code
# Juliette Wong (jnwong)

In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
df = pd.read_excel("Weather_PGH.xlsx")
df.head()

Unnamed: 0,Date,Weather,High,Low,Precipitation (in),Wunderground?
0,2019-05-10,Cloudy,58,40,0.0,No
1,2019-05-11,Partly Cloudy,55,35,0.0,No
2,2019-05-12,Mostly Sunny,59,35,0.0,No
3,2019-05-13,Mostly Sunny,63,42,0.0,No
4,2019-05-14,Mostly Cloudy,64,43,0.0,No


In [3]:
df.groupby("Weather").count()

Unnamed: 0_level_0,Date,High,Low,Precipitation (in),Wunderground?
Weather,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cloudy,322,322,322,322,322
Foggy,8,8,8,8,8
Mostly Cloudy,119,119,119,119,119
Mostly Sunny,103,103,103,103,103
Partly Cloudy,71,71,71,71,71
Scattered Showers,55,55,55,55,55
Snow,49,49,49,49,49
Thunderstorm,4,4,4,4,4


## Converting Weather to an Integer Value

In [4]:
def weather_number(weather_name): 
    num = -1
    if weather_name == "Mostly Sunny": 
        num = 0
    elif weather_name == "Partly Cloudy":
        num = 1
    elif weather_name == "Mostly Cloudy":
        num = 2
    elif weather_name == "Cloudy": 
        num = 3
    elif weather_name == "Foggy":
        num = 4
    elif weather_name == "Scattered Showers":
        num = 5
    elif weather_name == "Thunderstorm": 
        num = 6
    elif weather_name == "Snow":
        num = 7
    return num

In [5]:
df["weather_num"] = df["Weather"].apply(weather_number)
df.head()

Unnamed: 0,Date,Weather,High,Low,Precipitation (in),Wunderground?,weather_num
0,2019-05-10,Cloudy,58,40,0.0,No,3
1,2019-05-11,Partly Cloudy,55,35,0.0,No,1
2,2019-05-12,Mostly Sunny,59,35,0.0,No,0
3,2019-05-13,Mostly Sunny,63,42,0.0,No,0
4,2019-05-14,Mostly Cloudy,64,43,0.0,No,2


## Creating 1-Step Transition Matrix

In [42]:
def create_transition_matrix(series): 
    counts = np.zeros((8, 8))
    n = len(series)
    for i in range(n-1):
        current = series[i]
        future = series[i+1]
        counts[current][future] += 1
    transition_matrix = counts/counts.sum(axis=1)[:,None]
    return counts, transition_matrix

In [43]:
count_mat, mat = create_transition_matrix(df["weather_num"])
mat
# Rounding does lead to some error rip
# np.around(mat, decimals=2)
# np.around(mat, decimals=3).sum(axis=1)[:, None] 

array([[0.32038835, 0.15533981, 0.17475728, 0.33009709, 0.        ,
        0.        , 0.00970874, 0.00970874],
       [0.18309859, 0.14084507, 0.21126761, 0.35211268, 0.01408451,
        0.09859155, 0.        , 0.        ],
       [0.1512605 , 0.1512605 , 0.27731092, 0.32773109, 0.        ,
        0.05042017, 0.02521008, 0.01680672],
       [0.07142857, 0.06521739, 0.12732919, 0.53726708, 0.01552795,
        0.10559006, 0.        , 0.07763975],
       [0.25      , 0.        , 0.25      , 0.375     , 0.125     ,
        0.        , 0.        , 0.        ],
       [0.14814815, 0.07407407, 0.09259259, 0.44444444, 0.01851852,
        0.14814815, 0.        , 0.07407407],
       [0.5       , 0.        , 0.25      , 0.25      , 0.        ,
        0.        , 0.        , 0.        ],
       [0.08163265, 0.04081633, 0.08163265, 0.44897959, 0.        ,
        0.        , 0.        , 0.34693878]])

## Creating Multi-Step Transition Matrix

In [30]:
np.linalg.matrix_power(mat, 2)

array([[0.18675035, 0.12000613, 0.1825213 , 0.40186628, 0.00731362,
        0.05898146, 0.00751621, 0.03504466],
       [0.15968582, 0.11050323, 0.17782482, 0.41755132, 0.01103765,
        0.07632404, 0.00710373, 0.03996939],
       [0.16295992, 0.11254185, 0.18936444, 0.40641129, 0.00815313,
        0.07096987, 0.00845958, 0.04113993],
       [0.11832509, 0.08557074, 0.14997733, 0.46453845, 0.01315758,
        0.08522288, 0.00390346, 0.07930448],
       [0.17594793, 0.1011066 , 0.1920155 , 0.4128072 , 0.02144798,
        0.05220132, 0.00872971, 0.03574377],
       [0.1394038 , 0.09043475, 0.14820102, 0.45016235, 0.01300291,
        0.0808484 , 0.0037726 , 0.07417417],
       [0.21586644, 0.13178938, 0.18853867, 0.38129809, 0.00388199,
        0.03900256, 0.01115689, 0.02846599],
       [0.10636686, 0.07421942, 0.13101644, 0.46506259, 0.00754661,
        0.05554786, 0.00285052, 0.15738971]])

In [31]:
np.linalg.matrix_power(mat, 7)

array([[0.14150437, 0.09744159, 0.16315736, 0.43932129, 0.01095422,
        0.07538326, 0.00549897, 0.06673893],
       [0.14140717, 0.09738322, 0.16308778, 0.43941445, 0.01095797,
        0.0753885 , 0.00549182, 0.06686909],
       [0.14143985, 0.09740299, 0.16311101, 0.43938244, 0.01095631,
        0.07538565, 0.00549425, 0.0668275 ],
       [0.14110575, 0.09720353, 0.1628713 , 0.43969857, 0.01096684,
        0.07539747, 0.00546985, 0.06728668],
       [0.14149493, 0.09743596, 0.16315144, 0.43932948, 0.01095444,
        0.07538271, 0.00549841, 0.06675264],
       [0.14117617, 0.0972457 , 0.16292172, 0.43963141, 0.01096428,
        0.0753938 , 0.00547501, 0.06719191],
       [0.14162931, 0.0975166 , 0.16324681, 0.4392015 , 0.01094937,
        0.07537628, 0.00550815, 0.06657198],
       [0.14083562, 0.09704444, 0.16267594, 0.43994451, 0.01096978,
        0.07539089, 0.00545052, 0.0676883 ]])

## Limiting Distribution

In [32]:
P_thousand = np.linalg.matrix_power(mat, 1000)

In [33]:
pi = P_thousand[0]
pi

array([0.14124014, 0.09728391, 0.16296765, 0.43957079, 0.01096225,
       0.07539167, 0.00547969, 0.06710389])

## Showing why Markov Chains work

### Independence Model does not Hold

In [44]:
weather_counts = df.groupby("weather_num").count()["Date"]
weather_prop = weather_counts / weather_counts.sum()

In [25]:
expected = np.zeros((8, 8))
for i in range(8):
    for j in range(8): 
        expected[i][j] = weather_prop[i]*weather_prop[j]*731
np.around(expected)

array([[ 15.,  10.,  17.,  45.,   1.,   8.,   1.,   7.],
       [ 10.,   7.,  12.,  31.,   1.,   5.,   0.,   5.],
       [ 17.,  12.,  19.,  52.,   1.,   9.,   1.,   8.],
       [ 45.,  31.,  52., 142.,   4.,  24.,   2.,  22.],
       [  1.,   1.,   1.,   4.,   0.,   1.,   0.,   1.],
       [  8.,   5.,   9.,  24.,   1.,   4.,   0.,   4.],
       [  1.,   0.,   1.,   2.,   0.,   0.,   0.,   0.],
       [  7.,   5.,   8.,  22.,   1.,   4.,   0.,   3.]])

In [45]:
count_mat

array([[ 33.,  16.,  18.,  34.,   0.,   0.,   1.,   1.],
       [ 13.,  10.,  15.,  25.,   1.,   7.,   0.,   0.],
       [ 18.,  18.,  33.,  39.,   0.,   6.,   3.,   2.],
       [ 23.,  21.,  41., 173.,   5.,  34.,   0.,  25.],
       [  2.,   0.,   2.,   3.,   1.,   0.,   0.,   0.],
       [  8.,   4.,   5.,  24.,   1.,   8.,   0.,   4.],
       [  2.,   0.,   1.,   1.,   0.,   0.,   0.,   0.],
       [  4.,   2.,   4.,  22.,   0.,   0.,   0.,  17.]])

### 1st-Order Markov Chain

In [54]:
def create_counts(series): 
    counts = np.zeros((64, 8))
    n = len(series)
    for i in range(n-2):
        current = series[i]
        next_val = series[i+1]
        future_val = series[i+2]
        counts[8*current + next_val][future_val] += 1
    return counts
actual = create_counts(df["weather_num"])
actual

array([[ 11.,   5.,   4.,  13.,   0.,   0.,   0.,   0.],
       [  2.,   3.,   1.,   7.,   1.,   2.,   0.,   0.],
       [  2.,   0.,   5.,   9.,   0.,   1.,   1.,   0.],
       [  2.,   3.,   8.,  15.,   1.,   4.,   0.,   1.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  6.,   0.,   5.,   2.,   0.,   0.,   0.,   0.],
       [  3.,   1.,   2.,   2.,   0.,   2.,   0.,   0.],
       [  2.,   3.,   4.,   5.,   0.,   0.,   0.,   1.],
       [  4.,   2.,   7.,   8.,   0.,   4.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.],
       [  1.,   0.,   1.,   3.,   0.,   1.,   0.,   1.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  4.,   4.,   5.,   5.,   0.,   0.,   0.,   0.],
       [  4.,   4.,   3.,   6.,

In [52]:
actual_row_sum = np.sum(actual, axis = 1)
expected = np.zeros((64, 8))

for i in range(64):
    for j in range(8):
        expected[i][j] = mat[i % 8][j] * actual_row_sum[i]
np.around(expected)

array([[11.,  5.,  6., 11.,  0.,  0.,  0.,  0.],
       [ 3.,  2.,  3.,  6.,  0.,  2.,  0.,  0.],
       [ 3.,  3.,  5.,  6.,  0.,  1.,  0.,  0.],
       [ 2.,  2.,  4., 18.,  1.,  4.,  0.,  3.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 4.,  2.,  2.,  4.,  0.,  0.,  0.,  0.],
       [ 2.,  1.,  2.,  4.,  0.,  1.,  0.,  0.],
       [ 2.,  2.,  4.,  5.,  0.,  1.,  0.,  0.],
       [ 2.,  2.,  3., 13.,  0.,  3.,  0.,  2.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  3.,  0.,  1.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 6.,  3.,  3.,  6.,  0.,  0.,  0.,  0.],
       [ 3.,  3.,  4.,  6.,  0.,  2.,  0.,  0.],
       [ 5.,  5.,  9., 11.,  0.,  2.,  1.,  1.],
       [ 3.,  3.,  5., 21.,  1.,  4.,  0.,  3.],
       [ 0.,  0.,  0