# Ukryte Modele Markova

## Przygotowanie danych

- [x] zdyskretyzować dane na 3 lub 4 przedziały
- [x] tworzenie modelu Markova - algorytm Viterbiego
- odległość Hamminga po nastrojeniu modelu - wybieramy najbardziej podobną obserwację
- wstępne strojenie macierzy A

In [1]:
import pandas as pd
import numpy as np

# Read the header-less weather log, give the six columns that are kept
# readable names, and discard the remaining positional columns.
df = (
    pd.read_csv('szczecin_sep_2009.txt', header=None)
    .rename(
        columns={
            0: "czas",
            1: "temperatura",
            3: "wilgotnosc",
            4: "cisnienie",
            5: "wiatr",
            11: "condition",
        }
    )
    .drop(columns=[2, 6, 7, 8, 9, 10])
)

In [2]:
# Number of equal-width intervals each continuous reading is split into.
# The notebook's preparation notes consider 3 or 4; change it here only.
N_BINS = 4
# Ordinal labels 1..N_BINS assigned to the intervals, lowest to highest.
BIN_LABELS = list(range(1, N_BINS + 1))

# Replace every numeric weather column with the label of the bin it falls in.
for column in ('temperatura', 'wilgotnosc', 'cisnienie', 'wiatr'):
    df[column] = pd.cut(df[column], bins=N_BINS, labels=BIN_LABELS)

In [3]:
# Preview the first rows of the frame after discretization.
df.head()

Unnamed: 0,czas,temperatura,wilgotnosc,cisnienie,wiatr,condition
0,12:00 AM,2,3,3,1,Clear
1,12:30 AM,2,3,3,1,Clear
2,1:00 AM,2,3,3,1,Clear
3,1:30 AM,2,3,3,1,Clear
4,2:00 AM,2,3,3,1,Clear


In [4]:
# Per-column summary (count / unique / top / freq) of the discretized frame.
df.describe()

Unnamed: 0,czas,temperatura,wilgotnosc,cisnienie,wiatr,condition
count,1438,1438,1438,1438,1438,1438
unique,48,4,4,4,2,14
top,8:30 PM,2,4,4,1,Clear
freq,30,735,709,596,807,807


In [5]:
# Sorted list of the distinct weather-condition labels present in the data.
np.unique(df['condition'].to_numpy())

array(['Clear', 'Fog', 'Light Rain', 'Light Rain Showers',
       'Light Thunderstorms and Rain', 'Mist', 'Mostly Cloudy',
       'Partly Cloudy', 'Rain', 'Rain Showers', 'Scattered Clouds',
       'Shallow Fog', 'Thunderstorms and Rain', 'Unknown'], dtype=object)

In [6]:
# The weather-condition column provides the state labels; the four
# discretized readings form one observation row per time step.
states = df['condition'].tolist()
observations = df[['temperatura', 'wilgotnosc', 'cisnienie', 'wiatr']].to_numpy()

In [7]:
# Collapse each row of four bin labels into a single symbol string
# (temperature, humidity, pressure, wind - in that order), e.g. '2331'.
observations = [''.join(str(label) for label in row[:4]) if len(row) == 4 else f'{row[0]}{row[1]}{row[2]}{row[3]}{row[4]}' for row in observations] if False else [
    f'{temp}{wilg}{cis}{wiatr}' for temp, wilg, cis, wiatr in observations
]

In [8]:
# Peek at the first five state labels.
states[:5]

['Clear', 'Clear', 'Clear', 'Clear', 'Clear']

In [9]:
# Peek at the first five observation symbols.
observations[:5]

['2331', '2331', '2331', '2331', '2331']

## Tworzenie Ukrytego Modelu Markova

In [10]:
import mchmm as hmm

# Aliases used by the rest of the notebook for the full-length sequences.
obs_seq, sts_seq = observations, states

# Build the hidden Markov model from the paired observation / state sequences.
model_builder = hmm.HiddenMarkovModel()
a = model_builder.from_seq(obs_seq, sts_seq)

In [11]:
# State labels held by the model.
a.states

array(['Clear', 'Fog', 'Light Rain', 'Light Rain Showers',
       'Light Thunderstorms and Rain', 'Mist', 'Mostly Cloudy',
       'Partly Cloudy', 'Rain', 'Rain Showers', 'Scattered Clouds',
       'Shallow Fog', 'Thunderstorms and Rain', 'Unknown'], dtype='<U28')

In [12]:
# Distinct observation symbols held by the model.
a.observations

array(['1321', '1421', '1431', '1434', '1441', '1444', '2221', '2231',
       '2234', '2241', '2311', '2314', '2321', '2324', '2331', '2334',
       '2341', '2344', '2411', '2414', '2421', '2424', '2431', '2434',
       '2441', '2444', '3111', '3121', '3131', '3134', '3141', '3144',
       '3211', '3214', '3221', '3231', '3234', '3241', '3244', '3311',
       '3314', '3321', '3324', '3331', '3334', '3341', '3344', '3411',
       '3414', '3421', '3424', '3431', '3434', '3444', '4121', '4131',
       '4141', '4221', '4231', '4234', '4241'], dtype='<U4')

In [13]:
import pandas as pd

# Show the model's `ep` matrix as a labelled table:
# one row per state, one column per observation symbol.
pd.DataFrame(
    data=a.ep,
    index=a.states,
    columns=a.observations,
)

Unnamed: 0,1321,1421,1431,1434,1441,1444,2221,2231,2234,2241,...,3431,3434,3444,4121,4131,4141,4221,4231,4234,4241
Clear,0.002478,0.002478,0.04461,0.0,0.039653,0.0,0.001239,0.002478,0.0,0.001239,...,0.001239,0.0,0.0,0.018587,0.002478,0.006196,0.002478,0.004957,0.0,0.01487
Fog,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Light Rain,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Light Rain Showers,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Light Thunderstorms and Rain,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mist,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mostly Cloudy,0.0,0.0,0.0,0.005025,0.0,0.015075,0.0,0.0,0.0,0.0,...,0.0,0.01005,0.125628,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Partly Cloudy,0.0,0.0,0.0,0.0,0.0,0.028302,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009434,0.0
Rain,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rain Showers,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Viterbi - odkrycie ścieżki stanów pogodowych mając daną sekwencję obserwacji

In [14]:
# Run Viterbi over the complete observation sequence.
vs, vsi = a.viterbi(obs_seq)

# Decoded state sequence.
# NOTE(review): the labels are joined with an empty separator, so multi-word
# state names run together in the printed line - consider a delimiter.
print("VI " + "".join(vs))

# Observation sequence that was decoded.
print("NO " + str(obs_seq))

VI ClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearPartly CloudyPartly CloudyPartly CloudyPartly CloudyPartly CloudyClearPartly CloudyPartly CloudyPartly CloudyScattered CloudsScattered CloudsScattered CloudsPartly CloudyPartly CloudyPartly CloudyPartly CloudyPartly CloudyPartly CloudyPartly CloudyPartly CloudyPartly CloudyPartly CloudyPartly CloudyPartly CloudyClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearClearPartly CloudyLight Rain ShowersLight Rain ShowersLight Rain ShowersLight Rain ShowersLight Rain ShowersLight Rain ShowersLight Rain ShowersLight Rain ShowersLight Rain ShowersLight Rain ShowersLight Rain ShowersLight Rain ShowersLight Rain ShowersLight Rain S

## Podział danych na sekwencje uczące i testujące

In [15]:
# from sklearn import preprocessing
# states_encoder = preprocessing.LabelEncoder()
# sts_seq = states_encoder.fit_transform(sts_seq)
# sts_seq

In [16]:
# observations_encoder = preprocessing.LabelEncoder()
# obs_seq = observations_encoder.fit_transform(obs_seq)
# obs_seq

In [17]:
T = 24  # length of a single sequence

In [18]:
# Slice both aligned sequences into consecutive windows of T steps each.
# The last window is shorter when the sequence length is not a multiple of T.
observations_joined = [
    obs_seq[start:start + T]
    for start in range(0, len(obs_seq), T)
]
states_joined = [
    sts_seq[start:start + T]
    for start in range(0, len(sts_seq), T)
]

In [19]:
from sklearn.model_selection import train_test_split

# Split the windows 70 % / 30 % into training and test sets. Both lists go
# through one call so that each state window stays paired with its
# observation window; the fixed seed keeps the split reproducible.
(
    states_train,
    states_test,
    observations_train,
    observations_test,
) = train_test_split(
    states_joined,
    observations_joined,
    test_size=0.3,
    random_state=42,
)

In [20]:
# First training window of state labels.
states_train[0]

['Clear',
 'Clear',
 'Clear',
 'Clear',
 'Clear',
 'Clear',
 'Clear',
 'Shallow Fog',
 'Shallow Fog',
 'Shallow Fog',
 'Shallow Fog',
 'Shallow Fog',
 'Unknown',
 'Shallow Fog',
 'Shallow Fog',
 'Shallow Fog',
 'Shallow Fog',
 'Mostly Cloudy',
 'Mostly Cloudy',
 'Mostly Cloudy',
 'Mostly Cloudy',
 'Mostly Cloudy',
 'Mostly Cloudy',
 'Mostly Cloudy']

In [21]:
# First training window of observation symbols.
observations_train[0]

['2431',
 '2431',
 '2431',
 '2431',
 '2431',
 '2431',
 '2431',
 '1434',
 '1434',
 '1434',
 '1434',
 '1434',
 '2434',
 '2434',
 '2434',
 '2434',
 '2444',
 '2444',
 '2444',
 '2444',
 '3444',
 '3444',
 '3344',
 '3344']

## Strojenie modelu Markova za pomocą reestymacji Bauma-Welcha

In [22]:
# Fit a separate model with Baum-Welch on the first training window only,
# passing just the state labels of model `a`.
b = hmm.HiddenMarkovModel().from_baum_welch(
    observations_train[0],
    states=a.states,
)

In [23]:
# Decode the first training window with Viterbi and display the state path.
# NOTE(review): this decodes with model `a`, not the Baum-Welch model `b`
# created in the previous cell - confirm which model was intended.
vs, vsi = a.viterbi(observations_train[0])
vs

array(['Clear', 'Clear', 'Clear', 'Clear', 'Clear', 'Clear', 'Clear',
       'Clear', 'Clear', 'Clear', 'Clear', 'Clear', 'Clear', 'Clear',
       'Clear', 'Shallow Fog', 'Shallow Fog', 'Shallow Fog',
       'Shallow Fog', 'Shallow Fog', 'Shallow Fog', 'Clear', 'Clear',
       'Clear'], dtype='<U28')

In [24]:
# Recorded states of the same window, for comparison with the Viterbi path.
states_train[0]

['Clear',
 'Clear',
 'Clear',
 'Clear',
 'Clear',
 'Clear',
 'Clear',
 'Shallow Fog',
 'Shallow Fog',
 'Shallow Fog',
 'Shallow Fog',
 'Shallow Fog',
 'Unknown',
 'Shallow Fog',
 'Shallow Fog',
 'Shallow Fog',
 'Shallow Fog',
 'Mostly Cloudy',
 'Mostly Cloudy',
 'Mostly Cloudy',
 'Mostly Cloudy',
 'Mostly Cloudy',
 'Mostly Cloudy',
 'Mostly Cloudy']

In [25]:
# Number of training windows.
len(observations_train)

42

In [None]:
import time

# Time one Baum-Welch run on the first training window. The current
# parameters of model `a` are passed in (presumably as the starting point -
# confirm against the mchmm documentation). The fitted model is discarded;
# only the elapsed time is of interest here.
t1 = time.time()
hmm.HiddenMarkovModel().from_baum_welch(observations_train[0], states=a.states, obs=a.observations, tp=a.tp, ep=a.ep, pi=list(a.pi))

# Elapsed wall-clock seconds. `time` is the module, so the clock has to be
# read with time.time(); a bare `time()` call raises TypeError.
time.time() - t1

  ksi[i] /= ksi[i].sum()
  ].sum(axis=0) / gamma.sum(axis=0)


In [None]:
import time

t1 = time.time()

# Re-estimate the model on the first two training windows in turn. Each pass
# feeds the current parameters of `a` into Baum-Welch and rebinds `a` to the
# result, so re-running this cell continues from the already updated model.
for i in range(2):
    a = hmm.HiddenMarkovModel().from_baum_welch(
        observations_train[i],
        states=a.states,
        obs=a.observations,
        tp=a.tp,
        ep=a.ep,
        pi=list(a.pi),
    )

# Elapsed wall-clock seconds for both passes.
time.time() - t1