In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# load a file that uses , as delimiter
def load_file(path, names):
    if not path.is_file():
        raise FileNotFoundError(str(path))
    data = pd.read_csv(path, sep=",", names=names, header=None)
    return data

In [3]:
def load_df():
    cols = ["state", "emission"]
    file = Path.cwd() / "data.txt"
    return load_file(file, cols)

In [4]:
data = load_df()
data

Unnamed: 0,state,emission
0,foggy,no
1,foggy,no
2,foggy,no
3,rainy,yes
4,sunny,no
5,foggy,no
6,rainy,yes
7,rainy,yes
8,foggy,no
9,rainy,yes


In [5]:
def emissionProbability(data):
    matrix = data.copy()
    #make column for sunny yes, foggy yes, rainy yes
    #for sunny, foggy, and rainy put 1 in the row if true
    matrix['sunny-yes'] = np.where((matrix['state'] == 'sunny') & (matrix['emission'] == 'yes'),1,0) 
    matrix['foggy-yes'] = np.where((matrix['state'] == 'foggy') & (matrix['emission'] == 'yes'),1,0)
    matrix['rainy-yes'] = np.where((matrix['state'] == 'rainy') & (matrix['emission'] == 'yes'),1,0)
    matrix['sunny'] = np.where(matrix['state'] == 'sunny',1,0) 
    matrix['foggy'] = np.where(matrix['state'] == 'foggy',1,0) 
    matrix['rainy'] = np.where(matrix['state'] == 'rainy',1,0) 
    #drop state and emission
    matrix = matrix.drop(['state','emission'],axis=1) 
    #adds all the ones in the columns
    matrix = matrix.sum(axis=0)
    #calculate probablities of each state
    SY=matrix[0]/matrix[4]
    SN= 1-SY
    FY=matrix[1]/matrix[3]
    FN=1-FY
    RY=matrix[2]/matrix[5]
    RN=1-RY
    #create a new dataframe of probablities 
    matrixData =[[SY,SN],[FY,FN],[RY,RN]]
    matrix = pd.DataFrame(matrixData,columns=['yes','no'], index=['sunny','foggy','rainy'])
    return matrix
b = emissionProbability(data)
b

Unnamed: 0,yes,no
sunny,0.16092,0.83908
foggy,0.159919,0.840081
rainy,0.812245,0.187755


In [6]:
def stateTransitionProbability(data):
    data['next'] = data['state'].shift(periods=-1)
    b = pd.DataFrame(data=data.groupby(['state', 'next']).size()).reset_index()
    b['subtotal'] = b.groupby('state')[0].transform('sum')
    b['prob'] = b[0]/b['subtotal']
    b = b.drop([0, 'subtotal'], axis=1)
    b = b.pivot(index='state', columns='next', values='prob')
    return b

In [7]:
def seq_to_col(sequence):
    new_seq = []
    for i, val in enumerate(sequence):
        val = f"({i}) " + val
        new_seq.append(val)
    return new_seq

In [8]:
def probability_of_sequence(data, sequence):
    a = stateTransitionProbability(data)
    a = a.reindex(["sunny", "foggy", "rainy"], columns=["sunny", "foggy", "rainy"]) 
    a = a.to_numpy()
    b = emissionProbability(data)
    hidden_states = ["sunny", "foggy", "rainy"]
    hmm = pd.DataFrame(data={"0" : (1.0, 0.0, 0.0)}, index=hidden_states)
    hmm = hmm.join(pd.DataFrame(0., columns = sequence, index = hidden_states))
    for index in range(1, hmm.shape[1]):
        current_obs = hmm.iloc[:, index]
        last_obs = hmm.iloc[:, index-1]
        for i in range(3):
            trans = 0
            for j in range(3):
                trans += (a[j,i] * last_obs[j])
            current_row = hmm.iloc[i]
            prob = trans * b.at[current_row.name, current_obs.name[4:]]
            current_obs[i] = prob
    prob_of_sequence = hmm.iloc[:,-1].sum()
    return prob_of_sequence, hmm, a, b

In [9]:
sequence = ["yes", "no", "yes"]

In [10]:
prob, hmm, a, b = probability_of_sequence(data, seq_to_col(sequence))

In [11]:
prob

0.03517274609773816

In [12]:
hmm

Unnamed: 0,0,(0) yes,(1) no,(2) yes
sunny,1.0,0.127952,0.096847,0.013781
foggy,0.0,0.024004,0.034645,0.005374
rainy,0.0,0.044484,0.007469,0.016018


In [13]:
a

array([[0.79513185, 0.15010142, 0.05476673],
       [0.20689655, 0.50191571, 0.29118774],
       [0.19591837, 0.2244898 , 0.57959184]])

In [14]:
b

Unnamed: 0,yes,no
sunny,0.16092,0.83908
foggy,0.159919,0.840081
rainy,0.812245,0.187755


In [15]:

def findMostLikly(hmm):
    #find max of each column
    i = np.where(hmm == np.amax(hmm))
    i = list(zip(i[1],i[0]))
    i = pd.DataFrame(i)
    
    sorted = i.sort_values(by=[0], inplace = False )
    sorted = sorted.reset_index()
      
    for x in range(0,sorted.shape[0]):
        if(x == 0):
            print( "The most likely sequence is: ")
        elif(sorted[1][x] == 0 ):
            print(x, "- sunny")
        elif(sorted[1][x] == 1):
            print(x,"- foggy")
        else:
            print(x,"- rainy")

findMostLikly(hmm)

      
        



The most likely sequence is: 
1 - sunny
2 - sunny
3 - rainy
