# Przygotowanie danych do łańcuchów markowa

Podział danych na zdania.

In [291]:
import numpy as np
f = open('data/tusk2011.txt')
expose = ''.join(f.readlines())

In [292]:
chars_to_remove = ['+', ',', '-', ':', ';', '?', '–', '”', '„', '…', '\n', '\n\n', '  ']
for char in chars_to_remove: 
    expose = expose.replace(char, ' ')
expose = expose.lower()

In [293]:
sentences = expose.split('. ')
letters = expose.split()

In [294]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(sentences, test_size=0.3, random_state=42)

In [295]:
len(X_train)

191

In [296]:
len(X_test)

82

In [297]:
X_train[0]

'tak jest'

In [298]:
states = np.unique(list(expose))
states

array([' ', '(', ')', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7',
       '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
       'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
       'z', 'é', 'ó', 'ą', 'ć', 'ę', 'ł', 'ń', 'ś', 'ź', 'ż'], dtype='<U1')

In [299]:
A = np.zeros((states.shape[0], states.shape[0]))
pi = np.zeros(states.shape[0])

In [300]:
for sentence in X_train:
    characters = list(sentence)
    pi[np.where(states == characters[0])[0][0]] += 1
    for i in range(1, len(characters)):
        prev_state = np.where(states == characters[i-1])[0][0]
        curr_state = np.where(states == characters[i])[0][0]
        A[prev_state, curr_state] += 1

In [301]:
result = np.where(states == 'm')[0][0]
result

27

In [302]:
pi = pi/np.sum(pi)

In [303]:
A = A/np.sum(A)

Poprawka LaPlace'a

In [304]:
A[A<1e-5] = 1e-5

In [305]:
A

array([[8.62036655e-04, 1.00000000e-05, 1.00000000e-05, ...,
        3.74798546e-04, 3.74798546e-05, 3.26074735e-03],
       [1.00000000e-05, 1.00000000e-05, 1.00000000e-05, ...,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05],
       [1.00000000e-05, 1.00000000e-05, 1.00000000e-05, ...,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05],
       ...,
       [3.74798546e-04, 1.00000000e-05, 1.00000000e-05, ...,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05],
       [3.74798546e-05, 1.00000000e-05, 1.00000000e-05, ...,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05],
       [1.27431506e-03, 1.00000000e-05, 1.00000000e-05, ...,
        1.00000000e-05, 1.00000000e-05, 1.00000000e-05]])

## Testowanie

In [306]:
probas = []
for sentence in X_test:
    characters = list(sentence)
    proba = pi[np.where(states == characters[0])[0][0]]
    for i in range(1, len(characters)):
        prev_state = np.where(states == characters[i-1])[0][0]
        curr_state = np.where(states == characters[i])[0][0]
        proba = proba * A[prev_state, curr_state]
    probas.append(proba)

In [307]:
probas

[0.0,
 1.2358889216085613e-101,
 5.47503761e-315,
 1.496410565846503e-75,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 2.551005231739956e-146,
 3.561982018433196e-164,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 2.55040998194224e-216,
 0.0,
 1.3504251971534633e-273,
 0.0,
 0.0,
 1.6068623418408e-42,
 0.0,
 4.6666149322530376e-271,
 1.668741932485891e-196,
 2.572619958835774e-146,
 6.576127603181665e-110,
 0.0,
 1.1655089560402666e-301,
 0.0,
 1.0782971574702307e-187,
 0.0,
 2.335381608711149e-178,
 1.0471338056005824e-262,
 3.2793867447927865e-157,
 0.0,
 5.049116959167234e-202,
 0.0,
 0.0,
 0.0,
 1.023545443890706e-270,
 1.0621361697113633e-278,
 0.0,
 0.0,
 0.0,
 1.5627150992309638e-209,
 8.192603374833674e-199,
 9.384968666137921e-214,
 1.3593440881177248e-218,
 8.604423360363529e-270,
 0.0,
 1.565928491973768e-102,
 2.659122645145195e-35,
 0.0,
 2.3417050546720424e-223,
 0.0,
 9.844511288895797e-141,
 4.807729480310887e-309,
 0.0,
 1.3395299903911193e-238,
 0.0,
 0.0,
 1.694959000336698e-138,
 0.

## Przekształcenie danych wejściowych

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split

flatten = lambda t: [item for sublist in t for item in sublist]

def preprocess_data(path):
    f = open(path)
    expose = ''.join(f.readlines())

    chars_to_remove = ['+', ',', '-', ':', ';', '?', '–', '”', '„', '…', '\n', '\n\n', '  ', '%', '(', ')', '\xad', '/']
    for char in chars_to_remove: 
        expose = expose.replace(char, ' ')
    expose = expose.lower()

    sentences = expose.split('. ')
    words_in_sentences = [sentence.split() for sentence in sentences]
    words = flatten(words_in_sentences)
    states = np.unique(words)
    try:
        sentences.remove([])
    except:
        pass
    
    return words_in_sentences, states

In [10]:
X_tusk, states_tusk = preprocess_data('data/tusk2011.txt')
X_szydlo, states_szydlo = preprocess_data('data/szydlo2015.txt')
X_morawiecki, states_morawiecki = preprocess_data('data/morawiecki2019.txt')
X_kopacz, states_kopacz = preprocess_data('data/kopacz2014.txt')

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 8074: character maps to <undefined>

In [310]:
states = list(set(states_szydlo).union(set(states_tusk)).union(set(states_morawiecki)).union(set(states_kopacz)))

In [311]:
X_train_tusk, X_test_tusk = train_test_split(X_tusk, test_size=0.3, random_state=42)
X_train_szydlo, X_test_szydlo = train_test_split(X_szydlo, test_size=0.3, random_state=42)
X_train_morawiecki, X_test_morawiecki = train_test_split(X_morawiecki, test_size=0.3, random_state=42)
X_train_kopacz, X_test_kopacz = train_test_split(X_kopacz, test_size=0.3, random_state=42)
X_train_kopacz.remove([])

In [312]:
class MarkovChain:
    def __init__(self):
        self.A = None
        self.pi = None
    
    def fit(self, sentences, states):
        self.A = np.zeros((len(states), len(states)))
        self.pi = np.zeros(len(states))
        
        for sentence in sentences:
            index_first_word = states.index(sentence[0])
            self.pi[index_first_word] += 1
            for i in range(1, len(sentence)):
                prev_state = states.index(sentence[i-1])
                curr_state = states.index(sentence[i])
                self.A[prev_state, curr_state] += 1
        self.pi = self.pi/np.sum(self.pi)
        self.A = self.A/np.sum(self.A, axis=1, keepdims=True)
        self.A[self.A<1e-5] = 1e-3

    def predict(self, sentences):
        probas = []
        for sentence in sentences:
            index_first_word = states.index(sentence[0])
            proba = self.pi[index_first_word]
            
            for i in range(1, len(sentence)):
                prev_state = states.index(sentence[i-1])
                curr_state = states.index(sentence[i])
                proba = proba * self.A[prev_state, curr_state]
            probas.append(proba)
        return probas

In [313]:
mc_tusk = MarkovChain()
mc_tusk.fit(X_train_tusk, states)

In [314]:
mc_szydlo = MarkovChain()
mc_szydlo.fit(X_train_szydlo, states)

In [315]:
mc_morawiecki = MarkovChain()
mc_morawiecki.fit(X_train_morawiecki, states)

In [316]:
mc_kopacz = MarkovChain()
mc_kopacz.fit(X_train_kopacz, states)

In [317]:
mc_kopacz.predict(X_test_tusk)

[6.003879367027405e-106,
 0.0,
 0.0,
 4.784688995215311e-15,
 0.0,
 0.0,
 0.0,
 0.0,
 1.5067465195695659e-127,
 7.533732597847831e-143,
 6.380318137117326e-73,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 4.253545424744887e-147,
 2.372447902575575e-219,
 5.084685435935506e-30,
 0.0,
 0.0,
 3.001939683513703e-90,
 8.507090849489768e-145,
 7.457538639372076e-14,
 0.0,
 1.8643846598430188e-47,
 0.0,
 4.784688995215312e-33,
 0.0,
 0.0,
 2.1267727123724416e-55,
 0.0,
 0.0,
 0.0,
 0.0,
 1.2007758734054806e-54,
 0.0,
 0.0,
 0.0,
 1.6948951453118365e-99,
 6.0038793670274035e-115,
 0.0,
 2.3923444976076564e-44,
 7.656381764540787e-60,
 1.694895145311836e-72,
 0.0,
 4.253545424744884e-97,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 2.3923444976076557e-20,
 0.0,
 5.3373946849789793e-104,
 1.8715599868877486e-41,
 2.087754154266505e-109,
 0.0,
 3.3897902906236724e-65,
 0.0,
 4.253545424744883e-49,
 0.0,
 2.2413228677812953e-162,
 8.474475726559177e-21,
 0.0,
 3.389790290623674e-132,
 3.389790290623672e-39,
 0.0,
 0.0,

In [318]:
def classify(sentences):
    morawiecki_score = np.mean(mc_morawiecki.predict(sentences))
    tusk_score = np.mean(mc_tusk.predict(sentences))
    szydlo_score = np.mean(mc_szydlo.predict(sentences))
    kopacz_score = np.mean(mc_kopacz.predict(sentences))
    labels = ['morawiecki', 'tusk', 'szydlo', 'kopacz']
    scores = np.array([morawiecki_score, tusk_score, szydlo_score, kopacz_score])
    best = np.argmin(scores)
    return labels[best]

In [319]:
classify(X_test_morawiecki)

'szydlo'

In [320]:
classify(X_test_tusk)

'kopacz'

In [321]:
classify(X_test_szydlo)

'kopacz'

In [322]:
classify(X_test_kopacz)

'szydlo'

In [1]:
import numpy as np

A = np.array([[1, 2, 3], [2, 1, 0], [1, 0, 1]])

array([[0.16666667, 0.33333333, 0.5       ],
       [0.66666667, 0.33333333, 0.        ],
       [0.5       , 0.        , 0.5       ]])