In [None]:
# Download the two poetry corpora (one text file per author).
# -nc (no-clobber) skips the download when the file is already present locally.
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

--2021-09-23 15:49:29--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26622 (26K) [text/plain]
Saving to: ‘edgar_allan_poe.txt’


2021-09-23 15:49:29 (17.6 MB/s) - ‘edgar_allan_poe.txt’ saved [26622/26622]

--2021-09-23 15:49:29--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56286 (55K) [text/plain]
Saving 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split 

In [None]:
# Corpus files, one per author; the position in this list becomes the class label.
input_files = ["edgar_allan_poe.txt", "robert_frost.txt"]


In [None]:
# Build the list of poem lines and their labels (Poe: 0, Frost: 1).
labels = []
input_text = []

# Translation table that deletes every punctuation character in string.punctuation.
# It never changes, so build it once instead of once per line.
punctuation_table = str.maketrans("", "", string.punctuation)

for label, f in enumerate(input_files):
  print(f"{f} corresponds to {label}")
  # The context manager closes the file handle deterministically; the explicit
  # encoding keeps decoding independent of the platform default.
  with open(f, encoding="utf-8") as corpus:
    for line in corpus:
      line = line.rstrip().lower()
      if line:  # skip blank lines
        line = line.translate(punctuation_table)  # remove punctuation
        input_text.append(line)
        labels.append(label)





edgar_allan_poe.txt corresponds to 0
robert_frost.txt corresponds to 1


In [None]:
# Fix the seed so the split, and every index and score derived from it in the
# cells below, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(input_text, labels, random_state=42)

In [None]:
# Number of lines in the training and test splits.
len(y_train), len(y_test)

(1615, 539)

In [None]:
# Peek at the first five training lines and their labels (0 = Poe, 1 = Frost).
X_train[:5], y_train[:5]

(['but lo a stir is in the air',
  'that much to sell theyre worth as much to keep',
  'nor the ghoulhaunted woodland of weir',
  'while a bolder note than this might swell',
  'between the woods and frozen lake'],
 [0, 1, 0, 0, 1])

In [None]:
# Build the vocabulary: map every token seen in the training lines to a unique
# integer index. Index 0 is reserved for the "<unk>" placeholder.

word2idx = {"<unk>":0}
idx = 1  # next free index

for text in X_train:
  for token in text.split():
    if token in word2idx:
      continue
    word2idx[token] = idx
    idx += 1

word2idx["tell"]

252

In [None]:
# Convert the text to integers: every word becomes its vocabulary index.

# Training tokens are all in the vocabulary, so a direct lookup is enough.
X_train_int = [[word2idx[token] for token in text.split()] for text in X_train]

# Test lines may contain words never seen in training; .get(token, 0) maps
# those to index 0, the "<unk>" entry.
X_test_int = [[word2idx.get(token, 0) for token in text.split()] for text in X_test]

X_train_int[100:105]  # one inner list per line, one integer per word

[[31, 11, 379, 380, 381],
 [1, 47, 358, 218, 11, 382, 20, 7, 383],
 [384, 385, 386, 385, 387, 388],
 [354, 76, 3, 221, 7, 389, 114, 179, 390, 391],
 [31, 392, 35, 94, 58, 393, 394]]

In [None]:
# Set up two Markov models, one per author (suffix 0 = Poe, suffix 1 = Frost).

V = len(word2idx)  # vocabulary size

# A* will hold word-to-word transition counts and pi* first-word counts.
# Every entry starts at 1 rather than 0 so that no probability is exactly zero
# once the counts are normalised and passed through the logarithm.
A0, pi0 = np.ones((V, V)), np.ones(V)
A1, pi1 = np.ones((V, V)), np.ones(V)



In [None]:
# compute counts for A and pi
def compute_counts(text_as_int, A, pi):
  """Accumulate first-word and transition counts into `pi` and `A` in place.

  Args:
    text_as_int: iterable of sequences of integer word indices.
    A: 2-D array; A[i, j] is incremented for every adjacent pair (i, j).
    pi: 1-D array; pi[i] is incremented when i opens a sequence.

  Returns:
    None. `A` and `pi` are modified in place; empty sequences add nothing.
  """
  for sequence in text_as_int:
    previous = None
    for position, current in enumerate(sequence):
      if position == 0:
        # first word of the line: counts towards the initial distribution
        pi[current] += 1
      else:
        # any later word: counts as a transition from the previous word
        A[previous, current] += 1
      previous = current





In [None]:
# Split the encoded training lines by author, then accumulate each author's
# counts into that author's own matrices. compute_counts adds in place, so
# running this cell twice double-counts.
poe_lines = [seq for seq, label in zip(X_train_int, y_train) if label == 0]
frost_lines = [seq for seq, label in zip(X_train_int, y_train) if label == 1]

compute_counts(poe_lines, A0, pi0)
compute_counts(frost_lines, A1, pi1)

In [None]:
# Normalise the counts into probabilities: every row of a transition matrix
# and every initial-word vector is divided by its own sum, in place.
for transitions, initial in ((A0, pi0), (A1, pi1)):
  transitions /= transitions.sum(axis=1, keepdims=True)
  initial /= initial.sum()

# Take logs of the probabilities; the classifier adds these log-probabilities.
logA0, logpi0 = np.log(A0), np.log(pi0)
logA1, logpi1 = np.log(A1), np.log(pi1)


In [None]:
# Log priors: the share of training lines that belongs to each author.

total = len(y_train)
count0 = sum(1 for y in y_train if y == 0)
count1 = sum(1 for y in y_train if y == 1)

p0 = count0 / total  # prior probability that a line is by Poe
p1 = count1 / total  # prior probability that a line is by Frost

logp0, logp1 = np.log(p0), np.log(p1)

p0,p1

(0.33869969040247677, 0.6613003095975232)

In [None]:
class Classifier:
  """Picks the class whose Markov model plus prior scores a sequence highest.

  Attributes:
    logAs: per-class log transition matrices, indexed [class][prev, next].
    logpis: per-class log initial-word vectors, indexed [class][word].
    logpriors: per-class log prior probabilities.
    K: number of classes, taken from len(logpriors).
  """

  def __init__(self, logAs, logpis, logpriors):
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors) # number of classes

  def _compute_log_likelihood(self, input_, class_):
    """Return the log-likelihood of the index sequence `input_` under `class_`.

    The first token is scored with the initial-word vector and every later
    token with the transition from its predecessor. An empty sequence gives 0.
    """
    logA = self.logAs[class_]
    logpi = self.logpis[class_]

    logprob = 0
    previous = None
    for position, current in enumerate(input_):
      # first token uses the initial distribution, later ones the transition
      logprob += logpi[current] if position == 0 else logA[previous, current]
      previous = current

    return logprob

  def predict(self, inputs):
    """Return a float array with the predicted class index for each sequence."""
    predictions = np.zeros(len(inputs))
    for i, sequence in enumerate(inputs):
      # unnormalised log posterior per class: log-likelihood + log prior
      scores = []
      for c in range(self.K):
        scores.append(self._compute_log_likelihood(sequence, c) + self.logpriors[c])
      predictions[i] = np.argmax(scores)
    return predictions

In [None]:
# Each list is ordered by class label: position 0 = Poe, position 1 = Frost.
clf = Classifier(
  logAs=[logA0, logA1],
  logpis=[logpi0, logpi1],
  logpriors=[logp0, logp1],
)


In [None]:
# Accuracy on the lines the model was fitted on.
Ptrain = clf.predict(X_train_int)
train_accuracy = np.mean(Ptrain == y_train)
print(f"Train acc: {train_accuracy}")

Train acc: 0.9950464396284829


In [None]:
# Accuracy on the held-out test lines, so the message is labelled "Test".
Pest = clf.predict(X_test_int)
print(f"Test acc: {np.mean(Pest == y_test)}")

Train acc: 0.8051948051948052


In [None]:
from sklearn.metrics import confusion_matrix, f1_score

# Confusion matrix on the training set: rows are the true labels and columns
# the predicted labels (0 = Poe, 1 = Frost).
cm = confusion_matrix(y_train, Ptrain)
cm

array([[ 539,    8],
       [   0, 1068]])

In [None]:
# Confusion matrix on the test set: rows are the true labels and columns the
# predicted labels (0 = Poe, 1 = Frost).
cm_test = confusion_matrix(y_test, Pest)
cm_test

array([[ 85,  86],
       [ 19, 349]])

In [None]:
# F1 on the training set. With these arguments the score is computed for the
# class labelled 1 (Frost); the recorded value matches that class's F1.
f1_score(y_train, Ptrain)


0.9962686567164178

In [None]:
# F1 on the test set. With these arguments the score is computed for the
# class labelled 1 (Frost); the recorded value matches that class's F1.
f1_score(y_test, Pest)

0.8692403486924035

In [None]:
# Poe's probability of moving from word index 252 to word index 5.
# 252 is the index shown for "tell" in the recorded vocabulary output; indices
# depend on the train/test split, so confirm them with word2idx before reading
# anything into this value.
A0[252,5]

0.0004038772213247173

In [None]:
def word_choice_poe(word):
  """Print `word`, then the word most likely to follow it under Poe's model.

  Reads the row of the transition matrix `A0` that belongs to `word` and
  prints the vocabulary entry whose column holds the largest value in it.

  Args:
    word: a token that is a key of `word2idx`.

  Raises:
    KeyError: if `word` is not in `word2idx`.
  """
  row = word2idx[word]
  best_col = int(np.argmax(A0[row]))

  # Invert the vocabulary once rather than scanning key/value lists with
  # .index(); this also avoids shadowing the builtin `next`.
  idx2word = {index: token for token, index in word2idx.items()}

  print(word)
  print(idx2word[best_col])

# Most likely successor of "the" according to Poe's transition matrix.
word_choice_poe("the")

the
night


In [None]:
def word_choice_frost(word):
  """Print `word`, then the word most likely to follow it under Frost's model.

  Reads the row of the transition matrix `A1` that belongs to `word` and
  prints the vocabulary entry whose column holds the largest value in it.

  Args:
    word: a token that is a key of `word2idx`.

  Raises:
    KeyError: if `word` is not in `word2idx`.
  """
  row = word2idx[word]
  best_col = int(np.argmax(A1[row]))

  # Invert the vocabulary once rather than scanning key/value lists with
  # .index(); this also avoids shadowing the builtin `next`.
  idx2word = {index: token for token, index in word2idx.items()}

  print(word)
  print(idx2word[best_col])

In [None]:
# Most likely successor of "the" according to Frost's transition matrix.
word_choice_frost("the")

the
cellar


In [None]:
import math

def word_choice_poe_random(word):
  """Print `word`, then a follower sampled from Poe's transition probabilities.

  The row of `A0` for `word` is renormalised to sum to 1 and used as the
  sampling distribution over vocabulary indices.

  Args:
    word: a token that is a key of `word2idx`.

  Raises:
    KeyError: if `word` is not in `word2idx`.
  """
  row = word2idx[word]
  weights = A0[row, :]
  prob = weights / weights.sum(keepdims=True)

  # This index is sampled, not an argmax, hence the name.
  sampled_col = int(np.random.choice(range(len(A0)), p=prob))

  # Invert the vocabulary once rather than scanning key/value lists with
  # .index(); this also avoids shadowing the builtin `next`.
  idx2word = {index: token for token, index in word2idx.items()}

  print(word)
  print(idx2word[sampled_col])

# Sample a follower for "yaanek" from Poe's model. word2idx is built from the
# training lines only, so this raises KeyError if "yaanek" is not among them.
word_choice_poe_random("yaanek")

yaanek
best


In [None]:
def word_choice_frost_random(word):
  """Return a follower for `word` sampled from Frost's transition probabilities.

  The row of `A1` for `word` is renormalised by its own sum, so it always
  forms a valid distribution, and is then used to sample a vocabulary index.

  Args:
    word: a token that is a key of `word2idx`.

  Returns:
    The sampled next word as a string.

  Raises:
    KeyError: if `word` is not in `word2idx`.
  """
  row = word2idx[word]
  weights = A1[row, :]
  # Normalise with Frost's own row sum. Dividing by the Poe row sum only works
  # while both matrices happen to be row-normalised already.
  prob = weights / weights.sum(keepdims=True)

  sampled_col = int(np.random.choice(range(len(A1)), p=prob))

  # Invert the vocabulary once rather than scanning key/value lists with
  # .index(); this also avoids shadowing the builtin `next`.
  idx2word = {index: token for token, index in word2idx.items()}

  return idx2word[sampled_col]

# Sample a follower for "grain" from Frost's model. word2idx is built from the
# training lines only, so this raises KeyError if "grain" is not among them.
word_choice_frost_random("grain")

'esquimaux'