In [20]:
import numpy as np
from collections import defaultdict
import random
import math


random.seed(0)
np.random.seed()
# Corpus
sentences = [
    "the cat sat on the mat",
    "a dog barked loudly",
    "many birds fly in the sky",
    "children play in the garden",
    "some dogs run fast",
    "the girl jumped high",
    "birds sing in trees",
    "the children run fast",
    "many children also play",
    "the dog play with the children",
    "the birds are on the trees"
]

# Tokenize corpus
corpus = [w.lower() for s in sentences for w in s.split()]
vocab = list(set(corpus))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}
V = len(vocab)

# Initialize nouniness score per word
noun_score = np.random.randn(V) * 0.1

# Transition weight for predicting previous word given nouniness of current
transition = np.random.randn(V) * 0.1

# Sigmoid and softmax
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

# Training loop
lr = 0.01
epochs = 1000

best_loss=None
for epoch in range(epochs):
    total_loss = 0
    for t in range(1, len(corpus)):
        curr_word = corpus[t]
        prev_word = corpus[t - 1]
        i = word2idx[curr_word]
        j = word2idx[prev_word]

        # Predict previous word distribution using current word's nouniness
        pred_scores = noun_score[i] * transition
        pred_probs = softmax(pred_scores)

        # Cross-entropy loss
        loss = -np.log(pred_probs[j] + 1e-9)
        total_loss += loss

        # Gradients
        grad = pred_probs
        grad[j] -= 1

        # Update transition weights
        transition -= lr * noun_score[i] * grad

        # Update noun_score of current word
        noun_score[i] -= lr * np.dot(grad, transition)

    if epoch % 5 == 0 or epoch == epochs - 1:
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
    if best_loss==None: best_loss=total_loss
    else:
        if total_loss>best_loss: break

# Output noun scores
print("\n🔍 Inferred 'nouniness' of words (based on preceding word prediction):")
for i in np.argsort(-sigmoid(noun_score)):
    print(f"{idx2word[i]:10s} → noun-score: {sigmoid(noun_score[i]):.3f}")


Epoch 1, Loss: 175.0394
Epoch 6, Loss: 174.9886
Epoch 11, Loss: 174.9336
Epoch 16, Loss: 174.8722
Epoch 21, Loss: 174.8014
Epoch 26, Loss: 174.7178
Epoch 31, Loss: 174.6172
Epoch 36, Loss: 174.4941
Epoch 41, Loss: 174.3413
Epoch 46, Loss: 174.1500
Epoch 51, Loss: 173.9086
Epoch 56, Loss: 173.6025
Epoch 61, Loss: 173.2136
Epoch 66, Loss: 172.7202
Epoch 71, Loss: 172.0974
Epoch 76, Loss: 171.3189
Epoch 81, Loss: 170.3613
Epoch 86, Loss: 169.2105
Epoch 91, Loss: 167.8715
Epoch 96, Loss: 166.3770
Epoch 101, Loss: 164.7874
Epoch 106, Loss: 163.1752
Epoch 111, Loss: 161.5965
Epoch 116, Loss: 160.0702
Epoch 121, Loss: 158.5782
Epoch 126, Loss: 157.0809
Epoch 131, Loss: 155.5332
Epoch 136, Loss: 153.8939
Epoch 141, Loss: 152.1298
Epoch 146, Loss: 150.2192
Epoch 151, Loss: 148.1556
Epoch 156, Loss: 145.9547
Epoch 161, Loss: 143.6611
Epoch 166, Loss: 141.3506
Epoch 171, Loss: 139.1180
Epoch 176, Loss: 137.0506
Epoch 181, Loss: 135.2026
Epoch 186, Loss: 133.5890
Epoch 191, Loss: 132.1981
Epoch 19

In [18]:
import numpy as np
from collections import defaultdict
import random

# Example corpus
sentences = [
    "the cat sat on the mat",
    "a dog barked loudly",
    "the man saw a woman",
    "birds fly over trees",
    "she eats quickly"
]

sentences = [
    "the cat sat on the mat",
    "a dog barked loudly",
    "many birds fly in the sky",
    "children play in the garden",
    "some dogs run fast",
    "the girl jumped high",
    "birds sing in trees",
    "the children run fast",
    "many children also play",
    "the dog play with the children",
    "the birds are on the trees"
]


# Tokenize
tokenized = [s.lower().split() for s in sentences]
vocab = list(set(word for sent in tokenized for word in sent))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}
vocab_size = len(vocab)

# Initialize noun scores randomly [0,1]
np.random.seed(42)
noun_scores = np.random.rand(vocab_size)

# Hyperparameters
epochs = 200
lr = 0.01

# Helper to get context counts
def get_contexts(tokenized):
    contexts = []
    for sent in tokenized:
        for i in range(1, len(sent)-1):
            center = sent[i]
            prev_w = sent[i-1]
            next_w = sent[i+1]
            contexts.append((center, prev_w, next_w))
    return contexts

contexts = get_contexts(tokenized)

for epoch in range(epochs):
    # Collect context predictions
    pred_prev = defaultdict(float)
    pred_next = defaultdict(float)
    count_prev = defaultdict(float)
    count_next = defaultdict(float)

    # E-step: build context prediction tables
    for center, prev, next_ in contexts:
        idx = word2idx[center]
        weight = noun_scores[idx]
        pred_prev[prev] += weight
        pred_next[next_] += weight
        count_prev[prev] += 1
        count_next[next_] += 1

    # Normalize
    for word in pred_prev:
        pred_prev[word] /= (count_prev[word] + 1e-5)
    for word in pred_next:
        pred_next[word] /= (count_next[word] + 1e-5)

    # M-step: update noun scores
    for center, prev, next_ in contexts:
        idx = word2idx[center]
        score = noun_scores[idx]

        # How well does this noun score predict actual context?
        pred_quality = pred_prev[prev] + pred_next[next_]

        # Gradient-like update
        noun_scores[idx] += lr * (pred_quality - score)
        noun_scores[idx] = np.clip(noun_scores[idx], 0, 1)

# Show learned noun scores
for word, idx in word2idx.items():
    print(f"{word:10} -> noun-likelihood: {noun_scores[idx]:.2f}")


fast       -> noun-likelihood: 0.37
children   -> noun-likelihood: 1.00
a          -> noun-likelihood: 0.73
sing       -> noun-likelihood: 1.00
jumped     -> noun-likelihood: 1.00
on         -> noun-likelihood: 1.00
are        -> noun-likelihood: 1.00
also       -> noun-likelihood: 1.00
some       -> noun-likelihood: 0.60
barked     -> noun-likelihood: 1.00
garden     -> noun-likelihood: 0.02
with       -> noun-likelihood: 1.00
sat        -> noun-likelihood: 1.00
many       -> noun-likelihood: 0.21
girl       -> noun-likelihood: 1.00
play       -> noun-likelihood: 1.00
high       -> noun-likelihood: 0.30
the        -> noun-likelihood: 1.00
in         -> noun-likelihood: 1.00
cat        -> noun-likelihood: 1.00
mat        -> noun-likelihood: 0.61
birds      -> noun-likelihood: 1.00
dogs       -> noun-likelihood: 1.00
run        -> noun-likelihood: 1.00
dog        -> noun-likelihood: 1.00
loudly     -> noun-likelihood: 0.79
sky        -> noun-likelihood: 0.20
trees      -> noun-likelihoo

In [33]:
#infer verb weights of words

import numpy as np
from collections import defaultdict
import random

random.seed(0)
np.random.seed()


# Corpus: tokenized and lowercased
sentences = [
    "the cat sat on the mat",
    "a dog barked loudly",
    "many birds fly in the sky",
    "children play in the garden",
    "some dogs run fast",
    "the girl jumped high",
    "birds sing in trees",
    "dogs play in the house"
]

# Flattened corpus
corpus = [w.lower() for s in sentences for w in s.split()]
vocab = list(set(corpus))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}
V = len(vocab)

# Initialize verbiness score per word
verb_score = np.random.randn(V) * 0.1

# Transition vector: how verb_score helps predict previous word
transition = np.random.randn(V) * 0.1

# Sigmoid and softmax
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

# Training loop
lr = 0.01
epochs = 2000

for epoch in range(epochs):
    total_loss = 0
    for t in range(1, len(corpus)):
        curr_word = corpus[t]
        prev_word = corpus[t - 1]
        i = word2idx[curr_word]
        j = word2idx[prev_word]

        # Predict previous word using verb_score
        pred_scores = verb_score[i] * transition
        pred_probs = softmax(pred_scores)

        # Cross-entropy loss
        loss = -np.log(pred_probs[j] + 1e-9)
        total_loss += loss

        # Gradients
        grad = pred_probs
        grad[j] -= 1

        # Update transition vector
        transition -= lr * verb_score[i] * grad

        # Update verb_score
        verb_score[i] -= lr * np.dot(grad, transition)

    if epoch % 50 == 0 or epoch == epochs - 1:
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Output verb scores
print("\n🔍 Inferred 'verbiness' of words (based on preceding word prediction):")
for i in np.argsort(-sigmoid(verb_score)):
    print(f"{idx2word[i]:10s} → verb-score: {sigmoid(verb_score[i]):.3f}")


Epoch 1, Loss: 121.9687
Epoch 51, Loss: 121.5021
Epoch 101, Loss: 119.1262
Epoch 151, Loss: 107.3537
Epoch 201, Loss: 97.3362
Epoch 251, Loss: 89.7385
Epoch 301, Loss: 86.2087
Epoch 351, Loss: 84.2420
Epoch 401, Loss: 82.8808
Epoch 451, Loss: 81.9030
Epoch 501, Loss: 81.1513
Epoch 551, Loss: 80.5200
Epoch 601, Loss: 79.9669
Epoch 651, Loss: 79.4756
Epoch 701, Loss: 79.0371
Epoch 751, Loss: 78.6447
Epoch 801, Loss: 78.2922
Epoch 851, Loss: 77.9741
Epoch 901, Loss: 77.6859
Epoch 951, Loss: 77.4236
Epoch 1001, Loss: 77.1838
Epoch 1051, Loss: 76.9636
Epoch 1101, Loss: 76.7601
Epoch 1151, Loss: 76.5714
Epoch 1201, Loss: 76.3954
Epoch 1251, Loss: 76.2306
Epoch 1301, Loss: 76.0758
Epoch 1351, Loss: 75.9299
Epoch 1401, Loss: 75.7920
Epoch 1451, Loss: 75.6613
Epoch 1501, Loss: 75.5373
Epoch 1551, Loss: 75.4195
Epoch 1601, Loss: 75.3074
Epoch 1651, Loss: 75.2006
Epoch 1701, Loss: 75.0987
Epoch 1751, Loss: 75.0014
Epoch 1801, Loss: 74.9085
Epoch 1851, Loss: 74.8196
Epoch 1901, Loss: 74.7345
Epoch

In [6]:
import numpy as np
import random
from collections import defaultdict

class BayesianHMMUnsupervisedPOS:
    def __init__(self, possible_tags, vocab, alpha=1.0, beta=1.0, smoothing_factor=0.1):
        self.possible_tags = possible_tags  # List of possible POS tags
        self.vocab = vocab  # List of all unique words in the corpus
        self.alpha = alpha  # Dirichlet prior for transition probabilities
        self.beta = beta  # Dirichlet prior for emission probabilities
        self.smoothing_factor = smoothing_factor  # For unseen words

        # Initialize counts
        self.transition_counts = defaultdict(lambda: defaultdict(float)) # P(tag_j | tag_i)
        self.emission_counts = defaultdict(lambda: defaultdict(float)) # P(word | tag)
        self.tag_counts = defaultdict(float) # P(tag)

    def _initialize_random_tags(self, sentences):
        # Assign a random tag to each word initially
        initial_tags = []
        for sentence in sentences:
            sentence_tags = []
            for word in sentence:
                sentence_tags.append(random.choice(self.possible_tags))
            initial_tags.append(sentence_tags)
        return initial_tags

    def _update_counts(self, sentences, tags):
        # Reset counts for each iteration
        self.transition_counts = defaultdict(lambda: defaultdict(float))
        self.emission_counts = defaultdict(lambda: defaultdict(float))
        self.tag_counts = defaultdict(float)

        for i, sentence in enumerate(sentences):
            current_tags = tags[i]
            # Handle initial tag (transition from a 'start' state)
            if current_tags:
                self.tag_counts[current_tags[0]] += 1

            for j in range(len(sentence)):
                word = sentence[j]
                tag = current_tags[j]

                self.emission_counts[tag][word] += 1
                self.tag_counts[tag] += 1

                if j > 0:
                    prev_tag = current_tags[j-1]
                    self.transition_counts[prev_tag][tag] += 1

    def _calculate_probabilities(self):
        # Calculate smoothed probabilities (using Dirichlet priors)

        # Emission probabilities: P(word | tag)
        emission_probs = defaultdict(lambda: defaultdict(float))
        for tag in self.possible_tags:
            for word in self.vocab:
                # Add beta prior (smoothing)
                numerator = self.emission_counts[tag][word] + self.beta
                denominator = self.tag_counts[tag] + self.beta * len(self.vocab)
                emission_probs[tag][word] = numerator / denominator

        # Transition probabilities: P(tag_j | tag_i)
        transition_probs = defaultdict(lambda: defaultdict(float))
        for prev_tag in self.possible_tags:
            for current_tag in self.possible_tags:
                # Add alpha prior (smoothing)
                numerator = self.transition_counts[prev_tag][current_tag] + self.alpha
                denominator = sum(self.transition_counts[prev_tag].values()) + self.alpha * len(self.possible_tags)
                transition_probs[prev_tag][current_tag] = numerator / denominator

        return emission_probs, transition_probs

    def _gibbs_sample_tag(self, sentence, current_tags, word_index, emission_probs, transition_probs):
        word = sentence[word_index]
        probabilities = []

        for candidate_tag in self.possible_tags:
            # Emission probability
            emission_p = emission_probs[candidate_tag][word]

            # Transition probability from previous tag
            prev_tag = current_tags[word_index - 1] if word_index > 0 else None
            if prev_tag:
                transition_p_from_prev = transition_probs[prev_tag][candidate_tag]
            else: # If it's the first word, no previous tag
                transition_p_from_prev = 1.0 # Or some initial tag probability

            # Transition probability to next tag
            next_tag = current_tags[word_index + 1] if word_index < len(sentence) - 1 else None
            if next_tag:
                transition_p_to_next = transition_probs[candidate_tag][next_tag]
            else: # If it's the last word, no next tag
                transition_p_to_next = 1.0 # Or some final tag probability

            # Calculate joint probability (proportional to posterior)
            prob = emission_p * transition_p_from_prev * transition_p_to_next
            probabilities.append(prob)

        # Normalize and sample a new tag
        total_prob = sum(probabilities)
        if total_prob == 0:  # Handle cases where all probabilities are zero
            # Fallback: assign a random tag or use a default
            return random.choice(self.possible_tags)

        normalized_probs = [p / total_prob for p in probabilities]
        new_tag = random.choices(self.possible_tags, weights=normalized_probs, k=1)[0]
        return new_tag

    def train(self, sentences, iterations=50):
        # 1. Initialize tags randomly
        current_tags = self._initialize_random_tags(sentences)

        for iteration in range(iterations):
            # 2. Update counts based on current tags
            self._update_counts(sentences, current_tags)

            # 3. Calculate (smoothed) probabilities
            emission_probs, transition_probs = self._calculate_probabilities()

            # 4. Gibbs sample new tags for each word
            for s_idx, sentence in enumerate(sentences):
                for w_idx in range(len(sentence)):
                    new_tag = self._gibbs_sample_tag(
                        sentence,
                        current_tags[s_idx],
                        w_idx,
                        emission_probs,
                        transition_probs
                    )
                    current_tags[s_idx][w_idx] = new_tag

            print(f"Iteration {iteration + 1}/{iterations} complete.")

        return current_tags

# --- Example Usage ---
# Dummy data - in a real scenario, you'd have a large corpus
corpus = [
    ["the", "fat", "cat", "sat", "on", "the", "mat"],
    ["a", "dog", "ran", "fast"],
    ["the", "birds", "sing"],
    ["the", "big","bird",  "sings"],
    ["the", "cat", "ran","in","the","house"]
]

# Define possible tags (simplified for illustration)
possible_tags = ["DET", "NOUN", "VERB", "PREP", "ADJ"]

# Build vocabulary from the corpus
all_words = set(word for sentence in corpus for word in sentence)
vocab = list(all_words)

# Initialize and train the model
b_hmm = BayesianHMMUnsupervisedPOS(possible_tags, vocab, alpha=0.1, beta=0.1) # Small Dirichlet priors
learned_tags = b_hmm.train(corpus, iterations=200)

# Print the tagged sentences
print("\n--- Learned Tags ---")
for i, sentence in enumerate(corpus):
    tagged_sentence = list(zip(sentence, learned_tags[i]))
    print(tagged_sentence)


Iteration 1/200 complete.
Iteration 2/200 complete.
Iteration 3/200 complete.
Iteration 4/200 complete.
Iteration 5/200 complete.
Iteration 6/200 complete.
Iteration 7/200 complete.
Iteration 8/200 complete.
Iteration 9/200 complete.
Iteration 10/200 complete.
Iteration 11/200 complete.
Iteration 12/200 complete.
Iteration 13/200 complete.
Iteration 14/200 complete.
Iteration 15/200 complete.
Iteration 16/200 complete.
Iteration 17/200 complete.
Iteration 18/200 complete.
Iteration 19/200 complete.
Iteration 20/200 complete.
Iteration 21/200 complete.
Iteration 22/200 complete.
Iteration 23/200 complete.
Iteration 24/200 complete.
Iteration 25/200 complete.
Iteration 26/200 complete.
Iteration 27/200 complete.
Iteration 28/200 complete.
Iteration 29/200 complete.
Iteration 30/200 complete.
Iteration 31/200 complete.
Iteration 32/200 complete.
Iteration 33/200 complete.
Iteration 34/200 complete.
Iteration 35/200 complete.
Iteration 36/200 complete.
Iteration 37/200 complete.
Iteration 

In [12]:
import numpy as np
from collections import defaultdict
import random

# Example corpus
sentences = [
    "the cat sat on the mat",
    "a dog barked loudly",
    "the man saw a woman",
    "birds fly over trees",
    "she eats quickly"
]

# Tokenize and build vocabulary
tokenized = [s.lower().split() for s in sentences]
vocab = list(set(word for sent in tokenized for word in sent))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}

# Parameters
num_tags = 5  # number of hidden states (e.g., POS tags)
vocab_size = len(vocab)
num_sentences = len(tokenized)

# Initialize HMM parameters randomly
np.random.seed(42)
trans_probs = np.random.dirichlet(np.ones(num_tags), size=num_tags)        # P(tag_t | tag_{t-1})
emit_probs = np.random.dirichlet(np.ones(vocab_size), size=num_tags)       # P(word | tag)
start_probs = np.random.dirichlet(np.ones(num_tags))                       # P(tag_0)

# Forward-Backward (E-step) and parameter update (M-step)
def forward(sent):
    T = len(sent)
    alpha = np.zeros((T, num_tags))
    alpha[0] = start_probs * emit_probs[:, sent[0]]
    for t in range(1, T):
        for j in range(num_tags):
            alpha[t, j] = np.sum(alpha[t-1] * trans_probs[:, j]) * emit_probs[j, sent[t]]
    return alpha

def backward(sent):
    T = len(sent)
    beta = np.zeros((T, num_tags))
    beta[T-1] = 1
    for t in reversed(range(T-1)):
        for i in range(num_tags):
            beta[t, i] = np.sum(beta[t+1] * trans_probs[i] * emit_probs[:, sent[t+1]])
    return beta

def baum_welch(tokenized_idx, epochs=10):
    global trans_probs, emit_probs, start_probs
    for epoch in range(epochs):
        A = np.zeros_like(trans_probs)
        B = np.zeros_like(emit_probs)
        pi = np.zeros(num_tags)
        
        for sent in tokenized_idx:
            T = len(sent)
            alpha = forward(sent)
            beta = backward(sent)
            prob = np.sum(alpha[-1])

            gamma = (alpha * beta) / prob
            xi = np.zeros((T-1, num_tags, num_tags))

            for t in range(T-1):
                denom = np.sum(alpha[t][:, None] * trans_probs * emit_probs[:, sent[t+1]] * beta[t+1])
                for i in range(num_tags):
                    for j in range(num_tags):
                        xi[t, i, j] = alpha[t, i] * trans_probs[i, j] * emit_probs[j, sent[t+1]] * beta[t+1, j]
                xi[t] /= denom

            pi += gamma[0]
            for t in range(T):
                B[:, sent[t]] += gamma[t]
            for t in range(T-1):
                A += xi[t]
        
        # Normalize
        trans_probs = A / A.sum(axis=1, keepdims=True)
        emit_probs = B / B.sum(axis=1, keepdims=True)
        start_probs = pi / pi.sum()

# Convert tokenized to indices
tokenized_idx = [[word2idx[w] for w in sent] for sent in tokenized]

# Run training
baum_welch(tokenized_idx, epochs=10)

# Viterbi decoding
def viterbi(sent):
    T = len(sent)
    delta = np.zeros((T, num_tags))
    psi = np.zeros((T, num_tags), dtype=int)
    
    delta[0] = start_probs * emit_probs[:, sent[0]]
    
    for t in range(1, T):
        for j in range(num_tags):
            scores = delta[t-1] * trans_probs[:, j]
            psi[t, j] = np.argmax(scores)
            delta[t, j] = np.max(scores) * emit_probs[j, sent[t]]
    
    states = np.zeros(T, dtype=int)
    states[-1] = np.argmax(delta[-1])
    for t in reversed(range(1, T)):
        states[t-1] = psi[t, states[t]]
    
    return states

# Predict tags
tagged_sentences = []
for sent in tokenized_idx:
    tags = viterbi(sent)
    tags=[int(v) for v in tags]
    tagged_sentences.append(list(zip([idx2word[i]  for i in sent], tags)))

tagged_sentences


[[('the', 0), ('cat', 2), ('sat', 1), ('on', 4), ('the', 0), ('mat', 1)],
 [('a', 1), ('dog', 3), ('barked', 2), ('loudly', 1)],
 [('the', 1), ('man', 4), ('saw', 0), ('a', 1), ('woman', 4)],
 [('birds', 0), ('fly', 1), ('over', 3), ('trees', 2)],
 [('she', 1), ('eats', 4), ('quickly', 0)]]

In [21]:
# Let's expand the corpus with more diverse and structured sentences
corpus = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "barked"],
    ["she", "eats", "an", "apple"],
    ["he", "runs", "fast"],
    ["a", "boy", "plays", "with", "a", "ball"],
    ["birds", "fly", "in", "the", "sky"],
    ["the", "sun", "shines"],
    ["they", "read", "books"],
    ["children", "like", "to", "play", "outside"],
    ["the", "man", "opened", "the", "door"],
]

# Reuse the same training function from earlier
def train_unsupervised_pos_tagger(corpus, tags, num_iterations=10):
    from collections import defaultdict
    import numpy as np

    vocab = set(word for sentence in corpus for word in sentence)
    word_tag_probs = {word: np.random.dirichlet(np.ones(len(tags))).tolist() for word in vocab}
    tag_transition_probs = {t1: {t2: 1.0 / len(tags) for t2 in tags} for t1 in tags}
    word_given_tag_probs = {t: defaultdict(lambda: 1e-6) for t in tags}

    for _ in range(num_iterations):
        tag_counts = defaultdict(float)
        tag_pair_counts = {t1: defaultdict(float) for t1 in tags}
        word_tag_counts = {t: defaultdict(float) for t in tags}

        for sentence in corpus:
            for i, word in enumerate(sentence):
                # Get current tag distribution
                current_probs = np.array(word_tag_probs[word])
                for j, tag in enumerate(tags):
                    # Use transition probability from previous tag
                    if i > 0:
                        prev_word = sentence[i - 1]
                        prev_probs = word_tag_probs[prev_word]
                        transition_score = sum(prev_probs[k] * tag_transition_probs[tags[k]][tag] for k in range(len(tags)))
                    else:
                        transition_score = 1.0 / len(tags)
                    # Word likelihood
                    emission = word_given_tag_probs[tag][word]
                    current_probs[j] = transition_score * emission
                # Normalize
                current_probs /= current_probs.sum() if current_probs.sum() > 0 else 1.0
                word_tag_probs[word] = current_probs.tolist()

                for j, tag in enumerate(tags):
                    tag_counts[tag] += current_probs[j]
                    word_tag_counts[tag][word] += current_probs[j]
                    if i > 0:
                        for k, prev_tag in enumerate(tags):
                            tag_pair_counts[prev_tag][tag] += word_tag_probs[sentence[i - 1]][k] * current_probs[j]

        # Update transition probabilities
        for t1 in tags:
            total = sum(tag_pair_counts[t1].values())
            for t2 in tags:
                tag_transition_probs[t1][t2] = tag_pair_counts[t1][t2] / total if total > 0 else 1.0 / len(tags)

        # Update emission probabilities
        for tag in tags:
            total = sum(word_tag_counts[tag].values())
            for word in vocab:
                word_given_tag_probs[tag][word] = word_tag_counts[tag][word] / total if total > 0 else 1e-6

    # Final tag distributions
    return {
        word: {tag: round(prob, 3) for tag, prob in zip(tags, probs)}
        for word, probs in word_tag_probs.items()
    }

# Define tags
tags = ["NOUN", "VERB", "DET", "OTHER"]

# Train the model
tag_distributions = train_unsupervised_pos_tagger(corpus, tags, num_iterations=10)
tag_distributions_sorted = dict(sorted(tag_distributions.items()))

tag_distributions_sorted



{'a': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'an': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'apple': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'ball': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'barked': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'birds': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'books': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'boy': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'cat': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'children': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'dog': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'door': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'eats': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'fast': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.25},
 'fly': {'NOUN': 0.25, 'VERB': 0.25, 'DET': 0.25, 'OTHER': 0.

In [68]:
#28 July 2025 - step by step initialize & update language model with arbitrary tags, constrained by word tag weights and tag transitions
import numpy as np
from collections import defaultdict, Counter
from itertools import groupby
import random
import math, re

def tok_basic(txt,add_sent_tags=False): 
  txt=re.sub("(?u)(\W)",r" \1 ", txt)
  out=re.split("\s+",txt)
  tokens=[v for v in out if v]
  if add_sent_tags: tokens=["<s>"]+tokens+["</s>"]
  return tokens

def create_one_hot_vec(hot_i,vec_size):
  zeros=[0.]*vec_size
  zeros[hot_i]=1.
  return np.array(zeros)



random.seed(0)
np.random.seed()
# Corpus
sentences = [
    "the cat sat on the mat",
    "a dog barked loudly",
    "many birds fly in the sky",
    "children play in the garden",
    "some dogs run fast",
    "the girl jumped high",
    "birds sing in trees",
    "the children run fast",
    "many children also play",
    "the dog play with the children",
    "the birds are on the trees",
    "she jumped high",
    "they play in the house",
    "it sat on the table"
]

#tags=["N","V","PREP","DET","ADJ","ADV", "<s>","</s>"]
#available_tags=["N","V","PREP","DET","ADJ","ADV"]
available_tags=["N","V","PREP","DET","OTHER"]
all_tags=available_tags+["<s>","</s>"]


all_tokenized=[]
all_token_pairs=[]
word_counter={}
for sent_i,sent0 in enumerate(sentences):
    tokens=tok_basic(sent0,add_sent_tags=True)
    all_tokenized.append(tokens)
    for tk0 in tokens: word_counter[tk0]=word_counter.get(tk0,0)+1
    for tk_i0,cur_tk0 in enumerate(tokens[:-1]):
        next_tk0=tokens[tk_i0+1]
        all_token_pairs.append((cur_tk0,next_tk0))

vocab=sorted(list(word_counter.keys())) 

print(vocab)
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}


#for a in all_token_pairs: print(a)

word_tag_list=[]
#tag_counter={}
# Tokenize corpus
for sent_i,sent_tokens in enumerate(all_tokenized):
    #print(sent_tokens)
    for tk0 in sent_tokens:
        if tk0 in ["<s>","</s>"]: cur_tag=tk0 
        else: cur_tag = random.choice(available_tags)
        word_tag_list.append((sent_i,tk0,cur_tag))
        #tag_counter[cur_tag]=tag_counter.get(cur_tag,0)+1

sorted_by_word=sorted(word_tag_list,key=lambda x:x[1])
sorted_by_tag=sorted(word_tag_list,key=lambda x:x[-1])
#word_tag_list.sort(key=lambda x:x[1])
tag_pairs_list=[]
for i0,cur_item0 in enumerate(word_tag_list[:-1]):
    next_item0=word_tag_list[i0+1]
    if next_item0[0]!=cur_item0[0]: continue
    cur_tag_pair=cur_item0[-1],next_item0[-1]
    tag_pairs_list.append(cur_tag_pair)
    #print(i0,cur_item0,next_item0,cur_tag_pair)

# grouped_by_word=[(key,[v[-1] for v in list(group)]) for key,group in groupby(sorted_by_word,lambda x:x[1]) ]
# word_tag_wt_dict={}
# for w0,w0_tags0 in grouped_by_word:
#     cur_count0=len(w0_tags0)
#     counted_tags=dict(Counter(w0_tags0))
#     for tag0,tag_count0 in counted_tags.items():
#         tag_wt0=tag_count0/cur_count0
#         word_tag_wt_dict[(w0,tag0)]=tag_wt0

#populating P(t|w) - probability of tag given word
P_t_w_list=[]

grouped_by_word_dict=dict(iter([(key,dict(Counter([v[-1] for v in list(group)]))) for key,group in groupby(sorted_by_word,lambda x:x[1]) ])) 
for word0 in vocab:
    corr_dict0=grouped_by_word_dict.get(word0,{})
    cur_count0=sum(corr_dict0.values())
    cur_row0=[]
    for tag0 in all_tags:
        if cur_count0==0: word_tag_wt0=0
        else: word_tag_wt0=corr_dict0.get(tag0,0)/cur_count0
        cur_row0.append(word_tag_wt0)
    #print(word0,cur_count0,corr_dict0,cur_row0)
    P_t_w_list.append(cur_row0)

P_t_w_array=np.array(P_t_w_list)
print("P_t_w_array",P_t_w_array.shape)


grouped_by_tag=[(key,[v[1] for v in list(group)]) for key,group in groupby(sorted_by_tag,lambda x:x[-1]) ]
tag_word_wt_dict={}
for tag0,tag_words0 in grouped_by_tag:
    #print(tag0,tag_words0)
    cur_count0=len(tag_words0)
    counted_words=dict(Counter(tag_words0))
    for word0,word_count0 in counted_words.items():
        word_wt0=word_count0/cur_count0
        tag_word_wt_dict[(tag0,word0)]=word_wt0

#populating P(w|t) - probability of word given tag
P_w_t_list=[]
for tag0 in all_tags:
    #print(tag0)
    all_word_weights=[]
    for word0 in vocab:
        cur_pair=(tag0,word0)
        cur_wt=tag_word_wt_dict.get(cur_pair,0)
        all_word_weights.append(cur_wt)
    #print(all_word_weights)
    P_w_t_list.append(all_word_weights)
        
P_w_t_array=np.array(P_w_t_list)        
#print(P_w_t_array)
print("P_w_t_array", P_w_t_array.shape)

#calculate tag transitions
tag_transition_wt_dict={}
tag_pairs_list_sorted_w1=sorted(tag_pairs_list,key=lambda x:x[0])
#print(tag_pairs_list_sorted_w1)
tag_pairs_list_grouped_w1=[(key,[v[1] for v in list(group)]) for key,group in groupby(tag_pairs_list_sorted_w1,lambda x:x[0])]
for tag0,grp0 in tag_pairs_list_grouped_w1:
    cur_dict=dict(Counter(grp0))
    cur_count0=len(grp0)
    #print(key0,cur_dict)
    for tag1,tag_count1 in cur_dict.items():
        tag_wt1=tag_count1/cur_count0
        tag_transition_wt_dict[(tag0,tag1)]=tag_wt1

#populating P(t2|t1) - probability of tag transition
P_t2_t1_list=[]
for tag0 in all_tags:
    #print(tag0)
    all_next_tag_weights=[]
    for tag1 in all_tags:
        cur_pair=(tag0,tag1)
        cur_wt=tag_transition_wt_dict.get(cur_pair,0)
        all_next_tag_weights.append(cur_wt)
    #print(all_word_weights)
    P_t2_t1_list.append(all_next_tag_weights)
        
P_t2_t1_array=np.array(P_t2_t1_list)        
#print(P_w_t_array)
print("P_t2_t1_array", P_t2_t1_array.shape)

cur_pair_i=5
cur_pair=all_token_pairs[cur_pair_i]
word1,word2=cur_pair
print(cur_pair)
P_w1=create_one_hot_vec(word2idx[word1] ,len(vocab) )

actual_w2=create_one_hot_vec(word2idx[word2] ,len(vocab) )

print("P_w1",P_w1.shape)
print(P_w1)


prediction0=P_w1 @ P_t_w_array @ P_t2_t1_array @ P_w_t_array

print("prediction",prediction0.shape)
print(prediction0)

print("actual",actual_w2.shape)
print(actual_w2)

# P_w1 (36,)
# P_t_w_array (36, 7)
# P_w_t_array (7, 36)
# P_t2_t1_array (7, 7)


# for a,b in tag_transition_wt_dict.items():
#     print(a,b)
# for a,b in tag_word_wt_dict.items():
#     print(a,b)
#print(a,len(b), )
#corpus = [["<s>"]+ w.lower() for s in sentences for w in s.split()]
# vocab = list(set(corpus))
# word2idx = {w: i for i, w in enumerate(vocab)}
# idx2word = {i: w for w, i in word2idx.items()}
# V = len(vocab)

# # Initialize nouniness score per word
# noun_score = np.random.randn(V) * 0.1

# Transition weight for predicting previous word given nouniness of current


['</s>', '<s>', 'a', 'also', 'are', 'barked', 'birds', 'cat', 'children', 'dog', 'dogs', 'fast', 'fly', 'garden', 'girl', 'high', 'house', 'in', 'it', 'jumped', 'loudly', 'many', 'mat', 'on', 'play', 'run', 'sat', 'she', 'sing', 'sky', 'some', 'table', 'the', 'they', 'trees', 'with']
P_t_w_array (36, 7)
P_w_t_array (7, 36)
P_t2_t1_array (7, 7)
('the', 'mat')
P_w1 (36,)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
prediction (36,)
[0.20394918 0.         0.01347651 0.0116274  0.00992445 0.01347651
 0.03492903 0.01347651 0.04788678 0.02155185 0.00992445 0.0251039
 0.01285843 0.01285843 0.01347651 0.02412968 0.00992445 0.04397859
 0.01285843 0.02536072 0.00992445 0.02571686 0.01347651 0.03317924
 0.05266988 0.0251039  0.0251039  0.01250229 0.01347651 0.01250229
 0.01250229 0.01285843 0.14000897 0.01285843 0.02571686 0.0116274 ]
actual (36,)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0.

In [57]:
all_token_pairs

[('<s>', 'the'),
 ('the', 'cat'),
 ('cat', 'sat'),
 ('sat', 'on'),
 ('on', 'the'),
 ('the', 'mat'),
 ('mat', '</s>'),
 ('<s>', 'a'),
 ('a', 'dog'),
 ('dog', 'barked'),
 ('barked', 'loudly'),
 ('loudly', '</s>'),
 ('<s>', 'many'),
 ('many', 'birds'),
 ('birds', 'fly'),
 ('fly', 'in'),
 ('in', 'the'),
 ('the', 'sky'),
 ('sky', '</s>'),
 ('<s>', 'children'),
 ('children', 'play'),
 ('play', 'in'),
 ('in', 'the'),
 ('the', 'garden'),
 ('garden', '</s>'),
 ('<s>', 'some'),
 ('some', 'dogs'),
 ('dogs', 'run'),
 ('run', 'fast'),
 ('fast', '</s>'),
 ('<s>', 'the'),
 ('the', 'girl'),
 ('girl', 'jumped'),
 ('jumped', 'high'),
 ('high', '</s>'),
 ('<s>', 'birds'),
 ('birds', 'sing'),
 ('sing', 'in'),
 ('in', 'trees'),
 ('trees', '</s>'),
 ('<s>', 'the'),
 ('the', 'children'),
 ('children', 'run'),
 ('run', 'fast'),
 ('fast', '</s>'),
 ('<s>', 'many'),
 ('many', 'children'),
 ('children', 'also'),
 ('also', 'play'),
 ('play', '</s>'),
 ('<s>', 'the'),
 ('the', 'dog'),
 ('dog', 'play'),
 ('play', '

In [85]:
#9 August 2025 - class for step by step initialize & update language model with arbitrary tags, constrained by word tag weights and tag transitions
import numpy as np
from collections import defaultdict, Counter
from itertools import groupby
import random
import math, re, string
# from torch import nn

# loss_fn = nn.CrossEntropyLoss()
# loss_fn = nn.BCELoss()
# loss_fn =nn.BCEWithLogitsLoss()
# loss_fn = nn.MSELoss()

def get_mse_loss(y_true, y_pred):
    # Calculate the difference between predictions and actual values
    differences = y_pred - y_true
    # Square the differences
    squared_differences = differences ** 2
    # Calculate the mean of the squared differences
    mse_loss_out = np.mean(squared_differences)
    return mse_loss_out


def tok_basic(txt,add_sent_tags=False): 
  txt=re.sub("(?u)(\W)",r" \1 ", txt)
  out=re.split("\s+",txt)
  tokens=[v for v in out if v]
  if add_sent_tags: tokens=["<s>"]+tokens+["</s>"]
  return tokens

def create_one_hot_vec(hot_i,vec_size):
  zeros=[0.]*vec_size
  zeros[hot_i]=1.
  return np.array(zeros)


random.seed(0)
np.random.seed()

class shadow:
    def __init__(self,sent_list,tag_list,params={}):
        self.sent_list=sent_list
        self.tag_list=tag_list
        self.params={}
        self.add_sent_tags=self.params.get("add_sent_tags",True)
        extended_tags=["<s>","</s>"] #maybe will also include numbers and punctuation - special characters, hashtags, whatever
        if self.add_sent_tags: 
            self.tag_list+=extended_tags
            self.tag_list=sorted(list(set(self.tag_list)))
        self.available_tags=[v for v in self.tag_list if not v in extended_tags]
        self.tokenized_sent_list=[]
        self.tagged_tokens_list=[]
        self.token_counter={}
        self.ngram_list=[]
        for sent0 in self.sent_list:
            cur_tokens=tok_basic(sent0,add_sent_tags=self.add_sent_tags)
            print(cur_tokens)
            cur_tokens_with_tags=[]
            for tk0 in cur_tokens: 
                self.token_counter[tk0]=self.token_counter.get(tk0,0)+1
                assigned_tag=self.assign_tag(tk0)
                cur_tokens_with_tags.append((tk0,assigned_tag))
                self.tagged_tokens_list.append((tk0,assigned_tag))
            for tk_i0,cur_tk0 in enumerate(cur_tokens[:-1]):
                next_tk0=cur_tokens[tk_i0+1]
                self.ngram_list.append((cur_tk0,next_tk0))

        self.vocab=sorted(list(self.token_counter.keys())) 
        
        self.token2idx = {w: i for i, w in enumerate(self.vocab)}
        self.idx2token = {i: w for w, i in self.token2idx.items()}

        self.tag2idx = {w: i for i, w in enumerate(self.tag_list)}
        self.idx2tag = {i: w for w, i in self.tag2idx.items()}

        self.n_vocab=len(self.vocab)
        self.n_tags=len(self.tag_list)

        self.calc_prob()
        #self.train()
            
    def assign_tag(self,token,params={}):
        if token in ["<s>","</s>"]: tag=token
        elif token.isdigit(): tag="NUM"
        elif token in string.punctuation: tag="PUNC" 
        else: tag=random.choice(self.available_tags)
        return tag
    def calc_prob(self): #calculate P(word|tag) , P(tag|word), P(tag2|tag1)
        tag_pairs=[(self.tagged_tokens_list[i0][1],self.tagged_tokens_list[i0+1][1]) for i0 in range(len(self.tagged_tokens_list)-1)] #get the tag pairs from the current assignment
        only_tags=[v[1] for v in self.tagged_tokens_list]
        tag_counter=dict(Counter(only_tags))
        sorted_by_word=sorted(self.tagged_tokens_list,key=lambda x:x[0])
        sorted_by_tag=sorted(self.tagged_tokens_list,key=lambda x:x[1])
        tag_pairs_sorted=sorted(tag_pairs,key=lambda x:x[0])
        
        grouped_by_word=dict(iter( [(key,dict( Counter([v[1] for v in list(group)]) )  ) for key,group in groupby(sorted_by_word,lambda x:x[0])] )) 
        grouped_by_tag=dict(iter( [(key,dict( Counter([v[0] for v in list(group)]) )  ) for key,group in groupby(sorted_by_tag,lambda x:x[1])] )) 
        tag_pairs_grouped_by_tag=dict(iter( [(key,dict( Counter([v[1] for v in list(group)]) )  ) for key,group in groupby(tag_pairs_sorted,lambda x:x[0])] )) 

        self.grouped_by_word=grouped_by_word
        self.grouped_by_tag=grouped_by_tag
        self.tag_pairs_grouped_by_tag=tag_pairs_grouped_by_tag
        

        self.fwd_tag_transition_matrix=np.zeros((self.n_tags,self.n_tags))
        self.p_tag_given_word_matrix=np.zeros((self.n_vocab,self.n_tags)) #check
        self.p_word_given_tag_matrix=np.zeros((self.n_tags,self.n_vocab)) #check

        for word0,word_tag_count0 in grouped_by_word.items():
            word_i0=self.token2idx.get(word0)
            word_count0=self.token_counter.get(word0)
            if word_i0==None or word_count0==None: continue
            #print(word0,word_i0,word_tag_count0)
            for tag0,tag_count0 in word_tag_count0.items():
                tag_i0=self.tag2idx.get(tag0)
                if tag_i0==None: continue
                ratio0=tag_count0/word_count0
                self.p_tag_given_word_matrix[word_i0][tag_i0]=ratio0

        for tag0,tag_word_count0 in grouped_by_tag.items():
            tag_i0=self.tag2idx.get(tag0)
            tag_count0=tag_counter.get(tag0)
            if tag_i0==None or tag_count0==None: continue
            #print(tag0,tag_i0,tag_word_count0)
            for word0,word_count0 in tag_word_count0.items():
                word_i0=self.token2idx.get(word0)
                if word_i0==None: continue
                ratio0=word_count0/tag_count0
                self.p_word_given_tag_matrix[tag_i0][word_i0]=ratio0

        #Now updating tag transitions
        for tag0,tag_tag_count0 in tag_pairs_grouped_by_tag.items():
            tag_i0=self.tag2idx.get(tag0)
            tag_count0=tag_counter.get(tag0)
            if tag_i0==None or tag_count0==None: continue
            #print(tag0,tag_i0,tag_tag_count0)
            for tag1,tag_count1 in tag_tag_count0.items():
                tag_i1=self.tag2idx.get(tag1)
                if tag_i1==None: continue
                ratio0=tag_count1/tag_count0
                self.fwd_tag_transition_matrix[tag_i0][tag_i1]=ratio0


    def train(self,params={}):
        all_loss=0
        for i0 in range(len(self.tagged_tokens_list)-1):
            item0,item1=self.tagged_tokens_list[i0],self.tagged_tokens_list[i0+1]
            print(item0,item1)
            token0,tag0=item0
            token1,tag1=item1
            token0_i,token1_i=self.token2idx.get(token0),self.token2idx.get(token1)
            token0_oh=create_one_hot_vec(token0_i,self.n_vocab)
            token1_oh=create_one_hot_vec(token1_i,self.n_vocab)
            pred01=token0_oh @ self.p_tag_given_word_matrix @ self.fwd_tag_transition_matrix @ self.p_word_given_tag_matrix
            pred01_with_tokens=[(v,round(float(w),3) ) for v,w in zip(self.vocab,pred01)]
            pred01_with_tokens.sort(key=lambda x:-x[-1])
            #print(pred01.shape)
            #print(pred01_with_tokens[:5])
            cur_loss=get_mse_loss(token1_oh,pred01)
            print(tag0, "cur_loss", cur_loss)
            
            for temp_tag0 in self.available_tags: #iterate over available tags for current token, to see which yields best loss
                shadow_obj.update_tag(item_i,temp_tag0)
                shadow_obj.calc_prob()
                temp_pred01=token0_oh @ self.p_tag_given_word_matrix @ self.fwd_tag_transition_matrix @ self.p_word_given_tag_matrix
                temp_loss=get_mse_loss(token1_oh,temp_pred01)
                print(temp_tag0, temp_loss)
            print("---")
                
            

            all_loss+=cur_loss
        print("all_loss", round(all_loss,4))
    def update_tag(self,item_i,new_tag):
        cur_item=self.tagged_tokens_list[item_i]
        self.tagged_tokens_list[item_i]=(cur_item[0],new_tag)
        print("updated", cur_item,self.tagged_tokens_list[item_i])
    def update_word_tags(self,word,new_tag):
        for i, cur_item in enumerate(self.tagged_tokens_list):
            if cur_item[0].lower()==word.lower(): self.tagged_tokens_list[item_i]=(self.tagged_tokens_list[item_i][0],new_tag)
        
        

        

# Corpus
sentences = [
    "the cat sat on the mat",
    "a dog barked loudly",
    "many birds fly in the sky",
    "children play in the garden",
    "some dogs run fast.",
    "the girl jumped high",
    "birds sing in trees",
    "the children run fast.",
    "many children also play",
    "the dog play with the children",
    "the birds are on the trees",
    "she jumped high",
    "they play in the house",
    "it sat on the table",
    "there were 15 boys",
    "there are also 12 men, 7 women"
]

#tags=["N","V","PREP","DET","ADJ","ADV", "<s>","</s>"]
#available_tags=["N","V","PREP","DET","ADJ","ADV"]
available_tags=["N","V","PREP","DET","OTHER","NUM","PUNC"]
all_tags=available_tags+["<s>","</s>"]


shadow_obj=shadow(sentences,available_tags)
shadow_obj.train()

# shadow_obj.update_tag(1,"DET")
# shadow_obj.update_tag(2,"N")
# shadow_obj.update_tag(6,"N")
# shadow_obj.update_tag(9,"DET")
# shadow_obj.update_tag(10,"N")
# shadow_obj.update_tag(11,"V")
# shadow_obj.update_tag(11,"OTHER")

# shadow_obj.update_tag(38,"N")


# shadow_obj.calc_prob()
# shadow_obj.train()

# item_i=37

# for tag0 in shadow_obj.available_tags:
#     print(tag0)
#     shadow_obj.update_tag(item_i,tag0)
#     shadow_obj.calc_prob()
#     shadow_obj.train()
    
    


['<s>', 'the', 'cat', 'sat', 'on', 'the', 'mat', '</s>']
['<s>', 'a', 'dog', 'barked', 'loudly', '</s>']
['<s>', 'many', 'birds', 'fly', 'in', 'the', 'sky', '</s>']
['<s>', 'children', 'play', 'in', 'the', 'garden', '</s>']
['<s>', 'some', 'dogs', 'run', 'fast', '.', '</s>']
['<s>', 'the', 'girl', 'jumped', 'high', '</s>']
['<s>', 'birds', 'sing', 'in', 'trees', '</s>']
['<s>', 'the', 'children', 'run', 'fast', '.', '</s>']
['<s>', 'many', 'children', 'also', 'play', '</s>']
['<s>', 'the', 'dog', 'play', 'with', 'the', 'children', '</s>']
['<s>', 'the', 'birds', 'are', 'on', 'the', 'trees', '</s>']
['<s>', 'she', 'jumped', 'high', '</s>']
['<s>', 'they', 'play', 'in', 'the', 'house', '</s>']
['<s>', 'it', 'sat', 'on', 'the', 'table', '</s>']
['<s>', 'there', 'were', '15', 'boys', '</s>']
['<s>', 'there', 'are', 'also', '12', 'men', ',', '7', 'women', '</s>']
('<s>', '<s>') ('the', 'V')
<s> cur_loss 0.015372973809238633
updated ('the', 'PREP') ('the', 'DET')
DET 0.015025349218469278
upd

In [74]:
for i,a in enumerate(shadow_obj.tagged_tokens_list): print(i,a)

0 ('<s>', '<s>')
1 ('the', 'V')
2 ('cat', 'N')
3 ('sat', 'V')
4 ('on', 'OTHER')
5 ('the', 'DET')
6 ('mat', 'N')
7 ('</s>', '</s>')
8 ('<s>', '<s>')
9 ('a', 'PREP')
10 ('dog', 'N')
11 ('barked', 'OTHER')
12 ('loudly', 'V')
13 ('</s>', '</s>')
14 ('<s>', '<s>')
15 ('many', 'V')
16 ('birds', 'NUM')
17 ('fly', 'OTHER')
18 ('in', 'NUM')
19 ('the', 'PREP')
20 ('sky', 'N')
21 ('</s>', '</s>')
22 ('<s>', '<s>')
23 ('children', 'PREP')
24 ('play', 'N')
25 ('in', 'NUM')
26 ('the', 'N')
27 ('garden', 'V')
28 ('</s>', '</s>')
29 ('<s>', '<s>')
30 ('some', 'DET')
31 ('dogs', 'PREP')
32 ('run', 'V')
33 ('fast', 'NUM')
34 ('.', 'PUNC')
35 ('</s>', '</s>')
36 ('<s>', '<s>')
37 ('the', 'PREP')
38 ('girl', 'PUNC')
39 ('jumped', 'V')
40 ('high', 'PREP')
41 ('</s>', '</s>')
42 ('<s>', '<s>')
43 ('birds', 'N')
44 ('sing', 'NUM')
45 ('in', 'DET')
46 ('trees', 'PUNC')
47 ('</s>', '</s>')
48 ('<s>', '<s>')
49 ('the', 'DET')
50 ('children', 'V')
51 ('run', 'PUNC')
52 ('fast', 'NUM')
53 ('.', 'PUNC')
54 ('</s>'

In [26]:
print(12)
print(dir(string))
print(string.punctuation)

12
['Formatter', 'Template', '_ChainMap', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '_re', '_sentinel_dict', '_string', 'ascii_letters', 'ascii_lowercase', 'ascii_uppercase', 'capwords', 'digits', 'hexdigits', 'octdigits', 'printable', 'punctuation', 'whitespace']
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
