<a href="https://colab.research.google.com/github/iamsmnt/test1/blob/master/abbreviation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter


In [5]:
corpus = input()

The quick brown fox jumps into the water and wait for the crocodile to come and sees that the water current suddenly increases and both the crocodile and fox starts flowing through the water


In [0]:
class LanguageNgramModel:
    """ 
    The model remembers and predicts which letters follow which.
    Constructor parameters:
        order - number of characters the model remembers, or n-1
        smoothing - the number, added to each counter for stability
        recursive - weight of the model of one order less
    Learned parameters:
        counter_ - storage of n-grams, as dict of counters  
        vocabulary_ - set of characters that the model knows
    """
    def __init__(self, order=1, smoothing=1.0, recursive=0.001):
        self.order = order
        self.smoothing = smoothing
        self.recursive = recursive
    
    def fit(self, corpus):
        """ Estimate freqency of all n-grams in the text
        parameters:
            corpus - a text string 
        """
        self.counter_ = defaultdict(lambda: Counter())
        self.vocabulary_ = set()
        for i, token in enumerate(corpus[self.order:]):
            context = corpus[i:(i+self.order)]
            self.counter_[context][token] += 1
            self.vocabulary_.add(token)
            print(i,self.counter_)
        self.vocabulary_ = sorted(list(self.vocabulary_))
        if self.recursive > 0 and self.order > 0:
            self.child_ = LanguageNgramModel(self.order-1, self.smoothing, self.recursive)
            self.child_.fit(corpus)
        print(self.counter_)
        print(self.vocabulary_)
            
    def get_counts(self, context):
        """ Estimate frequency of all symbols that may follow the context
        Parameters:
            context - text string (only the last self.order chars matter)
        Returns: 
            freq - vector of letter conditional frequencies, as pandas.Series
        """
        if self.order:
            local = context[-self.order:]
        else:
            local = ''
        freq_dict = self.counter_[local]
        freq = pd.Series(index=self.vocabulary_)
        for i, token in enumerate(self.vocabulary_):
            freq[token] = freq_dict[token] + self.smoothing
        if self.recursive > 0 and self.order > 0:
            child_freq = self.child_.get_counts(context) * self.recursive
            freq += child_freq
        return freq
    
    def predict_proba(self, context):
        """ Estimate probability of all symbols that may follow the context
        Parameters:
            context - text string (only the last self.order chars matter)
        Returns: 
            freq - vector of letter conditional frequencies, as pandas.Series
        """
        counts = self.get_counts(context)
        return counts / counts.sum()
    def single_log_proba(self, context, continuation):
        """ Estimate log probability of the certain continuation of the context
        Parameters:
            context - text string, known beginning of the phrase
            continuation - text string, its hypothetical end
        Returns: 
            result - a float, log of probability
        """
        result = 0.0
        for token in continuation:
            result += np.log(self.predict_proba(context)[token])
            context += token
        return result
    
    def single_proba(self, context, continuation):
        """ Estimate probability of the certain continuation of the context
        Parameters:
            context - text string, known beginning of the phrase
            continuation - text string, its hypothetical end
        Returns: 
            result - a float, probability
        """
        return np.exp(self.single_log_proba(context, continuation))
            
  

In [8]:
lang_model = LanguageNgramModel(1)
lang_model.fit(' abracadabra ')
print(lang_model.predict_proba(' bra'))

     0.181777
a    0.091297
b    0.272529
c    0.181686
d    0.181686
r    0.091025
dtype: float64


In [30]:
lang_model.fit('mississippi')

0 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884c812f28>, {'': Counter({'m': 1})})
1 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884c812f28>, {'': Counter({'m': 1, 'i': 1})})
2 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884c812f28>, {'': Counter({'m': 1, 'i': 1, 's': 1})})
3 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884c812f28>, {'': Counter({'s': 2, 'm': 1, 'i': 1})})
4 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884c812f28>, {'': Counter({'i': 2, 's': 2, 'm': 1})})
5 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884c812f28>, {'': Counter({'s': 3, 'i': 2, 'm': 1})})
6 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884c812f28>, {'': Counter({'s': 4, 'i': 2, 'm': 1})})
7 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884c812f28>, {'': Counter({'s': 4, 'i': 3, 'm': 1})})
8 defaultdict(<f

In [23]:
lang_model.fit('dandelion')

0 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884d290ea0>, {'': Counter({'d': 1})})
1 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884d290ea0>, {'': Counter({'d': 1, 'a': 1})})
2 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884d290ea0>, {'': Counter({'d': 1, 'a': 1, 'n': 1})})
3 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884d290ea0>, {'': Counter({'d': 2, 'a': 1, 'n': 1})})
4 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884d290ea0>, {'': Counter({'d': 2, 'a': 1, 'n': 1, 'e': 1})})
5 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884d290ea0>, {'': Counter({'d': 2, 'a': 1, 'n': 1, 'e': 1, 'l': 1})})
6 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884d290ea0>, {'': Counter({'d': 2, 'a': 1, 'n': 1, 'e': 1, 'l': 1, 'i': 1})})
7 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884d290ea0>, {'': Coun

In [24]:
lang_model.fit('cat')

0 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884c7f4ea0>, {'': Counter({'c': 1})})
1 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884c7f4ea0>, {'': Counter({'c': 1, 'a': 1})})
2 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7f884c7f4ea0>, {'': Counter({'c': 1, 'a': 1, 't': 1})})


In [32]:
s = [('yellow', 1), ('blue', 2), ('yellow', 3), ('blue', 4), ('red', 1)]
d = defaultdict(list)
for k, v in s:
  d[k].append(v)
  print(d.items())

dict_items([('yellow', [1])])
dict_items([('yellow', [1]), ('blue', [2])])
dict_items([('yellow', [1, 3]), ('blue', [2])])
dict_items([('yellow', [1, 3]), ('blue', [2, 4])])
dict_items([('yellow', [1, 3]), ('blue', [2, 4]), ('red', [1])])
