<a href="https://colab.research.google.com/github/iamsmnt/abbreviation_expansion/blob/master/abbreviation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter

In [0]:
class LanguageNgramModel:
    """ 
    The model remembers and predicts which letters follow which.
    Constructor parameters:
        order - number of characters the model remembers, or n-1
        smoothing - the number, added to each counter for stability
        recursive - weight of the model of one order less
    Learned parameters:
        counter_ - storage of n-grams, as dict of counters  
        vocabulary_ - set of characters that the model knows
    """
    def __init__(self, order=1, smoothing=1.0, recursive=0.001,vocabulary_=[]):
        self.order = order
        self.smoothing = smoothing
        self.recursive = recursive
        self.vocabulary_ = vocabulary_
    
    def fit(self, corpus):
        """ Estimate freqency of all n-grams in the text
        parameters:
            corpus - a text string 
        """
        self.counter_ = defaultdict(lambda: Counter())
        self.vocabulary_ = set()
        for i, token in enumerate(corpus[self.order:]):
            context = corpus[i:(i+self.order)]
            self.counter_[context][token] += 1
            self.vocabulary_.add(token)
            print(i,self.counter_)
        self.vocabulary_ = sorted(list(self.vocabulary_))
        if self.recursive > 0 and self.order > 0:
            self.child_ = LanguageNgramModel(self.order-1, self.smoothing, self.recursive)
            self.child_.fit(corpus)
        print(self.counter_)
        print(self.vocabulary_)
            
    def get_counts(self, context):
        """ Estimate frequency of all symbols that may follow the context
        Parameters:
            context - text string (only the last self.order chars matter)
        Returns: 
            freq - vector of letter conditional frequencies, as pandas.Series
        """
        if self.order:
            local = context[-self.order:]
        else:
            local = ''
        freq_dict = self.counter_[local]
        freq = pd.Series(index=self.vocabulary_)
        for i, token in enumerate(self.vocabulary_):
            freq[token] = freq_dict[token] + self.smoothing
        if self.recursive > 0 and self.order > 0:
            child_freq = self.child_.get_counts(context) * self.recursive
            freq += child_freq
        return freq
    
    def predict_proba(self, context):
        """ Estimate probability of all symbols that may follow the context
        Parameters:
            context - text string (only the last self.order chars matter)
        Returns: 
            freq - vector of letter conditional frequencies, as pandas.Series
        """
        counts = self.get_counts(context)
        return counts / counts.sum()
    def single_log_proba(self, context, continuation):
        """ Estimate log probability of the certain continuation of the context
        Parameters:
            context - text string, known beginning of the phrase
            continuation - text string, its hypothetical end
        Returns: 
            result - a float, log of probability
        """
        result = 0.0
        for token in continuation:
            result += np.log(self.predict_proba(context)[token])
            context += token
        return result
    
    def single_proba(self, context, continuation):
        """ Estimate probability of the certain continuation of the context
        Parameters:
            context - text string, known beginning of the phrase
            continuation - text string, its hypothetical end
        Returns: 
            result - a float, probability
        """
        return np.exp(self.single_log_proba(context, continuation))
            
  

In [0]:
lang_model = LanguageNgramModel()


In [0]:
class MissingLetterModel:
    """ 
    The model remembers and predicts which letters are usually missed.
    Constructor parameters:
        order - number of characters the model remembers, or n-1
        smoothing_missed - the number added to missed counter
        smoothing_total - the number added to total counter
    Learned parameters:
        missed_counter_ - counter of occurences of the missed characters 
        total_counter_ - counter of occurences of all characters 
    """
    def __init__(self, order=0, smoothing_missed=0.3, smoothing_total=1.0):
        self.order = order
        self.smoothing_missed = smoothing_missed
        self.smoothing_total = smoothing_total
    
    def fit(self, sentence_pairs):
        """ Estimate of missing probability for each symbol
        Parameters:
            sentence_pairs - list of (original phrase, abbreviation)
        In the abbreviation, all missed symbols are replaced with "-"
        """
        self.missed_counter_ = defaultdict(lambda: Counter())
        self.total_counter_ = defaultdict(lambda: Counter())
        for (original, observed) in sentence_pairs:
            for i, (original_letter, observed_letter) \
                    in enumerate(zip(original[self.order:], observed[self.order:])):
                context = original[i:(i+self.order)]
                if observed_letter == '-':
                  self.missed_counter_[context][original_letter] += 1
                self.total_counter_[context][original_letter] += 1 
    
    def predict_proba(self, context, last_letter):
        """ Estimate of probability of last_letter being missed after context"""
        if self.order:
            local = context[-self.order:]
        else:
            local = ''
        missed_freq = self.missed_counter_[local][last_letter] + self.smoothing_missed
        total_freq = self.total_counter_[local][last_letter] + self.smoothing_total
        return missed_freq / total_freq
    
    def single_log_proba(self, context, continuation, actual=None):
        """ Estimate log probability that after context, 
            continuation is abbreviated to actual.
        If actual is None, it is assumed that nothing is abbreviated.
        """
        if not actual:
            actual = continuation
        result = 0.0
        for orig_token, act_token in zip(continuation, actual):
            pp = self.predict_proba(context, orig_token)
            if act_token != '-':
                pp = 1 - pp
            result += np.log(pp)
            context += orig_token
        return result
    
    def single_proba(self, context, continuation, actual=None):
        """ Estimate probability that after context, 
            continuation is abbreviated to actual.
        If actual is None, it is assumed that nothing is abbreviated.
        """
        return np.exp(self.single_log_proba(context, continuation, actual))

In [0]:
missed_model = MissingLetterModel()



In [0]:
from heapq import heappush, heappop

def generate_options(prefix_proba, prefix, suffix, 
                     lang_model, missed_model, optimism=0.5, cache=None):
    """ Generate partial options of abbreviation decoding (a helper function)
    Parameters:
        prefix_proba - log probability of decoded part of the abbreviation
        prefix - decoded part of the abbreviation
        suffix - not decoded part of the abbreviation
        lang_model - the language model
        missed_model - the abbreviation probability model
        optimism - coefficient for log likelihood of the word end
        cache - storage of suffix likelihood estimates
    Returns: list of options in the form (likelihood estimate, decoded part, 
        not decoded part, the new letter, the suffix likelihood estimate)
    """
    options = []
    for letter in lang_model.vocabulary_ + ['']:
        if letter:  # here we assume the character was missing
            next_letter = letter
            new_suffix = suffix
            new_prefix = prefix + next_letter
            proba_missing_state = - np.log(missed_model.predict_proba(prefix, letter))
        else:  # here we assume there was no missing character
            next_letter = suffix[0]
            new_suffix = suffix[1:]
            new_prefix = prefix + next_letter
            proba_missing_state = - np.log((1 - missed_model.predict_proba(prefix, next_letter)))
        proba_next_letter = - np.log(lang_model.single_proba(prefix, next_letter))
        if cache:
            proba_suffix = cache[len(new_suffix)] * optimism
        else:
            proba_suffix = - np.log(lang_model.single_proba(new_prefix, new_suffix)) * optimism
        proba = prefix_proba + proba_next_letter + proba_missing_state + proba_suffix
        options.append((proba, new_prefix, new_suffix, letter, proba_suffix))
    return options



In [0]:
def noisy_channel(word, lang_model, missed_model, freedom=3.0, 
                  max_attempts=10000, optimism=0.9, verbose=False):
    """ Suggest phrases, for which word may be the abbreviation 
    parameters:
        word - string, the abbreviation
        lang_model - the language model
        missed_model - the abbreviation probability model
        freedom - possible quality range of log likelihood of the candidates
        max_attempts - maximum number of iterations
        optimism - coefficient for log likelihood of the word end
        verbose - whether to print current candidates in the runtime
    returns: dict of keys - suggested phrases, and values - 
        minus log likelihood of candidates
        The less this value, the more likely the suggestion
    """
    query = word + ' '
    prefix = ' '
    prefix_proba = 0.0
    suffix = query
    full_origin_logprob = -lang_model.single_log_proba(prefix, query)
    no_missing_logprob = -missed_model.single_log_proba(prefix, query)
    best_logprob = full_origin_logprob + no_missing_logprob
    # add an empty prefix to the heap
    heap = [(best_logprob * optimism, prefix, suffix, '', best_logprob * optimism)]
    # add the default candidate (without missing characters) 
    candidates = [(best_logprob, prefix + query, '', None, 0.0)]
    if verbose:
        print('baseline score is', best_logprob)
    # prepare storage of the phrase suffix probabilities
    cache = {}
    for i in range(len(query)+1):
        future_suffix = query[:i]
        cache[len(future_suffix)] = -lang_model.single_log_proba('', future_suffix) # rough approximation
        cache[len(future_suffix)] += -missed_model.single_log_proba('', future_suffix) # at least add missingness
    for i in range(max_attempts):
        if not heap:
            break
        next_best = heappop(heap)
        if verbose:
            print(next_best)
        if next_best[2] == '':  # the phrase is fully decoded
            # if the phrase is good enough, add it to the answer
            if next_best[0] <= best_logprob + freedom:
                candidates.append(next_best)
                # update estimate of the best likelihood
                if next_best[0] < best_logprob:
                    best_logprob = next_best[0]
        else: # # the phrase is not fully decoded - generate more options
            prefix_proba = next_best[0] - next_best[4] # all proba estimate minus suffix
            prefix = next_best[1]
            suffix = next_best[2]
            new_options = generate_options(
                prefix_proba, prefix, suffix, lang_model, 
                missed_model, optimism, cache)
            # add only the solution potentioally no worse than the best + freedom
            for new_option in new_options: 
                if new_option[0] < best_logprob + freedom:
                    heappush(heap, new_option)
    if verbose:
        print('heap size is', len(heap), 'after', i, 'iterations')
    result = {}
    for candidate in candidates:
        if candidate[0] <= best_logprob + freedom:
            result[candidate[1][1:-1]] = candidate[0]
    return result

In [8]:
import re
text = input()

The most complex part of the model that supports all types of transactions, financial or non-financial. Also includes information about channels used to generate or settle the transaction Provides a detailed view of transactions and various postings of the transaction amount into the general ledger. Capable of monitoring mobile payments as well as payments that will be enabled through PSD2 directive. Important for the analysis of the potential of specific locations for opening new branches or set up new ATMs.Includes records of personally identifiable information, their mappings to various source systems and their categorization. Holds the information about all the consents given and their relationship to consent donors which allows for transparent GDPR compliance Account bank loan 


In [9]:
text2 = re.sub(r'[^a-z ]+', '', text.lower().replace('\n', ' '))
all_letters = ''.join(list(sorted(list(set(text2)))))
print(repr(all_letters)) # 

' abcdefghiklmnoprstuvwxyz'


In [0]:
missing_set =  (
    [(all_letters, '-' * len(all_letters))] * 3 # all chars missing
    + [(all_letters, all_letters)] * 10 # all chars are NOT missing
    + [('aeiouy', '------')] * 30 # only vowels are missing
)

In [11]:
big_lang_m = LanguageNgramModel(order=4, smoothing=0.001, recursive=0.01)
big_lang_m.fit(text2)
big_err_m = MissingLetterModel(order=0, smoothing_missed=0.1)
big_err_m.fit(missing_set)

0 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7fc06dfee9d8>, {'the ': Counter({'m': 1})})
1 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7fc06dfee9d8>, {'the ': Counter({'m': 1}), 'he m': Counter({'o': 1})})
2 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7fc06dfee9d8>, {'the ': Counter({'m': 1}), 'he m': Counter({'o': 1}), 'e mo': Counter({'s': 1})})
3 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7fc06dfee9d8>, {'the ': Counter({'m': 1}), 'he m': Counter({'o': 1}), 'e mo': Counter({'s': 1}), ' mos': Counter({'t': 1})})
4 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7fc06dfee9d8>, {'the ': Counter({'m': 1}), 'he m': Counter({'o': 1}), 'e mo': Counter({'s': 1}), ' mos': Counter({'t': 1}), 'most': Counter({' ': 1})})
5 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7fc06dfee9d8>, {'the ': Counter({'m': 1}), 'he m': Counter({'o': 1}), 'e mo': Counter({'s'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




746 defaultdict(<function LanguageNgramModel.fit.<locals>.<lambda> at 0x7fc06df9cb70>, {'t': Counter({'h': 15, 'i': 14, ' ': 10, 'r': 5, 'o': 5, 's': 4, 'e': 4, 'a': 2, 'y': 1, 't': 1, 'l': 1, 'm': 1}), 'h': Counter({'e': 13, 'a': 3, ' ': 2, 'i': 2, 'r': 1, 'o': 1}), 'e': Counter({' ': 17, 'n': 12, 's': 5, 'l': 4, 'd': 4, 'r': 4, 't': 3, 'w': 3, 'c': 3, 'i': 3, 'x': 1, 'm': 1, 'g': 1}), ' ': Counter({'t': 23, 'a': 15, 'o': 12, 'p': 8, 'i': 7, 'c': 6, 's': 6, 'm': 5, 'f': 4, 'n': 3, 'g': 3, 'd': 3, 'v': 3, 'w': 3, 'u': 2, 'l': 2, 'b': 2, 'r': 2, 'e': 1, 'h': 1}), 'm': Counter({'o': 5, 'a': 4, 'p': 2, 'e': 2, 's': 2}), 'o': Counter({'n': 16, 'r': 15, 'f': 8, 'u': 7, ' ': 5, 's': 2, 'm': 1, 'd': 1, 'v': 1, 'b': 1, 't': 1, 'c': 1, 'p': 1, 'l': 1, 'w': 1}), 's': Counter({' ': 25, 'e': 5, 'a': 4, 't': 3, 'o': 3, 'i': 2, 'p': 2, 'u': 1, 'd': 1, 'y': 1, 'h': 1}), 'c': Counter({'t': 5, 'o': 4, 'i': 3, 'h': 3, 'a': 3, 'l': 2, ' ': 1, 'e': 1}), 'p': Counter({'a': 5, 'o': 4, 'e': 4, 'p': 2, ' ': 

In [0]:
import pandas as pd
import random


In [0]:
col = []

In [0]:
for i in range(0,2000):
  col.append('acc no')
for i in range(2000,3500):
  col.append('accnt no')
for i in range(3500,5000):
  col.append('accn no')  

In [16]:
col

['acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',
 'acc no',

In [0]:
df = pd.DataFrame(columns = col)

In [23]:
df

Unnamed: 0,acc no,acc no.1,acc no.2,acc no.3,acc no.4,acc no.5,acc no.6,acc no.7,acc no.8,acc no.9,...,accn no,accn no.1,accn no.2,accn no.3,accn no.4,accn no.5,accn no.6,accn no.7,accn no.8,accn no.9


In [27]:
for colmn in df.columns:
  noisy_channel(colmn, big_lang_m, big_err_m)


KeyboardInterrupt: ignored

In [22]:
for i in range(len(df.columns)):
  print()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [0]:
noisy_channel('bn', big_lang_m, big_err_m)

{'bank': 10.106282395812887, 'bn': 10.414958038943075}

In [0]:
noisy_channel('acc no', big_lang_m, big_err_m)

{'account into': 17.94289715993588}

In [0]:
noisy_channel('ln', big_lang_m, big_err_m)

{'ln': 10.415717782064528}

In [0]:
noisy_channel('lo', big_lang_m, big_err_m)

{'lo': 11.969492295071948, 'loan': 11.418983063278672}

In [0]:
noisy_channel('sa', big_lang_m, big_err_m)

{'sa': 11.588476812576097}

In [0]:
noisy_channel('sme', big_lang_m, big_err_m)

{'sme': 15.035974178348798}

In [0]:
noisy_channel('sppr', big_lang_m, big_err_m)

{'suppor': 15.886396626540435, 'supports': 13.185151446713904}

In [0]:
noisy_channel('fian', big_lang_m, big_err_m)

{'financial': 16.255545728690453}