-
Notifications
You must be signed in to change notification settings - Fork 1
/
markov.py
82 lines (65 loc) · 2.87 KB
/
markov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from typing import List
import pandas as pd
from nltk import lm
from nltk.util import ngrams
from nltk.lm.preprocessing import padded_everygram_pipeline
from sklearn.base import ClassifierMixin
class Markov(ClassifierMixin):
    """Character-level n-gram (Markov) classifier for loanword detection.

    Trains two Kneser-Ney smoothed language models — one on words labeled
    as loanwords, one on words labeled as native — and classifies a segment
    sequence as borrowed (1) when the loanword model assigns it a lower
    entropy than the native model, else native (0).
    """

    def __init__(self, X: pd.DataFrame, order: int = 3, smoothing: float = 0.1,
                 unk_cutoff: int = 2):
        """
        A Markov model that calculates the entropies of a list of sounds.

        :param X: the DataFrame containing the segments (column ``value``) and
            borrowing scores (column ``borrowing_score``; 1 = loan, 0 = native).
        :param order: the ngram order.
        :param smoothing: the smoothing discounting value, passed as the
            Kneser-Ney ``discount``.
        :param unk_cutoff: minimum count for a symbol to enter the vocabulary;
            rarer symbols are treated as unknown. Defaults to 2, the value
            previously hard-coded for both models.
        """
        self.order = order
        self.smoothing = smoothing
        # Split the training data by the gold borrowing label.
        loanwords = X[X.borrowing_score == 1].value
        nativewords = X[X.borrowing_score == 0].value
        # Each pipeline yields (padded everygram training stream, flat vocab stream).
        loanwords_train, loanwords_vocab = padded_everygram_pipeline(self.order, loanwords)
        nativewords_train, nativewords_vocab = padded_everygram_pipeline(self.order, nativewords)
        self.loanwords_model = lm.KneserNeyInterpolated(
            order=self.order,
            discount=self.smoothing,
            vocabulary=lm.Vocabulary(loanwords_vocab, unk_cutoff=unk_cutoff)
        )
        self.nativewords_model = lm.KneserNeyInterpolated(
            order=self.order,
            discount=self.smoothing,
            vocabulary=lm.Vocabulary(nativewords_vocab, unk_cutoff=unk_cutoff)
        )
        self.loanwords_model.fit(loanwords_train)
        self.nativewords_model.fit(nativewords_train)

    def calculate_entropies(self, model, words) -> List[float]:
        """
        Given a list of words, calculate their entropies under ``model``.

        :param model: a fitted ``nltk.lm`` language model to score with.
        :param words: a list of words, each segmented into a list of sounds.
            For example: [m, e, ŋ, g, o, s, o, ʔ]
        :returns: a list of entropies, one per word.
        """
        # Pad every word with <s>/</s> so boundary ngrams are scored,
        # mirroring the padding applied by padded_everygram_pipeline at
        # training time, then take only the highest-order ngrams.
        ngrams_ = [
            list(
                ngrams(word,
                       self.order,
                       pad_left=True,
                       pad_right=True,
                       left_pad_symbol='<s>',
                       right_pad_symbol='</s>'))
            for word in words
        ]
        return [model.entropy(sounds) for sounds in ngrams_]

    def predict(self, X: pd.DataFrame) -> List[int]:
        """
        Given a list of segments, predict whether the
        given segment is a loanword (1) or not (0).

        ref: Mattis 2020. Refactored for readability.

        :param X: the test set containing a column ``value`` representing segments.
        :returns: a list of classifications (1 = loanword, 0 = native).
        """
        native_entropies = self.calculate_entropies(self.nativewords_model, X.value)
        loan_entropies = self.calculate_entropies(self.loanwords_model, X.value)
        # Lower entropy means the model finds the word more predictable;
        # ties (equal entropy) resolve to native (0).
        predictions = [
            1 if loan_entropy < native_entropy else 0
            for native_entropy, loan_entropy in zip(native_entropies, loan_entropies)
        ]
        return predictions