In [1]:
from pymystem3 import Mystem
from catboost import Pool, CatBoostClassifier 
import re

mystem = Mystem()

In [2]:
class Tag:
    
    def __init__(self, start, length, word=None, tag=None):
        self.word = word
        self.start = start
        self.length = length
        self.tag = tag
        
    def __str__(self):
        return f'{self.word} {self.start} {self.length} {self.tag}'

In [3]:
def intersect(word, tag):
    return (word.start >= tag.start and word.start <= tag.start + tag.length) or \
           (tag.start >= word.start and tag.start <= word.start + word.length)

def load_labels(data):
    lines = None
    with open('labels.txt') as labels:
        lines = labels.readlines()
    labels = []
    for line, seq in zip(lines, data):
        split = iter(line.split())        
        tags = []
        while True:
            token = next(split)
            if token == 'EOL':
                break
            else:
                start = int(token)
                length = int(next(split))
                tag = next(split)
                word = seq[start:start + length]
                tags.append(Tag(start=start, length=length, tag=tag, word=word))
        
        all_words = parse_words(seq)
        for w in all_words:
            for t in tags:
                if intersect(w, t):
                    w.tag = t.tag
        
        # all_words = list(filter(lambda x: not any(map(lambda y: intersect(y, x), tags)), parse_words(seq)))
        
        labels.append(all_words)
    return labels

def load_data(file):
    with open(file) as f:
        return list(f.readlines())

separators = ['.', ',', '—', ' ', '\n', '?', '!', '"', '(', ')', '«', '»']

def parse_words(line):
    words = []
    word = []
    for i in range(len(line)):
        if line[i] not in separators:
            word.append(line[i])
        else:
            if word:
                words.append(Tag(word=''.join(word), tag=None, start=i - len(word), length=len(word)))
                word = []
    return words

In [14]:
def get_features(word):
    has_capital = 1 if (re.match('[А-ЯЁ]', word.word) is not None) else 0
    caps = 1 if word.word == word.word.upper() else 0
    latin = 1 if re.match('[A-Z]', word.word) else 0
    
    tag = 'NA'
    if 'analysis' in mystem.analyze(word.word)[0]:
        x = mystem.analyze(word.word)[0]['analysis']
        if x != []:
            tag = re.match('[A-Z]+', x[0]['gr'])[0]   
    
    
    return [has_capital, caps, latin, tag]

In [17]:
# def context_features(words):
#     if len(words) == 1:
#         return [[0, 0, 0, 'NA'] + get_features(words[0]) + [0, 0, 0, 'NA']]
    
#     d = []
#     d.append([0, 0, 0, 'NA'] + get_features(words[0]) + get_features(words[1]))
#     for i in range(1, len(words) - 1):
#         d.append(get_features(words[i-1]) + get_features(words[i]) + get_features(words[i + 1]))
#     d.append(get_features(words[-2]) + get_features(words[-1]) + [0, 0, 0, 'NA'])
#     return d

CONTEXT_SIZE = 2

def context_features(words):
    features = list(map(get_features, words))
    padding = [0, 0, 0, 'NA']
    for i in range(CONTEXT_SIZE):
        features.append(padding)
        features.insert(0, padding)
    
    d = []
    for i in range(CONTEXT_SIZE, len(features) - CONTEXT_SIZE):
        context = []
        for j in range(CONTEXT_SIZE):
            context.extend(features[i - j])
        context.extend(features[i])
#         for j in range(CONTEXT_SIZE):
#             context.extend(features[i + j])
        d.append(context)
    return d

In [18]:
data = load_labels(load_data('train.txt'))

train_data = []
for words in data:
    train_data.extend(context_features(words))

print(train_data[:10])

[[1, 0, 0, 'S', 0, 0, 0, 'NA', 1, 0, 0, 'S'], [0, 0, 0, 'S', 1, 0, 0, 'S', 0, 0, 0, 'S'], [0, 0, 0, 'S', 0, 0, 0, 'S', 0, 0, 0, 'S'], [0, 0, 0, 'CONJ', 0, 0, 0, 'S', 0, 0, 0, 'CONJ'], [0, 0, 0, 'S', 0, 0, 0, 'CONJ', 0, 0, 0, 'S'], [0, 0, 0, 'PR', 0, 0, 0, 'S', 0, 0, 0, 'PR'], [0, 0, 0, 'A', 0, 0, 0, 'PR', 0, 0, 0, 'A'], [0, 0, 0, 'CONJ', 0, 0, 0, 'A', 0, 0, 0, 'CONJ'], [0, 0, 0, 'S', 0, 0, 0, 'CONJ', 0, 0, 0, 'S'], [0, 0, 0, 'S', 0, 0, 0, 'S', 0, 0, 0, 'S']]


In [19]:
train_label = []
for x in data:
    for y in x:
        train_label.append(y.tag)

In [20]:
cat_features = [i for i in range(12)]
train_dataset = Pool(data=train_data,
                     label=train_label,
                     cat_features=cat_features)

In [24]:
model = CatBoostClassifier(iterations=150,
                           learning_rate=1,
                           depth=10,
                           loss_function='MultiClass',
                           task_type='GPU')

model.fit(train_dataset)

0:	learn: 0.2864879	total: 27.2ms	remaining: 4.05s
1:	learn: 0.2382886	total: 52.9ms	remaining: 3.92s
2:	learn: 0.2232431	total: 79.9ms	remaining: 3.92s
3:	learn: 0.2175505	total: 105ms	remaining: 3.82s
4:	learn: 0.2149835	total: 131ms	remaining: 3.79s
5:	learn: 0.2137000	total: 156ms	remaining: 3.73s
6:	learn: 0.2127568	total: 185ms	remaining: 3.77s
7:	learn: 0.2121860	total: 210ms	remaining: 3.73s
8:	learn: 0.2118126	total: 236ms	remaining: 3.7s
9:	learn: 0.2112926	total: 263ms	remaining: 3.68s
10:	learn: 0.2108173	total: 290ms	remaining: 3.66s
11:	learn: 0.2102403	total: 316ms	remaining: 3.63s
12:	learn: 0.2095641	total: 344ms	remaining: 3.63s
13:	learn: 0.2092749	total: 371ms	remaining: 3.6s
14:	learn: 0.2089809	total: 398ms	remaining: 3.58s
15:	learn: 0.2088663	total: 425ms	remaining: 3.56s
16:	learn: 0.2085103	total: 454ms	remaining: 3.56s
17:	learn: 0.2083296	total: 485ms	remaining: 3.55s
18:	learn: 0.2081321	total: 513ms	remaining: 3.53s
19:	learn: 0.2079936	total: 538ms	remain

<catboost.core.CatBoostClassifier at 0x7ff172d3e8d0>

In [25]:
test_data = load_data('test.txt')

for line in test_data:
    words = parse_words(line)
    eval_data = context_features(words)
    classes = model.predict(eval_data)
    for cl, w in zip(classes, words):
        if cl[0] != 'None':
            print(w.start, w.length, cl[0], end=' ')
    print('EOL')

EOL
EOL
16 7 PERSON EOL
EOL
EOL
EOL
66 3 ORG EOL
56 6 PERSON 63 8 PERSON EOL
EOL
26 7 ORG EOL
EOL
6 15 ORG EOL
0 8 ORG EOL
15 2 ORG 322 10 ORG EOL
24 3 ORG 29 7 PERSON 45 10 ORG EOL
EOL
EOL
56 8 ORG EOL
EOL
EOL
EOL
12 3 ORG 17 8 PERSON 28 3 ORG 33 13 PERSON 50 3 ORG 55 19 PERSON EOL
EOL
76 6 PERSON 162 6 ORG EOL
57 2 ORG EOL
16 3 ORG 87 2 ORG EOL
8 7 PERSON 29 8 ORG 38 5 ORG 44 4 ORG EOL
EOL
2 6 PERSON 28 3 ORG EOL
6 4 PERSON EOL
7 7 PERSON 111 9 PERSON EOL
EOL
70 4 ORG 75 13 ORG EOL
27 8 ORG 101 7 ORG 121 3 ORG EOL
EOL
21 5 ORG 77 6 PERSON EOL
EOL
EOL
EOL
54 3 ORG 78 2 ORG EOL
90 8 ORG 99 5 ORG EOL
11 4 PERSON 16 7 PERSON 61 8 ORG EOL
EOL
0 8 ORG EOL
82 5 ORG EOL
EOL
47 8 PERSON 70 3 ORG EOL
EOL
EOL
EOL
76 8 PERSON EOL
14 5 ORG EOL
EOL
57 4 ORG 111 4 ORG 136 5 ORG EOL
133 6 PERSON EOL
EOL
21 5 ORG 27 3 ORG 32 7 PERSON EOL
EOL
EOL
EOL
EOL
29 4 PERSON EOL
26 7 ORG 53 6 ORG EOL
40 9 PERSON 50 8 PERSON 181 9 ORG EOL
105 9 PERSON 116 9 PERSON 126 10 PERSON EOL
6 10 PERSON EOL
16 2 ORG 77 2

EOL
EOL
111 8 ORG EOL
EOL
EOL
EOL


In [113]:
train_data = [["summer", 1924, 44],
              ["summer", 1932, 37],
              ["winter", 1980, 37],
              ["summer", 2012, 204]]

eval_data = [["winter", 1996, 197],
             ["winter", 1968, 37],
             ["summer", 2002, 77],
             ["summer", 1948, 59]]

cat_features = [0]

train_label = ["France", "USA", "USA", "UK"]
eval_label = ["USA", "France", "USA", "UK"]

In [114]:
train_dataset = Pool(data=train_data,
                     label=train_label,
                     cat_features=cat_features)

eval_dataset = Pool(data=eval_data,
                    label=eval_label,
                    cat_features=cat_features)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=10,
                           learning_rate=1,
                           depth=2,
                           loss_function='MultiClass')
# Fit model
model.fit(train_dataset)
# Get predicted classes
preds_class = model.predict(eval_dataset)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_dataset)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_dataset, 
                          prediction_type='RawFormulaVal')

0:	learn: 0.9417331	total: 1.72ms	remaining: 15.5ms
1:	learn: 0.8421839	total: 4.04ms	remaining: 16.1ms
2:	learn: 0.6597822	total: 5.47ms	remaining: 12.8ms
3:	learn: 0.6028493	total: 7.99ms	remaining: 12ms
4:	learn: 0.4900112	total: 8.82ms	remaining: 8.82ms
5:	learn: 0.4076408	total: 10.7ms	remaining: 7.13ms
6:	learn: 0.3458205	total: 13.5ms	remaining: 5.8ms
7:	learn: 0.2982687	total: 15ms	remaining: 3.75ms
8:	learn: 0.2608927	total: 15.7ms	remaining: 1.75ms
9:	learn: 0.2309514	total: 17.1ms	remaining: 0us


In [7]:
preds_class

array([['USA'],
       ['USA'],
       ['UK'],
       ['USA']], dtype='<U3')