In [1]:
%matplotlib inline

In [2]:
import logging

# Configure the root logger: emit records at INFO level and above,
# each rendered as "<timestamp> : <level name> : <message>".
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

Принцип работы модели FastText:

Идея FastText в том, что морфология важна для определения значения слова. Если в Word2Vec для каждого слова создается отдельный вектор, то в FastText слово представлено как сумма векторов его компонентов (символьных n-грамм). Таким образом, векторы могут быть сформированы даже для слов, отсутствующих в словаре (out-of-vocabulary).

## Training models




In [76]:
#!pip3 install gensim --user

from pprint import pprint as print
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath
import re

def read_sentences_from_file(file_path):
    """Read a UTF-8 text file and tokenize it line by line.

    Every line of the file becomes one "sentence": surrounding whitespace is
    stripped and the remainder is split on runs of whitespace. No other
    preprocessing (lowercasing, punctuation removal, ...) is applied; add it
    here if the corpus needs it.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one entry per line, each entry being the list of tokens
        found on that line. Blank lines produce empty lists.
    """
    tokenized_lines = []
    with open(file_path, 'r', encoding='utf-8') as corpus:
        for raw_line in corpus:
            tokens = raw_line.strip().split()
            tokenized_lines.append(tokens)
    return tokenized_lines

# Load the corpus: one whitespace-tokenized sentence per line of input.txt.
custom_sentences = read_sentences_from_file('input.txt')
print(custom_sentences[:100]) # [['First', 'Citizen:'] ...

# Train the model; the hyperparameters are chosen the same way as for Word2Vec.
# Passing `sentences` to the constructor builds the vocabulary and runs
# training in a single step, so no separate build_vocab()/train() calls are
# needed and the cell depends only on names defined in this notebook.
custom_model = FastText(sentences=custom_sentences, vector_size=100, window=5, min_count=1, workers=4)

print(custom_model)

2024-01-25 19:23:01,677 : INFO : collecting all words and their counts
2024-01-25 19:23:01,678 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-01-25 19:23:01,699 : INFO : PROGRESS: at sentence #10000, processed 48251 words, keeping 9798 word types
2024-01-25 19:23:01,721 : INFO : PROGRESS: at sentence #20000, processed 102675 words, keeping 16517 word types
2024-01-25 19:23:01,744 : INFO : PROGRESS: at sentence #30000, processed 155232 words, keeping 21473 word types
2024-01-25 19:23:01,765 : INFO : collected 25670 word types from a corpus of 202651 raw words and 40000 sentences
2024-01-25 19:23:01,766 : INFO : Creating a fresh vocabulary


[['First', 'Citizen:'],
 ['Before', 'we', 'proceed', 'any', 'further,', 'hear', 'me', 'speak.'],
 [],
 ['All:'],
 ['Speak,', 'speak.'],
 [],
 ['First', 'Citizen:'],
 ['You',
  'are',
  'all',
  'resolved',
  'rather',
  'to',
  'die',
  'than',
  'to',
  'famish?'],
 [],
 ['All:'],
 ['Resolved.', 'resolved.'],
 [],
 ['First', 'Citizen:'],
 ['First,',
  'you',
  'know',
  'Caius',
  'Marcius',
  'is',
  'chief',
  'enemy',
  'to',
  'the',
  'people.'],
 [],
 ['All:'],
 ['We', "know't,", 'we', "know't."],
 [],
 ['First', 'Citizen:'],
 ['Let',
  'us',
  'kill',
  'him,',
  'and',
  "we'll",
  'have',
  'corn',
  'at',
  'our',
  'own',
  'price.'],
 ["Is't", 'a', 'verdict?'],
 [],
 ['All:'],
 ['No',
  'more',
  'talking',
  "on't;",
  'let',
  'it',
  'be',
  'done:',
  'away,',
  'away!'],
 [],
 ['Second', 'Citizen:'],
 ['One', 'word,', 'good', 'citizens.'],
 [],
 ['First', 'Citizen:'],
 ['We', 'are', 'accounted', 'poor', 'citizens,', 'the', 'patricians', 'good.'],
 ['What',
  'authorit

2024-01-25 19:23:01,869 : INFO : FastText lifecycle event {'msg': 'effective_min_count=1 retains 25670 unique words (100.00% of original 25670, drops 0)', 'datetime': '2024-01-25T19:23:01.869391', 'gensim': '4.3.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2024-01-25 19:23:01,870 : INFO : FastText lifecycle event {'msg': 'effective_min_count=1 leaves 202651 word corpus (100.00% of original 202651, drops 0)', 'datetime': '2024-01-25T19:23:01.870350', 'gensim': '4.3.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2024-01-25 19:23:02,034 : INFO : deleting the raw counts dictionary of 25670 items
2024-01-25 19:23:02,035 : INFO : sample=0.001 downsamples 45 most-common words
2024-01-25 19:23:02,037 : INFO : FastText lifecycle event {'msg': 'downsampling leaves estimated 168646.91294783

<gensim.models.fasttext.FastText object at 0x000002760F697760>


## Word vector lookup


All information necessary for looking up fastText words (incl. OOV words) is
contained in its ``model.wv`` attribute.

If you don't need to continue training your model, you can export & save this `.wv`
attribute and discard `model`, to save space and RAM.




In [77]:
# Keep a handle on the keyed vectors of the trained model for the lookups below.
wv = custom_model.wv
print(wv)

# FastText models support vector lookups for out-of-vocabulary words by
# summing up the character n-grams belonging to the word. First, check that a
# word from the corpus is present in the trained vocabulary.
first_in_vocab = 'first' in wv.key_to_index
print(first_in_vocab)

<gensim.models.fasttext.FastTextKeyedVectors object at 0x000002760EE04160>
True


In [78]:
# Is 'queen' one of the words stored in the trained vocabulary?
queen_in_vocab = 'queen' in wv.key_to_index
print(queen_in_vocab)

True


In [79]:
# Look up and show the embedding vector stored for 'queen'.
queen_vector = wv['queen']
print(queen_vector)

array([ 0.17036636,  0.11650486, -1.0494837 , -0.06330542,  0.58523583,
        0.50222754,  0.3042531 ,  1.1636422 , -0.40916002, -0.56979954,
        0.5112044 , -0.28100216,  0.02548225,  1.4353436 , -0.70828426,
       -0.80785763, -0.04859999, -0.03423959, -0.7232618 , -0.56549174,
       -0.5198323 ,  0.47164232, -0.5048996 , -0.2741103 , -0.16621596,
       -0.488023  , -0.77342886, -0.5510966 ,  0.17865784,  0.9906498 ,
       -0.7956326 ,  0.11637522,  1.1931915 , -0.56495327, -0.17591673,
        0.3511306 , -0.69072187, -0.33180344, -0.6337029 , -0.30346203,
        0.52230066, -0.5429514 ,  0.39075312, -0.27921364,  0.23890541,
       -0.3825281 , -0.05146267,  0.05710865,  0.6314949 ,  0.15411547,
        0.18805647, -0.2885942 ,  0.23656331, -1.8217834 , -0.01878889,
       -0.55638427, -0.6235669 , -0.02545961, -0.13432382,  0.1546252 ,
       -0.28258798, -0.5002011 , -0.04425686,  1.1610763 , -0.05549501,
        1.3503082 , -0.20217952, -0.00494334,  0.6147996 ,  0.51

## Similarity operations




In [80]:
# Similarity score between the vectors of "queen" and "king".
queen_king_similarity = wv.similarity("queen", "king")
print(queen_king_similarity)

0.9934779


In [81]:
# Similarity score between the vectors of "first" and "second".
first_second_similarity = wv.similarity("first", "second")
print(first_second_similarity)

0.99822974


### Other similarity operations

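Здесь хорошо видно, что модель ориентируется на последовательность символов: ближайшими к «king» оказываются слова с тем же буквенным окончанием («liking», «seeking», «asking») и варианты самого слова со знаками пунктуации («king?», «king:»).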


In [82]:
# Words whose vectors lie closest to the vector of "king", with their scores.
king_neighbours = wv.most_similar("king")
print(king_neighbours)

[('liking', 0.9999139904975891),
 ('king?', 0.9998611211776733),
 ("king'?", 0.9998340010643005),
 ('Drinking', 0.9998327493667603),
 ('king:', 0.9998314380645752),
 ('Doing', 0.9998264908790588),
 ('Wooing', 0.9998261332511902),
 ('seeking', 0.999819815158844),
 ('asking', 0.9998108744621277),
 ('king;', 0.9997879266738892)]


In [83]:
# Ask the model which of the candidate words fits least with the others.
candidate_words = "queen king knight".split()
odd_word = wv.doesnt_match(candidate_words)
print(odd_word)

'king'
