In [1]:
# Load libraries

import fastText as ft

In [2]:
# Show help page

help(ft.FastText)

Help on module fastText.FastText in fastText:

NAME
    fastText.FastText

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the BSD-style license found in the
    # LICENSE file in the root directory of this source tree. An additional grant
    # of patent rights can be found in the PATENTS file in the same directory.

FUNCTIONS
    load_model(path)
        Load a model given a filepath and return a model object.
    
    tokenize(text)
        Given a string of text, tokenize it and return a list of tokens
    
    train_supervised(input, lr=0.1, dim=100, ws=5, epoch=5, minCount=1, minCountLabel=0, minn=0, maxn=0, neg=5, wordNgrams=1, loss='softmax', bucket=2000000, thread=12, lrUpdateRate=100, t=0.0001, label='__label__', verbose=2, pretrainedVectors='')
        Train a supervised model and return a model object.
        
        input must be a filepath. The input text does not need to be tokenized

In [3]:
# Train an unsupervised skip-gram model on the sample text file.
# All other train_unsupervised arguments are left at their defaults.

m = ft.train_unsupervised('./data/fasttext/sample.txt', model='skipgram')

In [4]:
# Save the model

m.save_model('./data/fasttext/sample.bin')

In [5]:
# Load the model

m = ft.load_model('./data/fasttext/sample.bin')

In [6]:
# Show information of instance methods

help(m)

Help on _FastText in module fastText.FastText object:

class _FastText(builtins.object)
 |  This class defines the API to inspect models and should not be used to
 |  create objects. It will be returned by functions such as load_model or
 |  train.
 |  
 |  In general this API assumes to be given only unicode for Python2 and the
 |  Python3 equvalent called str for any string-like arguments. All unicode
 |  strings are then encoded as UTF-8 and fed to the fastText C++ API.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, model=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  get_dimension(self)
 |      Get the dimension (size) of a lookup vector (hidden layer).
 |  
 |  get_input_matrix(self)
 |      Get a copy of the full input matrix of a Model. This only
 |      works if the model is not quantized.
 |  
 |  get_input_vector(self, ind)
 |      Given an index, get the corresponding vector of the Input Matrix.
 |  
 |  get_labels(self, include

In [7]:
# Get a word vector

m.get_word_vector('apple')

array([ 1.6689689e-03, -2.3811102e-04,  1.3997646e-03, -3.1854757e-03,
       -1.6521695e-03, -2.1124871e-04,  5.0553156e-04, -3.5350578e-05,
       -5.4407719e-04, -8.6803967e-04,  3.3899541e-03,  1.8175674e-03,
        1.4265777e-03,  4.6955649e-04, -2.1355667e-03,  1.7310826e-04,
       -1.2147116e-03, -1.0363069e-04,  6.9989247e-04,  9.0023276e-04,
        6.0979772e-04, -9.8473299e-04,  4.2517553e-04, -1.0625961e-03,
        1.2552290e-03, -5.9807132e-04, -1.3126169e-03, -1.4560657e-04,
        1.0438816e-03, -2.7143772e-04, -2.1538280e-03,  1.5644798e-03,
        1.0823327e-03, -5.1334407e-04, -1.2542373e-03, -3.6767201e-04,
        4.7030728e-04,  1.1535233e-03,  1.0591045e-03,  2.2052901e-03,
        5.4061896e-04, -5.8454258e-04,  2.6625529e-04,  1.1832056e-03,
        3.1306047e-03, -4.5848521e-04,  8.1904401e-04, -1.0699088e-03,
       -1.1882274e-03,  2.9149237e-03,  3.8979432e-04, -7.4145255e-06,
       -1.4305267e-03, -3.6523014e-04,  3.3737475e-04, -8.6324115e-04,
      

In [8]:
# Get subwords

m.get_subwords('apple')

(['<ap',
  '<app',
  '<appl',
  '<apple',
  'app',
  'appl',
  'apple',
  'apple>',
  'ppl',
  'pple',
  'pple>',
  'ple',
  'ple>',
  'le>'],
 array([ 206658, 1444358, 1283422,  320209, 1074100, 1211464,  767175,
         254491, 1738747,  344746,  304028, 1534556, 1415910, 1646852]))

In [9]:
# Calculate the cosine similarity between two word vectors

import numpy as np

def similarity(m, w1, w2):
    """Return the cosine similarity between the vectors of two words.

    Parameters
    ----------
    m : object
        Model exposing ``get_word_vector(word)``.
    w1, w2 : str
        The two words to compare.

    Returns
    -------
    float
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero
        norm the ratio is undefined, so ``0.0`` is returned instead of NaN.
    """
    v1 = m.get_word_vector(w1)
    v2 = m.get_word_vector(w2)

    # Product of the two Euclidean norms; zero if either vector is all zeros.
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0

    return np.dot(v1, v2) / denom

similarity(m, 'apple', 'tree')

-0.17698961