**Subscribe** to this channel and **follow** *@dsbyhadi* on twitter for updates!

*email: datasciencebyhadi@gmail.com*

----

### Sentiment analysis via Embedding.
#### Tool: fastText

In [2]:
import fastText #version 0.8.22
import pandas as pd
import os
from fastText import train_supervised
import numpy as np

***Note:*** 
fasttext default arguments are different between python and CLI. Always check to find out what parameters are used.

In [3]:
# To show the output of all lines in a cell rather that just the last line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

`The architecture of fasttext supervised learning is like word2vec CBOW where the target word is replaced with the label.`

In [3]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

Link to download the data: https://www.kaggle.com/marklvl/sentiment-labelled-sentences-data-set#sentiment%20labelled%20sentences.zip

In [4]:
DATA_DIR = "~/Downloads/sentiment_labelled_sentences/"

In [5]:
data_amazon = pd.read_table(DATA_DIR+"amazon_cells_labelled.txt"
                          , sep='\t', header=None, names=['sentence', 'sentiment'])
data_imdb = pd.read_table(DATA_DIR+"imdb_labelled.txt"
                          , sep='\t', header=None, names=['sentence', 'sentiment'])
data_yelp = pd.read_table(DATA_DIR+"yelp_labelled.txt"
                          , sep='\t', header=None, names=['sentence', 'sentiment'])

In [6]:
all_data = pd.concat([data_amazon, data_imdb, data_yelp])
all_data.head(2)

Unnamed: 0,sentence,sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1


### Formatting the label as the default for fasttext

In [7]:
all_data['label'] = all_data.apply(lambda row: '__label__' + str(row.sentiment), axis=1)

In [8]:
all_data.head(2)

Unnamed: 0,sentence,sentiment,label
0,So there is no way for me to plug it in here i...,0,__label__0
1,"Good case, Excellent value.",1,__label__1


In [9]:
import re

def normalize(row):
    lower = row['sentence'].lower()
    #correct all multiple white spaces to a single white space
    no_mult_ws = re.sub('[\s]+', ' ', lower)
    text = no_mult_ws.strip()
    return text

all_data['normalized_sentence'] = all_data.apply(normalize, axis=1)

In [10]:
all_data.head(2)

Unnamed: 0,sentence,sentiment,label,normalized_sentence
0,So there is no way for me to plug it in here i...,0,__label__0,so there is no way for me to plug it in here i...
1,"Good case, Excellent value.",1,__label__1,"good case, excellent value."


In [11]:
labeled_data = all_data.drop(['sentence', 'sentiment'], axis=1)
labeled_data.head(2)

Unnamed: 0,label,normalized_sentence
0,__label__0,so there is no way for me to plug it in here i...
1,__label__1,"good case, excellent value."


In [12]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(labeled_data, test_size=0.35, random_state=4)

In [13]:
test_data.head(2)

Unnamed: 0,label,normalized_sentence
177,__label__1,"the atmosphere is modern and hip, while mainta..."
720,__label__1,"cute, quaint, simple, honest."


#### Save model to disk to be read by fasttext

In [14]:
labeled_data.to_csv(path_or_buf='./sentiment.all', header=False, index=False, sep='\t')
train_data.to_csv(path_or_buf='./sentiment.train', header=False, index=False, sep='\t')
test_data.to_csv(path_or_buf='./sentiment.test', header=False, index=False, sep='\t')

### Fasttext model training/eval/etc.
##### This model is a binary classifier.

In [84]:
train_data_path = os.path.join("./", 'sentiment.train')
test_data_path = os.path.join("./", 'sentiment.test')

sentiment_model = train_supervised(
    input=train_data_path,
    lr=1,
    dim=100,
    ws=5,
    epoch=5,
    minCount=1,
    minCountLabel=0,
    minn=2,
    maxn=3,
    neg=5,
    wordNgrams=2,
    loss="softmax", #hs, ns (multi-label), ova(Based on documentation should be multi-label but in reallity it's not!)
    bucket=200000,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors="",
)
print("On train")
print_results(*sentiment_model.test(train_data_path))
print("On test")
print_results(*sentiment_model.test(test_data_path))
sentiment_model.save_model("sentiment_model.bin")

On train
N	1786
P@1	0.941
R@1	0.941
On test
N	962
P@1	0.789
R@1	0.789


Check here for the input arguments explanations: 'https://fasttext.cc/docs/en/options.html'

In [85]:
# Testing trained model on some random document
sentiment_model.predict("you are not a cool guy but i really like you", k=2)
sentiment_model.predict("yeah..", k=2)

(('__label__0', '__label__1'), array([ 0.57224286,  0.42777717]))

(('__label__1', '__label__0'), array([ 0.83184433,  0.16817565]))

In [86]:
pred = sentiment_model.predict("the best soundtrack ever to anything.: i'm reading a lot of reviews saying that this is the best 'game soundtrack' and i figured that i'd write a review to disagree a bit. this in my opinino is yasunori mitsuda's ultimate masterpiece. the music is timeless and i'm been listening to it for years now and its beauty simply refuses to fade.the price tag on this is pretty staggering i must say, but if you are going to buy any cd for this much money, this is the only one that i feel would be worth every penny.", k=2)
pred[1][0]
pred[1][1]
pred[1][0]+pred[1][1]

0.59387063980102539

0.40614932775497437

1.0000199675559998

In [17]:
sentiment_model.predict("you are not a cool guy but i really like you", k=2)
sentiment_model.predict("yeah..", k=2)

(('__label__0', '__label__1'), array([ 0.55279148,  0.44722855]))

(('__label__1', '__label__0'), array([ 0.89712566,  0.10289439]))

Quantizing sacrifices a bit of performance to reduce the size of model. Good when size matters, like when deploying to
edge device, e.g. mobile.

In [18]:
sentiment_model.quantize(input=train_data_path, qnorm=True, retrain=True, cutoff=200000)
sentiment_model.save_model("sent_model.ftz")

In [19]:
print_results(*sentiment_model.test(test_data_path))

N	962
P@1	0.805
R@1	0.805


In [20]:
sentiment_model.predict("you are not a cool guy but i really like you", k=2)
sentiment_model.predict("yeah..", k=2)

(('__label__0', '__label__1'), array([ 0.57850939,  0.42151058]))

(('__label__1', '__label__0'), array([ 0.94297862,  0.05704143]))

In [21]:
# pr = sentiment_model.predict("you are not a cool guy but i really like you", k=1)
pr = sentiment_model.predict("nice", k=1)
pr

(('__label__1',), array([ 1.00001001]))

In [22]:
if "0" in pr[0][0]:
    print(1 - pr[1][0])
else:
    pr[1][0]

1.0000100135803223

In [23]:
def get_pos_probab(doc):
    pr = sentiment_model.predict(doc, k=1)
    if "0" in pr[0][0]:
        return round(1 - pr[1][0], 2)
    else:
        return round(pr[1][0], 2)

In [24]:
test_data['probability'] = test_data['normalized_sentence'].apply(get_pos_probab)
test_data['prediction'] = test_data['probability'].apply(lambda row: int(round(row, 0)))
test_data['orig_label'] = test_data['label'].apply(lambda row: row[-1:])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [25]:
test_data.head(5)

Unnamed: 0,label,normalized_sentence,probability,prediction,orig_label
177,__label__1,"the atmosphere is modern and hip, while mainta...",0.91,1,1
720,__label__1,"cute, quaint, simple, honest.",0.83,1,1
525,__label__0,"it's an empty, hollow shell of a movie.",0.2,0,0
630,__label__0,don't bother - go to the store.,0.02,0,0
137,__label__0,to those who find this movie intelligent or ev...,0.8,1,0


In [26]:
pd.options.mode.chained_assignment = None

In [27]:
train_data['probability'] = train_data['normalized_sentence'].apply(get_pos_probab)
train_data['prediction'] = train_data['probability'].apply(lambda row: int(round(row, 0)))
train_data['orig_label'] = train_data['label'].apply(lambda row: row[-1:])

In [28]:
train_data.head(3)

Unnamed: 0,label,normalized_sentence,probability,prediction,orig_label
15,__label__1,highly recommend for any one who has a blue to...,0.91,1,1
356,__label__0,"sadly, gordon ramsey's steak is a place we sha...",0.1,0,0
496,__label__1,the pancake was also really good and pretty la...,0.89,1,1


In [29]:
train_data.dtypes
train_data['orig_label'] = train_data['orig_label'].astype('int')
train_data['prediction'] = train_data['prediction'].astype('int')
train_data.dtypes

label                   object
normalized_sentence     object
probability            float64
prediction               int64
orig_label              object
dtype: object

label                   object
normalized_sentence     object
probability            float64
prediction               int64
orig_label               int64
dtype: object

In [30]:
test_data['orig_label'] = test_data['orig_label'].astype('category')
test_data['prediction'] = test_data['prediction'].astype('category')

In [31]:
test_data.head(6)
train_data.head(6)

Unnamed: 0,label,normalized_sentence,probability,prediction,orig_label
177,__label__1,"the atmosphere is modern and hip, while mainta...",0.91,1,1
720,__label__1,"cute, quaint, simple, honest.",0.83,1,1
525,__label__0,"it's an empty, hollow shell of a movie.",0.2,0,0
630,__label__0,don't bother - go to the store.,0.02,0,0
137,__label__0,to those who find this movie intelligent or ev...,0.8,1,0
555,__label__0,i know this is not like the other restaurants ...,0.11,0,0


Unnamed: 0,label,normalized_sentence,probability,prediction,orig_label
15,__label__1,highly recommend for any one who has a blue to...,0.91,1,1
356,__label__0,"sadly, gordon ramsey's steak is a place we sha...",0.1,0,0
496,__label__1,the pancake was also really good and pretty la...,0.89,1,1
753,__label__1,cheap but hey it works.. was pleasantly supris...,0.87,1,1
9,__label__1,a great touch.,1.0,1,1
975,__label__1,it is the best charger i have seen on the mark...,0.99,1,1


In [32]:
from sklearn import metrics

metrics.recall_score(train_data['orig_label'], train_data['prediction'], average='micro')
metrics.recall_score(test_data['orig_label'], test_data['prediction'], average='micro')
metrics.accuracy_score(test_data['orig_label'], test_data['prediction'])
metrics.recall_score(test_data['orig_label'], test_data['prediction'], average='macro')

0.98992161254199329

ValueError: Mix of label input types (string and number)

However, in our example, we are going to use the precision@1 provided by fasttext itself, although the above cells show how to compute any other arbitrary metric.

Model parameter search/tuning..

In [None]:
train_data_path = os.path.join("./", 'sentiment.train')
test_data_path = os.path.join("./", 'sentiment.test')

def grid_search(lr, dim, ws, epoch, minn, maxn, wordNgrams):
    for l_rate in lr:
        for d in dim:
            for s in ws:
                for ep in epoch:
                    for mi in minn:
                        for ma in maxn:
                            if (ma >= mi):
                                for n in wordNgrams:
                                    sentiment_model_mc = train_supervised(
                                                        input=train_data_path,
                                                        lr=l_rate,
                                                        dim=d,
                                                        ws=s,
                                                        epoch=ep,
                                                        minCount=1,
                                                        minCountLabel=0,
                                                        minn=mi,
                                                        maxn=ma,
                                                        neg=5,
                                                        wordNgrams=n,
                                                        loss='softmax', # ns, hs, softmax, ova (for multilabel classification)
                                                        bucket=200000,
                                                        lrUpdateRate=100,
                                                        t=1e-4,
                                                        label="__label__",
                                                        verbose=2,
                                                        pretrainedVectors="",
                                                    )
                                    train_res = round(sentiment_model_mc.test(train_data_path)[1], 2) #precision
                                    test_res = round(sentiment_model_mc.test(test_data_path)[1], 2)
                                    ratio = train_res/test_res
                                    if (ratio > 0.95) & (ratio < 1.06) & (train_res > 0.8):
                                        print("{}, {}: *** lr = {}, dim = {}, ws = {}, epoch = {}, minn = {}, maxn = {}, wordNgrams = {}".format(train_res, test_res, l_rate, d, s, ep, mi, ma, n))
                                    else:
                                        print("{}, {}:     lr = {}, dim = {}, ws = {}, epoch = {}, minn = {}, maxn = {}, wordNgrams = {}".format(train_res, test_res, l_rate, d, s, ep, mi, ma, n))
                                

In [None]:
# loss='softmax', minCount=1, neg=5

In [299]:
grid_search(lr=[0.9, 0.95], dim=[5, 10, 20, 40], ws=[4, 5, 6], 
            epoch=[1, 2, 3], minn=[2, 3, 4], maxn=[3, 4, 5, 6], wordNgrams=[1, 2, 3])

0.79, 0.75: *** lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 3, wordNgrams = 1
0.81, 0.75:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 3, wordNgrams = 2
0.82, 0.75:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 3, wordNgrams = 3
0.81, 0.75:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 4, wordNgrams = 1
0.81, 0.75:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 4, wordNgrams = 2
0.82, 0.76:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 4, wordNgrams = 3
0.82, 0.76:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 5, wordNgrams = 1
0.81, 0.74:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 5, wordNgrams = 2
0.81, 0.75:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 5, wordNgrams = 3
0.83, 0.76:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 6, wordNgrams = 1
0.8, 0.75:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 6, wordNgrams = 2
0.81, 0.74:     lr = 0

In [300]:
grid_search(lr=[0.95, 1], dim=[5, 20], ws=[5, 6], 
            epoch=[1, 2], minn=[3,4], maxn=[3, 4, 5, 6], wordNgrams=[1, 2, 3])

0.85, 0.77:     lr = 0.95, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 1
0.85, 0.77:     lr = 0.95, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 2
0.87, 0.77:     lr = 0.95, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 3
0.84, 0.76:     lr = 0.95, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 1
0.84, 0.77:     lr = 0.95, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 2
0.85, 0.76:     lr = 0.95, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 3
0.85, 0.78:     lr = 0.95, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 1
0.85, 0.78:     lr = 0.95, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 2
0.85, 0.78:     lr = 0.95, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 3
0.85, 0.78:     lr = 0.95, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 1
0.85, 0.78:     lr = 0.95, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 2
0.84, 0.78

In [392]:
grid_search(lr=[0.05, 0.1], dim=[5, 20], ws=[5, 6], 
            epoch=[1, 2], minn=[3,4], maxn=[3, 4, 5, 6], wordNgrams=[1, 2, 3])

0.57, 0.58:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 1
0.58, 0.56:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 2
0.57, 0.56:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 3
0.56, 0.57:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 1
0.56, 0.56:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 2
0.56, 0.56:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 3
0.54, 0.56:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 1
0.55, 0.56:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 2
0.55, 0.56:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 3
0.55, 0.54:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 1
0.55, 0.55:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 2
0.55, 0.55

In [395]:
grid_search(lr=[0.5, 2], dim=[5, 20], ws=[5, 6], 
            epoch=[1, 2], minn=[3,4], maxn=[3, 4, 5, 6], wordNgrams=[1, 2, 3])

0.74, 0.71:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 1
0.71, 0.67:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 2
0.68, 0.66:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 3
0.67, 0.64:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 1
0.65, 0.63:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 2
0.64, 0.62:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 3
0.65, 0.63:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 1
0.64, 0.62:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 2
0.62, 0.62:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 3
0.63, 0.6:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 1
0.62, 0.6:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 2
0.61, 0.59:     lr = 0.

`0.83, 0.79: *** lr = 0.95, dim = 20, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 3`

In [397]:
all_data_path = os.path.join('./', 'sentiment.all')

sentiment_model_all = train_supervised(
    input=all_data_path,
    lr=0.95,
    dim=20,
    ws=5,
    epoch=1,
    minCount=1,
    minCountLabel=0,
    minn=3,
    maxn=6,
    neg=5,
    wordNgrams=3,
    loss='softmax',
    bucket=200000,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors=""
)
print_results(*sentiment_model_all.test(all_data_path))
sentiment_model_all.save_model("sentiment_model_all.bin")

N	2748
P@1	0.795
R@1	0.795


** word vectors **

In [4]:
sentiment_model_all = fastText.load_model("sentiment_model_all.bin")

In [5]:
sentiment_model_all.get_word_vector("cute")

array([ -6.85028313e-03,  -6.53951382e-03,   8.25009402e-03,
        -2.35103630e-03,   4.64555202e-03,   9.36317723e-03,
         3.48843052e-03,   1.16713457e-02,  -5.62884146e-03,
        -1.50917645e-03,  -9.67059936e-03,  -2.65773316e-03,
         9.33713527e-05,  -7.78717501e-03,  -3.55903874e-03,
         7.16797658e-04,  -7.84560945e-03,  -9.91582684e-03,
        -9.07231495e-03,   1.83775611e-02], dtype=float32)

In [6]:
sentiment_model_all.get_sentence_vector("nice item")

array([ 0.01952585, -0.00029977, -0.0116138 , -0.03219083,  0.0052972 ,
       -0.00704678,  0.05084702, -0.03865651, -0.02823191, -0.00651574,
       -0.00146935,  0.03322271,  0.01117315,  0.02758684, -0.00414562,
       -0.00177466,  0.00453784,  0.00702936,  0.00239116, -0.02514572], dtype=float32)

In [328]:
??sentiment_model_all.get_sentence_vector()

In [7]:
sentiment_model_all.get_subwords("high quality")

(['<hi',
  '<hig',
  '<high',
  '<high ',
  'hig',
  'high',
  'high ',
  'high q',
  'igh',
  'igh ',
  'igh q',
  'igh qu',
  'gh ',
  'gh q',
  'gh qu',
  'gh qua',
  'h q',
  'h qu',
  'h qua',
  'h qual',
  ' qu',
  ' qua',
  ' qual',
  ' quali',
  'qua',
  'qual',
  'quali',
  'qualit',
  'ual',
  'uali',
  'ualit',
  'uality',
  'ali',
  'alit',
  'ality',
  'ality>',
  'lit',
  'lity',
  'lity>',
  'ity',
  'ity>',
  'ty>'],
 array([ 64551, 179660, 144286,  42140,  35950, 148644,  85966, 155465,
        137226, 204512, 140607,  23998, 182437, 169092, 202335,  81922,
         84435, 117954,  65525,  61421,  98772, 190411, 104063, 114122,
         61317,  14493, 140276,  14498,  27332, 146947, 149663,  38522,
         85874, 203604, 105187, 124621, 184669,  76804,  33216,  36274,
        128470,  71191]))

In [332]:
??sentiment_model_all.get_subwords("cute")

In [339]:
words = sentiment_model_all.get_words()
for w in words[0:10]:
    print("{}  --> {}".format(w, sentiment_model_all.get_word_vector(w)))

</s>  --> [-0.04100632  0.04522638 -0.01677407  0.00292806  0.0229811   0.00338773
  0.00756884  0.02253154  0.06442698  0.04932172 -0.01690026  0.00282984
  0.04310306  0.02014819 -0.01655152 -0.01285936 -0.00584081  0.02349056
  0.00740045 -0.02155281]
the  --> [-0.0071901   0.01994328  0.02624204  0.03745943 -0.00059376 -0.00149082
 -0.06161203  0.05379729  0.0401204   0.01964574 -0.02374848 -0.01760563
 -0.01573259 -0.02479217 -0.03392079 -0.0255501   0.01105041 -0.00880719
  0.00618013  0.02632367]
and  --> [ 0.0916687  -0.05802166 -0.08009838 -0.14427777  0.01681603 -0.00543345
  0.22197804 -0.24706686 -0.15971978 -0.08586492  0.02742781  0.11533955
  0.06565166  0.11805582  0.01800646 -0.01872155 -0.03583391  0.01949752
 -0.00679308 -0.21350285]
a  --> [ 0.00700946  0.02059963  0.02920412  0.00793921 -0.00057545  0.01418647
 -0.02106544  0.03127332  0.05109321  0.01088802 -0.05195991 -0.00320869
  0.00074375 -0.01648562  0.00107517 -0.00479806 -0.00248752  0.00536313
 -0.0107729

### Check out the following for full set of functionalities you can get from fasttext

'https://fasttext.cc/docs/en/unsupervised-tutorial.html'

In case you need a smaller model to deploy to edge device:

In [315]:
sentiment_model_all.quantize(input=all_data_path, qnorm=True, retrain=True, cutoff=200000)
print_results(*sentiment_model_all.test(all_data_path))
sentiment_model_all.save_model("sentiment_model_all.ftz")

N	2748
P@1	0.900
R@1	0.900


In [348]:
from fastText import train_unsupervised

In [399]:
model = train_unsupervised(input=os.path.join('./', 'sentiment.all'), model='skipgram')

In [401]:
model.save_model("unsup_model")

In [402]:
model.get_word_vector("item")

array([ 0.11493368,  0.03844072, -0.30901629, -0.05930671,  0.17192714,
       -0.11968267, -0.02324919,  0.08157369,  0.12652105,  0.09489042,
       -0.16677582,  0.05363207,  0.09907467,  0.06755538,  0.02177368,
        0.09048193, -0.02327499,  0.09259994, -0.16422673,  0.23294239,
        0.14148104, -0.03935983,  0.13805135, -0.12262625, -0.12731853,
        0.16763139,  0.3956145 , -0.07006402,  0.13103153,  0.07855626,
       -0.21472511, -0.06144235, -0.02110785, -0.26635283,  0.03972701,
       -0.1000495 ,  0.19960244, -0.25491592, -0.1815532 , -0.05189068,
        0.0047689 , -0.01270215, -0.09911472,  0.20988707,  0.17559691,
       -0.24684374,  0.11519057, -0.11204102, -0.26574621, -0.12333245,
       -0.07794633, -0.09868582, -0.16511314,  0.04410603,  0.32537642,
       -0.27806637,  0.06206042,  0.22618234,  0.06395185, -0.03906425,
       -0.19645485,  0.05740105,  0.09258997, -0.16286848,  0.03635918,
        0.16308087, -0.16798693,  0.07991626, -0.02836273, -0.09

In [403]:
wiki_vectores_path = '/Users/hadi.minooei/Downloads/wiki-news-300d-1M.vec'

In [53]:
??train_supervised

For cells below, I downloaded english word embeddings of wikipedia (and some other sources) from fasttext website: https://fasttext.cc/docs/en/english-vectors.html

They have pretrained vectors for many other languages (157 languages as of June 2019): https://fasttext.cc/docs/en/crawl-vectors.html

In [65]:
train_data_path = os.path.join("./", 'sentiment.train')
test_data_path = os.path.join("./", 'sentiment.test')

sentiment_model_transfer = train_supervised(
    input=train_data_path,
    lr=0.95,
    dim=300,
    epoch=1,
    loss='softmax',
    bucket=1000,
    label="__label__",
    pretrainedVectors=''
)
print_results(*sentiment_model_transfer.test(train_data_path))
print_results(*sentiment_model_transfer.test(test_data_path))

N	1786
P@1	0.796
R@1	0.796
N	962
P@1	0.733
R@1	0.733


In [64]:
train_data_path = os.path.join("./", 'sentiment.train')
test_data_path = os.path.join("./", 'sentiment.test')

sentiment_model_transfer = train_supervised(
    input=train_data_path,
    lr=0.95,
    dim=300,
    epoch=1,
    loss='softmax',
    bucket=1000,
    label="__label__",
    pretrainedVectors="/Users/hadi.minooei/Downloads/wiki-news-300d-1M.vec"
)
print_results(*sentiment_model_transfer.test(train_data_path))
print_results(*sentiment_model_transfer.test(test_data_path))
sentiment_model_all.save_model("sentiment_model_transfer.bin")

N	1786
P@1	1.000
R@1	1.000
N	962
P@1	0.813
R@1	0.813


In [63]:
train_data_path = os.path.join("./", 'sentiment.train')
test_data_path = os.path.join("./", 'sentiment.test')

sentiment_model_transfer = train_supervised(
    input=train_data_path,
    lr=0.005,
    dim=300,
    epoch=1,
    loss='softmax',
    bucket=1000,
    label="__label__",
    pretrainedVectors="/Users/hadi.minooei/Downloads/wiki-news-300d-1M.vec"
)
print_results(*sentiment_model_transfer.test(train_data_path))
print_results(*sentiment_model_transfer.test(test_data_path))
sentiment_model_all.save_model("sentiment_model_transfer.bin")

N	1786
P@1	0.820
R@1	0.820
N	962
P@1	0.780
R@1	0.780


In [60]:
train_data_path = os.path.join("./", 'sentiment.train')
test_data_path = os.path.join("./", 'sentiment.test')

%time sentiment_model_all = train_supervised(input=train_data_path, lr=0.0005, dim=300, epoch=7, loss='ns', bucket=1000, label="__label__", pretrainedVectors="/Users/hadi.minooei/Downloads/wiki-news-300d-1M.vec")
print_results(*sentiment_model_all.test(train_data_path))
print_results(*sentiment_model_all.test(test_data_path))
# sentiment_model_all.save_model("sentiment_model_all.bin")

CPU times: user 3min 52s, sys: 2.58 s, total: 3min 55s
Wall time: 3min 35s
N	1786
P@1	0.822
R@1	0.822
N	962
P@1	0.770
R@1	0.770
