# Train a sense2vec model with fastText

In [1]:
!pip install sense2vec fasttext



## Prepare dataset

In [2]:
import pandas as pd
import os
from sense2vec import Sense2Vec

In [3]:
!wget https://raw.githubusercontent.com/merrecalde/curso_la_plata_2019/master/simpsons_dataset.csv

--2022-07-29 09:30:40--  https://raw.githubusercontent.com/merrecalde/curso_la_plata_2019/master/simpsons_dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9373196 (8,9M) [text/plain]
Saving to: ‘simpsons_dataset.csv’


2022-07-29 09:30:41 (6,67 MB/s) - ‘simpsons_dataset.csv’ saved [9373196/9373196]



In [4]:
# Load the downloaded CSV into a DataFrame and show its (rows, columns) shape.
df = pd.read_csv('./simpsons_dataset.csv')
df.shape

(158314, 2)

In [5]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [6]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [7]:
# Keep only rows where every column is populated, renumber the index from 0,
# then re-display the per-column missing-value counts to confirm none remain.
complete_rows = df.notna().all(axis=1)
df = df.loc[complete_rows].reset_index(drop=True)
df.isna().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [8]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [9]:
# Collect the spoken lines as a plain Python list, one entry per DataFrame row.
sent = df['spoken_words'].tolist()

In [10]:
sent[0]

"No, actually, it was a little of both. Sometimes when a disease is in all the magazines and all the news shows, it's only natural that you think you have it."

In [11]:
# Write the corpus to lines.txt with one utterance per line (the parse script
# expects an input file with one sentence per line).
# Mode 'w' keeps the cell idempotent: in append mode a re-run would add a second
# copy of the corpus and, with no trailing newline, fuse it onto the last line.
# f.write is used because the joined text is a single string, not a list of lines.
with open('lines.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sent))

In [12]:
! head lines.txt

No, actually, it was a little of both. Sometimes when a disease is in all the magazines and all the news shows, it's only natural that you think you have it.
Where's Mr. Bergstrom?
I don't know. Although I'd sure like to talk to him. He didn't touch my lesson plan. What did he teach you?
That life is worth living.
The polls will be open from now until the end of recess. Now, just in case any of you have decided to put any thought into this, we'll have our final statements. Martin?
I don't think there's anything left to say.
Bart?
Victory party under the slide!
Mr. Bergstrom! Mr. Bergstrom!
Hey, hey, he Moved out this morning. He must have a new job -- he took his Copernicus costume.


## Download scripts for training

In [13]:
# Create the working directories used by the cells below.
# exist_ok=True lets the cell be re-run without raising FileExistsError once
# the directories are already present.
for stage_dir in ('scripts', 'parse_output', 'fasttext_output', 's2v'):
    os.makedirs(stage_dir, exist_ok=True)

In [14]:
%cd ./scripts

/Users/meister/DOCS/ONIX/sense2vec/scripts


In [15]:
!wget https://raw.githubusercontent.com/explosion/sense2vec/master/scripts/{01_parse.py,02_preprocess.py,04_fasttext_train_vectors.py,05_export.py}

--2022-07-29 09:30:43--  https://raw.githubusercontent.com/explosion/sense2vec/master/scripts/01_parse.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2492 (2,4K) [text/plain]
Saving to: ‘01_parse.py’


2022-07-29 09:30:44 (5,09 MB/s) - ‘01_parse.py’ saved [2492/2492]

--2022-07-29 09:30:44--  https://raw.githubusercontent.com/explosion/sense2vec/master/scripts/02_preprocess.py
Reusing existing connection to raw.githubusercontent.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 2512 (2,5K) [text/plain]
Saving to: ‘02_preprocess.py’


2022-07-29 09:30:44 (12,6 MB/s) - ‘02_preprocess.py’ saved [2512/2512]

--2022-07-29 09:30:44--  https://raw.githubusercontent.com/explosion/sense2vec/master/scripts/04_fasttext_train_vectors.py
Reusing existin

In [16]:
%cd ..

/Users/meister/DOCS/ONIX/sense2vec


## Train s2v

In [17]:
!python3 ./scripts/01_parse.py --help

Usage: 01_parse.py [OPTIONS] IN_FILE OUT_DIR [SPACY_MODEL]

  Step 1: Parse raw text with spaCy

  Expects an input file with one sentence per line and will output a .spacy
  file of the parsed collection of Doc objects (DocBin).

Arguments:
  IN_FILE        Path to input file  [required]
  OUT_DIR        Path to output directory  [required]
  [SPACY_MODEL]  Name of spaCy model to use  [default: en_core_web_sm]

Options:
  -n, --n-process INTEGER         Number of processes (multiprocessing)
                                  [default: 1]

  -m, --max-docs INTEGER          Maximum docs per batch  [default: 1000000]
  --install-completion [bash|zsh|fish|powershell|pwsh]
                                  Install completion for the specified shell.
  --show-completion [bash|zsh|fish|powershell|pwsh]
                                  Show completion for the specified shell, to
                                  copy it or customize the installation.

  --help          

In [18]:
# Step 1: parse lines.txt (one sentence per line) with spaCy and write the parsed
# docs as a .spacy file into ./parse_output. No model argument is passed, so the
# script's default spaCy model is used.
!python3 ./scripts/01_parse.py ./lines.txt ./parse_output

[38;5;4mℹ Using spaCy model en_core_web_sm[0m
Preprocessing text...
Docs: 131853 [06:24, 342.52/s]
[38;5;2m✔ Complete. Saved final parsed docs to file[0m
/Users/meister/DOCS/ONIX/sense2vec/parse_output/lines-1.spacy


In [19]:
!ls -l ./parse_output

total 21424
-rw-r--r--  1 meister  staff  10966030 Jul 29 09:37 lines-1.spacy


In [20]:
!python3 ./scripts/02_preprocess.py --help

Usage: 02_preprocess.py [OPTIONS] IN_FILE OUT_DIR [SPACY_MODEL]

  Step 2: Preprocess text in sense2vec's format

  Expects a binary .spacy input file consisting of the parsed Docs (DocBin)
  and outputs a text file with one sentence per line in the expected
  sense2vec format (merged noun phrases, concatenated phrases with
  underscores and added "senses").

  Example input: Rats, mould and broken furniture: the scandal of the UK's
  refugee housing

  Example output: Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ
  broken_furniture|NOUN :|PUNCT the|DET scandal|NOUN of|ADP the|DET UK|GPE
  's|PART refugee_housing|NOUN

Arguments:
  IN_FILE        Path to input file  [required]
  OUT_DIR        Path to output directory  [required]
  [SPACY_MODEL]  Name of spaCy model to use  [default: en_core_web_sm]

Options:
  -n, --n-process INTEGER         Number of processes (multiprocessing)
                                  [default: 1]

  --install-completion [bash|zsh|fish|pow

In [21]:
# Step 2: convert the parsed .spacy file into sense2vec's text format
# (tokens tagged as "phrase|SENSE", one sentence per line), written as a .s2v
# file into ./parse_output.
!python3 ./scripts/02_preprocess.py ./parse_output/lines-1.spacy ./parse_output

[38;5;4mℹ Using spaCy model en_core_web_sm[0m
[38;5;2m✔ Loaded 131853 parsed docs[0m
Docs: 131853 [00:54, 2422.74/s]
[38;5;2m✔ Successfully preprocessed 131853 docs (1711521 words)[0m
/Users/meister/DOCS/ONIX/sense2vec/parse_output/lines-1.s2v


In [22]:
!ls -l ./parse_output

total 54208
-rw-r--r--  1 meister  staff  16164081 Jul 29 09:38 lines-1.s2v
-rw-r--r--  1 meister  staff  10966030 Jul 29 09:37 lines-1.spacy


In [23]:
!head ./parse_output/lines-1.s2v

no|INTJ ,|PUNCT actually|ADV ,|PUNCT it|PRON was|AUX a|DET little|ADJ of|ADP both|PRON .|PUNCT sometimes|ADV when|SCONJ a|DET disease|NOUN is|AUX in|ADP all|DET the|DET magazines|NOUN and|CCONJ all|DET the|DET news|NOUN shows|NOUN ,|PUNCT it|PRON 's|AUX only|ADV natural|ADJ that|SCONJ you|PRON think|VERB you|PRON have|VERB it|PRON .|PUNCT
where|SCONJ 's|AUX mr.|PROPN Bergstrom|PERSON ?|PUNCT
I|PRON do|AUX n't|PART know|VERB .|PUNCT although|SCONJ I|PRON 'd|AUX sure|ADV like|VERB to|PART talk|VERB to|ADP him|PRON .|PUNCT he|PRON did|AUX n't|PART touch|VERB my|PRON lesson|NOUN plan|NOUN .|PUNCT what|PRON did|AUX he|PRON teach|VERB you|PRON ?|PUNCT
that|DET life|NOUN is|AUX worth|ADJ living|VERB .|PUNCT
the|DET polls|NOUN will|AUX be|AUX open|ADJ from|ADP now|ADV until|ADP the|DET end|NOUN of|ADP recess|NOUN .|PUNCT now|ADV ,|PUNCT just|ADV in|ADP case|NOUN any|PRON of|ADP you|PRON have|AUX decided|VERB to|PART put|VERB any|DET thought|NOUN into|ADP this|PRON ,|PUNCT we|PRON 'll|AUX h

In [24]:
!python3 ./scripts/04_fasttext_train_vectors.py --help

Usage: 04_fasttext_train_vectors.py [OPTIONS] OUT_DIR [IN_DIR]

  Step 4: Train the vectors

  Expects a directory of preprocessed .s2v input files, will concatenate
  them (using a temporary file on disk) and will use fastText to train a
  word2vec model. See here for installation instructions:
  https://github.com/facebookresearch/fastText

  Note that this script will call into fastText and expects you to pass in
  the built fasttext binary. The command will also be printed if you want to
  run it separately.

Arguments:
  OUT_DIR   Path to output directory  [required]
  [IN_DIR]  Path to directory with preprocessed .s2v file(s)

Options:
  -t, --n-threads INTEGER         Number of threads  [default: 10]
  -c, --min-count INTEGER         Minimum count for inclusion in vocab
                                  [default: 50]

  -s, --vector-size INTEGER       Dimension of word vector representations
                                  [default: 300]

  -e, --epoch

In [25]:
# Step 4: train fastText vectors on the .s2v file(s) found in ./parse_output and
# write the results into ./fasttext_output. No options are passed, so the
# script's defaults apply (e.g. --min-count 50, --vector-size 300).
!python3 ./scripts/04_fasttext_train_vectors.py ./fasttext_output ./parse_output

[38;5;4mℹ Training fastText model vectors[0m
[38;5;4mℹ Created temporary merged input file[0m
parse_output/s2v_input.tmp
Read 1M words
Number of words:  2017
Number of labels: 0
Progress: 100.0% words/sec/thread:   54581 lr:  0.000000 avg.loss:  2.487400 ETA:   0h 0m 0sh 0m 6s 0m 5s 2s
[38;5;2m✔ Successfully trained fastText model vectors[0m
[38;5;2m✔ Deleted temporary input file[0m
parse_output/s2v_input.tmp
[38;5;4mℹ Creating vocabulary file[0m
[38;5;2m✔ Successfully created vocabulary file[0m
fasttext_output/vocab.txt
[38;5;4mℹ Creating vectors file[0m
[38;5;2m✔ Successfully created vectors file[0m
fasttext_output/vectors.txt


In [26]:
!ls -l ./fasttext_output

total 13984
-rw-r--r--  1 meister  staff  7116926 Jul 29 09:38 vectors.txt
-rw-r--r--  1 meister  staff    39347 Jul 29 09:38 vocab.txt


In [27]:
!python3 ./scripts/05_export.py --help

Usage: 05_export.py [OPTIONS] IN_FILE VOCAB_FILE OUT_DIR

  Step 5: Export a sense2vec component

  Expects a vectors.txt and a vocab file trained with GloVe and exports a
  component that can be loaded with Sense2vec.from_disk.

Arguments:
  IN_FILE     Vectors file (text-based)  [required]
  VOCAB_FILE  Vocabulary file  [required]
  OUT_DIR     Path to output directory  [required]

Options:
  -r, --min-freq-ratio FLOAT      Frequency ratio threshold for discarding
                                  minority senses or casings  [default: 0.0]

  -s, --min-distance FLOAT        Similarity threshold for discarding
                                  redundant keys  [default: 0.0]

  --install-completion [bash|zsh|fish|powershell|pwsh]
                                  Install completion for the specified shell.
  --show-completion [bash|zsh|fish|powershell|pwsh]
                                  Show completion for the specified shell, to
                             

In [28]:
# Step 5: combine the trained vectors file and the vocabulary file into a
# sense2vec component saved under ./s2v.
!python3 ./scripts/05_export.py ./fasttext_output/vectors.txt ./fasttext_output/vocab.txt ./s2v

[38;5;2m✔ Created the sense2vec model[0m
[38;5;4mℹ 2016 vectors, 28 total senses[0m
[38;5;2m✔ Saved model to directory[0m
./s2v


In [29]:
!ls -l ./s2v

total 5024
-rw-r--r--  1 meister  staff      415 Jul 29 09:38 cfg
-rw-r--r--  1 meister  staff    84795 Jul 29 09:38 freqs.json
-rw-r--r--  1 meister  staff    23811 Jul 29 09:38 key2row
-rw-r--r--  1 meister  staff    31920 Jul 29 09:38 strings.json
-rw-r--r--  1 meister  staff  2419328 Jul 29 09:38 vectors
-rw-r--r--  1 meister  staff       22 Jul 29 09:38 vectors.cfg


## Check model

In [30]:
# Load the sense2vec component from the ./s2v directory.
s2v = Sense2Vec().from_disk('./s2v')

In [31]:
s2v.senses

['NORP',
 'LOC',
 'NOUN',
 'SCONJ',
 'PART',
 'GPE',
 'ORG',
 'WORK OF ART',
 'ADJ',
 'PRON',
 'PROPN',
 'AUX',
 'PUNCT',
 'INTJ',
 'NUM',
 'ADV',
 'TIME',
 'CARDINAL',
 'ORDINAL',
 'VERB',
 'CCONJ',
 'ADP',
 'SYM',
 'DET',
 'X',
 'LANGUAGE',
 'PERSON',
 'DATE']

In [32]:
len(list(s2v.items()))

2016

In [33]:
# Print the first PREVIEW_SIZE keys in the order the model yields them.
PREVIEW_SIZE = 100
# The vector is bound to `_vector` rather than `_` so that IPython's last-output
# variable `_` is not overwritten in the notebook namespace.
for i, (key, _vector) in enumerate(s2v):
    # Checking the limit before print() yields exactly PREVIEW_SIZE keys.
    if i >= PREVIEW_SIZE:
        break
    print(key)

.|PUNCT
,|PUNCT
I|PRON
!|PUNCT
you|PRON
the|DET
?|PUNCT
a|DET
...|PUNCT
and|CCONJ
's|AUX
to|PART
n't|PART
it|PRON
of|ADP
"|PUNCT
we|PRON
my|PRON
is|AUX
in|ADP
-|PUNCT
me|PRON
that|PRON
your|PRON
for|ADP
'm|AUX
to|ADP
what|PRON
do|AUX
oh|INTJ
're|AUX
on|ADP
but|CCONJ
be|AUX
just|ADV
are|AUX
with|ADP
not|PART
'll|AUX
this|DET
he|PRON
have|VERB
well|INTJ
now|ADV
was|AUX
this|PRON
so|ADV
's|PART
It|PRON
here|ADV
can|AUX
get|VERB
know|VERB
've|AUX
they|PRON
got|VERB
at|ADP
hey|INTJ
no|INTJ
up|ADP
do|VERB
that|DET
out|ADP
go|VERB
how|SCONJ
--|PUNCT
our|PRON
like|ADP
let|VERB
about|ADP
uh|INTJ
who|PRON
from|ADP
see|VERB
there|PRON
yeah|INTJ
think|VERB
why|SCONJ
no|DET
will|AUX
if|SCONJ
did|AUX
good|ADJ
want|VERB
could|AUX
ca|AUX
an|DET
na|PART
'|PUNCT
look|VERB
would|AUX
/|PUNCT
gon|VERB
time|NOUN
homer|PROPN
that|SCONJ
him|PRON
little|ADJ
us|PRON
when|SCONJ
all|DET


In [34]:
s2v.get_other_senses("gon|VERB")

['gon|NOUN']

In [35]:
s2v.most_similar('homer|PROPN', n=10)

[('marge|PROPN', 0.4995),
 ('barney|PROPN', 0.4748),
 ('dad|PROPN', 0.4677),
 ('Barney|PERSON', 0.4653),
 ("ma'am|NOUN", 0.4622),
 ('homer|NOUN', 0.4591),
 ('moe|PROPN', 0.438),
 ('Carl|PERSON', 0.426),
 ('honey|PROPN', 0.4254),
 ('sweetie|NOUN', 0.4235)]