In [45]:
from src.models.algorithms import FastTextEstimator
from src.utils import save_json

In [5]:
from src.data.datasets import load_dataset
from src.data.estimators import SpacyTokenize
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords
import string
from src.utils import save_json

In [40]:
import logging

# Configure the root logger at DEBUG verbosity and keep a handle on it for this notebook.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()

In [4]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## On Yelp (data not yet fully preprocessed)

In [6]:
# Load the 'yelp' dataset, requesting 1000 reviews via the num_reviews option.
ds = load_dataset('yelp', num_reviews=1000)

In [7]:
# Peek at the first entry of the dataset (a raw review string).
ds.data[0]

'I went as a walk-in on a random Wednesday.  Tom was busy, but he squeezed me in.  In about 15 minutes and with very little direction from me I had the tighest fade this side of the Mississippi.  It was pretty neat just to watch him work.  He\'s truly a master at what he does and I regret every dollar I\'ve ever spent at those bargain "value" chains where the stylists are typically a revolving door of mediocrity.  I challenge you to find a better cut for $14.'

In [8]:
# Print the description text bundled with the dataset object.
print(ds.DESCR)


Yelp Dataset JSON

Each file is composed of a single object type, one JSON-object per-line.

Take a look at some examples to get you started: https://github.com/Yelp/dataset-examples.

Note: the follow examples contain inline comments, which are technically not valid JSON. This is done here to simplify the documentation and explaining the structure, the JSON files you download will not contain any comments and will be fully valid JSON.
business.json

Contains business data including location data, attributes, and categories.

{
    // string, 22 character unique string business id
    "business_id": "tnhfDv5Il8EaGSXZGiuQGg",

    // string, the business's name
    "name": "Garaje",

    // string, the neighborhood's name
    "neighborhood": "SoMa",

    // string, the full address of the business
    "address": "475 3rd St",

    // string, the city
    "city": "San Francisco",

    // string, 2 character state code, if applicable
    "state": "CA",

    // string, the postal code
   

In [9]:
# Build the tokenizer with lemmatization enabled. The punctuation characters and
# English stop words are handed over as options (presumably to be filtered out of
# the token stream -- confirm against SpacyTokenize).
tokenizer = SpacyTokenize(
    n_threads=6,
    punctuation=string.punctuation,
    stopwords=stopwords,
    lemmatize=True,
)

In [10]:
# Fit the tokenizer on the raw reviews; the cell output is the fitted estimator's repr.
tokenizer.fit(ds.data)

SpacyTokenize(batch_size=50, language_model='en_core_web_sm', lemmatize=True,
       n_threads=6, punctuation='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~',
       stopwords=frozenset({'now', 'anyhow', 'front', 'something', 'however', 'below', 'four', 'latterly', 'seemed', 'thereafter', 'we', 'these', 'whether', 'is', 'a', 'whereupon', 'out', 'in', 'me', 'mostly', 'if', 'becomes', 'per', 'sometimes', 'were', 'whereas', 'twelve', 'yourself', 'system', 'too', 'e...', 'after', 'could', 'towards', 'through', 'thereby', 'by', 'sometime', 'often', 'for', 'describe'}))

In [11]:
# Tokenize the raw reviews; `sentences` is the input passed to the FastText model below.
sentences = tokenizer.transform(ds.data)

In [12]:
# FastText hyper-parameters, one per line. These same values are repeated in the
# JSON spec later in the notebook, so keep the two in sync when tuning.
model = FastTextEstimator(
    min_count=5,      # the training log reports this as effective_min_count=5
    min_n=3,
    max_n=6,
    size=300,         # matches the second dimension of embedding.shape
    sg=1,
    window=10,
    word_ngrams=1,
    iter=5,
    random_state=42,  # fixed seed
)
model.fit(sentences)

2018-10-05 14:45:59,057 - word2vec - INFO - collecting all words and their counts
2018-10-05 14:45:59,059 - word2vec - INFO - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-05 14:45:59,074 - word2vec - INFO - collected 8006 word types from a corpus of 54651 raw words and 9747 sentences
2018-10-05 14:45:59,076 - word2vec - INFO - Loading a fresh vocabulary
2018-10-05 14:45:59,083 - word2vec - INFO - effective_min_count=5 retains 1780 unique words (22% of original 8006, drops 6226)
2018-10-05 14:45:59,084 - word2vec - INFO - effective_min_count=5 leaves 45048 word corpus (82% of original 54651, drops 9603)
2018-10-05 14:45:59,090 - word2vec - INFO - deleting the raw counts dictionary of 8006 items
2018-10-05 14:45:59,092 - word2vec - INFO - sample=0.001 downsamples 60 most-common words
2018-10-05 14:45:59,093 - word2vec - INFO - downsampling leaves estimated 39968 word corpus (88.7% of prior 45048)
2018-10-05 14:45:59,236 - fasttext - INFO - estimated required 

In [13]:
%%time
# Time the call that produces the embedding matrix from the fitted model.
embedding = model.transform(sentences)

CPU times: user 67.5 ms, sys: 4.63 ms, total: 72.1 ms
Wall time: 70.5 ms


In [14]:
# NOTE(review): the displayed shape is (1780, 300). 1780 equals the "retains 1780
# unique words" figure in the training log, so this looks like one row per
# vocabulary word rather than per input sentence -- confirm this is intended.
embedding.shape

(1780, 300)

## Now using the train_model script

```
## train / fit / build models
train: models/model_list.json
	$(PYTHON_INTERPRETER) -m src.models.train_model model_list.json
 ```

In [54]:
# Dump the estimator's full parameter set (explicit arguments plus defaults) as a
# reference for writing the JSON spec below.
model.get_params()

{'alpha': 0.025,
 'batch_words': 10000,
 'bucket': 2000000,
 'callbacks': (),
 'cbow_mean': 1,
 'hashfxn': <function hash(obj, /)>,
 'hs': 0,
 'iter': 5,
 'max_n': 6,
 'max_vocab_size': None,
 'min_alpha': 0.0001,
 'min_count': 5,
 'min_n': 3,
 'negative': 5,
 'ns_exponent': 0.75,
 'null_word': 0,
 'random_state': 42,
 'restrict_to_corpus': True,
 'sample': 0.001,
 'sg': 1,
 'size': 300,
 'sorted_vocab': 1,
 'trim_rule': None,
 'window': 10,
 'word_ngrams': 1,
 'workers': 3}

In [24]:
# Model-list spec for the train_model script: a list with one entry per model.
# The algorithm parameters mirror the values passed to FastTextEstimator earlier
# in this notebook.
json_spec = [
    {
        "dataset": "yelp",
        "dataset_params": {"num_reviews": 1000},
        "algorithm": "fasttext",
        "algorithm_params": {
            "iter": 5,
            "max_n": 6,
            "min_count": 5,
            "min_n": 3,
            "random_state": 42,
            "sg": 1,
            "size": 300,
            "window": 10,
            "word_ngrams": 1,
        },
    },
]

In [25]:
from src.paths import model_path

In [26]:
# Display the project's models directory as resolved by src.paths.
model_path

PosixPath('/home/ava00125/src/devel/text_embedding/models')

In [29]:
# Write the spec to test_model_list.json inside the models directory
# (presumably serialized as JSON by save_json -- confirm in src.utils).
save_json(model_path / 'test_model_list.json', json_spec)

In [55]:
# Run the training script on the test model list. The relative path assumes the
# notebook's working directory sits next to the models directory -- confirm.
# NOTE(review): the captured log reports "102 word types ... 666833 raw words and
# 1000 sentences", whereas the in-notebook run reported "8006 word types ... 54651
# raw words and 9747 sentences". That looks like the script training on raw,
# untokenized reviews (characters treated as tokens) -- confirm whether
# train_model applies the same tokenization as this notebook.
!python -m src.models.train_model ../models/test_model_list.json

2018-10-05 15:30:17,188 - train_model - INFO - Building models from ../models/test_model_list.json
2018-10-05 15:30:17,213 - word2vec - INFO - collecting all words and their counts
2018-10-05 15:30:17,213 - word2vec - INFO - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-05 15:30:17,334 - word2vec - INFO - collected 102 word types from a corpus of 666833 raw words and 1000 sentences
2018-10-05 15:30:17,334 - word2vec - INFO - Loading a fresh vocabulary
2018-10-05 15:30:17,335 - word2vec - INFO - effective_min_count=5 retains 90 unique words (88% of original 102, drops 12)
2018-10-05 15:30:17,335 - word2vec - INFO - effective_min_count=5 leaves 666813 word corpus (99% of original 666833, drops 20)
2018-10-05 15:30:17,336 - word2vec - INFO - deleting the raw counts dictionary of 102 items
2018-10-05 15:30:17,337 - word2vec - INFO - sample=0.001 downsamples 29 most-common words
2018-10-05 15:30:17,337 - word2vec - INFO - downsampling leaves estimated 139311 word

In [56]:
# Show the trained-model registry file (presumably written by the train_model run
# above -- confirm); it records the spec plus data/model/target hashes per run.
!cat ../models/trained_models.json

{
  "fasttext_yelp_0": {
    "algorithm": "fasttext",
    "algorithm_params": {
      "iter": 5,
      "max_n": 6,
      "min_count": 5,
      "min_n": 3,
      "random_state": 42,
      "sg": 1,
      "size": 300,
      "window": 10,
      "word_ngrams": 1
    },
    "data_hash": "a52cc09f060399dd42dee0ed3f214c686cd46bb0",
    "dataset": "yelp",
    "dataset_params": {
      "num_reviews": 1000
    },
    "model_hash": "1e20f8b8d4abab2722cecee94a4753bc7aef71e5",
    "run_number": 0,
    "target_hash": "38f65f3b11da4851aaaccc19b1f0cf4d3806f83b"
  }
}