# Playground
Much like my Notepad, this playground is a place for me to test things.

# Level 1

In [1]:
import pandas as pd
import numpy as np

In [2]:
labeled_products = pd.read_table('/workspace/datasets/fasttext/shuffled_labeled_products.txt',
    header = None)

In [3]:
labeled_products.head()

Unnamed: 0,0
0,__label__abcat0807005 HP - 15 Inkjet Cartridge...
1,__label__abcat0912020 Aroma - 8-Cup Rice Cooke...
2,__label__abcat0916013 Smart Choice - HEPA Filt...
3,__label__abcat0410003 Lowepro - Ridge 30 Pouch...
4,"__label__abcat0201009 8GB iPod touch® - Black,..."


In [4]:
labeled_products.shape

(115503, 1)

Doesn't look that shuffled to me...

In [5]:
labeled_products = labeled_products.sample(frac = 1).reset_index(drop = True)
labeled_products.head()

Unnamed: 0,0
0,"__label__abcat0101001 Insignia™ - 42"" Class / ..."
1,__label__abcat0503002 NETGEAR RangeMax 802.11g...
2,__label__pcmcat196400050015 Frigidaire - Galle...
3,__label__abcat0302004 Sony - Xplod 170W x 4 Am...
4,__label__pcmcat238300050017 Conair - Custom Cu...


In [6]:
training_data = labeled_products.loc[0:9999]
training_data.shape

(10000, 1)

In [7]:
training_data.head()

Unnamed: 0,0
0,"__label__abcat0101001 Insignia™ - 42"" Class / ..."
1,__label__abcat0503002 NETGEAR RangeMax 802.11g...
2,__label__pcmcat196400050015 Frigidaire - Galle...
3,__label__abcat0302004 Sony - Xplod 170W x 4 Am...
4,__label__pcmcat238300050017 Conair - Custom Cu...


In [8]:
training_data.to_csv('/workspace/datasets/fasttext/training_data.txt', header = None, index = None, mode = 'a')


In [9]:
test_data = labeled_products.tail(10000)
test_data.shape

(10000, 1)

In [10]:
test_data.to_csv('/workspace/datasets/fasttext/test_data.txt', header = None, index = None, mode = 'a')


## Train model!

In [11]:
import fasttext

In [12]:
# Train model
product_classifier = fasttext.train_supervised(input = '/workspace/datasets/fasttext/training_data.txt')

Read 0M words
Number of words:  16297
Number of labels: 1473
Progress: 100.0% words/sec/thread:   10746 lr:  0.000000 avg.loss: 11.681223 ETA:   0h 0m 0s


In [13]:
product_classifier.save_model("product_classifier")

### @1

In [14]:
product_classifier.test('/workspace/datasets/fasttext/test_data.txt')

(15901, 0.1882900446512798, 0.1882900446512798)

### @5

In [15]:
product_classifier.test('/workspace/datasets/fasttext/test_data.txt', k = 5)

(15901, 0.06780705615998994, 0.3390352807999497)

### @10

In [16]:
product_classifier.test('/workspace/datasets/fasttext/test_data.txt', k = 10)

(15901, 0.04046286397081945, 0.40462863970819446)

## Fine tunning - epoch = 25 and lr = 1.0

In [17]:
model = fasttext.train_supervised(input = '/workspace/datasets/fasttext/training_data.txt', 
                                  lr = 1, 
                                  epoch = 25)

Read 0M words
Number of words:  16297
Number of labels: 1473
Progress: 100.0% words/sec/thread:    9477 lr:  0.000000 avg.loss:  0.819866 ETA:   0h 0m 0s


In [18]:
model.test('/workspace/datasets/fasttext/test_data.txt')

(15901, 0.6884472674674549, 0.6884472674674549)

## Fine tunning - bigrams

In [19]:
model = fasttext.train_supervised(input = '/workspace/datasets/fasttext/training_data.txt', 
                                  lr = 1.0, 
                                  epoch = 25,
                                  wordNgrams = 2)

Read 0M words
Number of words:  16297
Number of labels: 1473
Progress: 100.0% words/sec/thread:    8331 lr:  0.000000 avg.loss:  1.311731 ETA:   0h 0m 0s


In [20]:
model.test('/workspace/datasets/fasttext/test_data.txt')

(15901, 0.6790767876234199, 0.6790767876234199)

## Using normalized data

`cat /workspace/datasets/fasttext/training_lite.txt |sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" | sed "s/[^[:alnum:]_]/ /g" | tr -s ' ' > /workspace/datasets/fasttext/normalized_training_lite.txt`

In [21]:
model = fasttext.train_supervised(input = '/workspace/datasets/fasttext/normalized_training_lite.txt', 
                                  lr = 1, 
                                  epoch = 25)

Read 0M words
Number of words:  8703
Number of labels: 1371
Progress: 100.0% words/sec/thread:    7942 lr:  0.000000 avg.loss:  1.038188 ETA:   0h 0m 0s


In [22]:
model.test('/workspace/datasets/fasttext/normalized_test_lite.txt')

(9683, 0.6207786842920583, 0.6207786842920583)

## Filtering out for at least N = 500 products

In [23]:
labeled_products.head()

Unnamed: 0,0
0,"__label__abcat0101001 Insignia™ - 42"" Class / ..."
1,__label__abcat0503002 NETGEAR RangeMax 802.11g...
2,__label__pcmcat196400050015 Frigidaire - Galle...
3,__label__abcat0302004 Sony - Xplod 170W x 4 Am...
4,__label__pcmcat238300050017 Conair - Custom Cu...


In [25]:
plp = create_pruned_labeled_products(labeled_products)

Mapping labels
Found labels to keep!
Saved pruned_labeled_products.txt


In [26]:
plp.head()

Unnamed: 0,0
0,"__label__abcat0101001 Insignia™ - 42"" Class / ..."
1,__label__cat09000 Best Buy GC - $30 Blue Camer...
2,__label__abcat0106001 Studio RTA - Lake Point ...
3,__label__abcat0515028 Rain Computers - Big Nam...
4,__label__abcat0301014 Garmin 010-00657-18 nüvi...


In [27]:
plp.shape

(28921, 1)

## Shuffle and split this dataset!

In [28]:
pruned_df = plp.sample(frac = 1).reset_index(drop = True)
pruned_df.head()

Unnamed: 0,0
0,"__label__abcat0904003 Frigidaire - 30"" Freesta..."
1,"__label__abcat0904003 GE - Profile 30"" Self-Cl..."
2,"__label__abcat0904003 GE - 30"" Self-Cleaning F..."
3,__label__abcat0301014 Magellan - RoadMate 800 ...
4,"__label__abcat0905001 GE - Profile 24"" Tall Tu..."


In [29]:
pruned_training_data = pruned_df.loc[0:9999]
pruned_training_data.shape

(10000, 1)

In [30]:
pruned_training_data.to_csv('/workspace/datasets/fasttext/pruned_training_data.txt', header = None, index = None, mode = 'a')

In [31]:
pruned_test_data = pruned_df.tail(10000)
pruned_test_data.shape

(10000, 1)

In [32]:
pruned_test_data.to_csv('/workspace/datasets/fasttext/pruned_test_data.txt', header = None, index = None, mode = 'a')

## Train model on pruned data

In [33]:
model = fasttext.train_supervised(input = '/workspace/datasets/fasttext/pruned_training_data.txt', 
                                  lr = 1, 
                                  epoch = 25)

Read 0M words
Number of words:  8732
Number of labels: 32
Progress: 100.0% words/sec/thread:  234478 lr:  0.000000 avg.loss:  0.023585 ETA:   0h 0m 0s


In [34]:
model.test('/workspace/datasets/fasttext/pruned_test_data.txt')

(13043, 0.9733190216974622, 0.9733190216974622)

# Level 2: Derive Synonyms from Content

`cut -d' ' -f2- /workspace/datasets/fasttext/shuffled_labeled_products.txt > /workspace/datasets/fasttext/titles.txt` I can't believe this gets the category labels.

In [35]:
model = fasttext.train_unsupervised(model = 'skipgram',
                                    input = '/workspace/datasets/fasttext/titles.txt')

Read 1M words
Number of words:  10873
Number of labels: 0
Progress: 100.0% words/sec/thread:   18657 lr:  0.000000 avg.loss:  1.370575 ETA:   0h 0m 0s


In [36]:
model.get_nearest_neighbors('iphone')

[(0.862642765045166, 'Saxophone'),
 (0.850262463092804, 'Speakerphone'),
 (0.8443905711174011, 'Telephone'),
 (0.8322615027427673, 'Microphone'),
 (0.8050101399421692, 'Earphones'),
 (0.8026003837585449, 'Telephones'),
 (0.7994995713233948, 'Headphone'),
 (0.786597728729248, 'GoPhone'),
 (0.7848433256149292, 'Phone'),
 (0.7822263836860657, 'Ozone')]

In [37]:
model.get_nearest_neighbors('iPhone')

[(0.9466185569763184, 'iPhone®,'),
 (0.9300370216369629, 'iPhone®'),
 (0.8454962372779846, 'iPod®/iPhone'),
 (0.8246084451675415, 'iPod®/iPhone®'),
 (0.8177050352096558, '3GS'),
 (0.8127597570419312, '3GS,'),
 (0.8094018697738647, '3G/3GS'),
 (0.8039283156394958, '3G/3G'),
 (0.8021930456161499, '4S'),
 (0.7951489090919495, '3G,')]

### Normalizing titles data
`cat /workspace/datasets/fasttext/titles.txt | sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" | sed "s/[^[:alnum:]]/ /g" | tr -s ' ' > /workspace/datasets/fasttext/normalized_titles.txt`

In [38]:
model = fasttext.train_unsupervised(model = 'skipgram',
                                    input = '/workspace/datasets/fasttext/normalized_titles.txt')

Read 1M words
Number of words:  8903
Number of labels: 0
Progress: 100.0% words/sec/thread:   20804 lr:  0.000000 avg.loss:  1.550549 ETA:   0h 0m 0s 37.0% words/sec/thread:   21146 lr:  0.031507 avg.loss:  2.411935 ETA:   0h 0m11s


In [39]:
model.get_nearest_neighbors('iphone')

[(0.8660082221031189, '4s'),
 (0.7941620945930481, 'apple'),
 (0.7679271101951599, '3gs'),
 (0.7466674447059631, 'ipod'),
 (0.7256027460098267, 'appleÂ'),
 (0.7083044648170471, 'ipodÂ'),
 (0.705051600933075, 'ipadÂ'),
 (0.7002970576286316, 'ifrogz'),
 (0.6987919807434082, 'ozone'),
 (0.6972020864486694, 'unobtainium')]

In [40]:
model.get_nearest_neighbors('iPhone')

[(0.9280170202255249, 'hone'),
 (0.8490738272666931, 'phone'),
 (0.8306847214698792, 'gophone'),
 (0.8143390417098999, 'ozone'),
 (0.7922474145889282, 'saxophone'),
 (0.7594821453094482, 'iphone'),
 (0.7536320090293884, 'speakerphone'),
 (0.7527094483375549, 'jawbone'),
 (0.7506572604179382, 'bone'),
 (0.712533712387085, 'alone')]

In [41]:
model.get_nearest_neighbors('Iphone')

[(0.9283570647239685, 'phone'),
 (0.9220640063285828, 'gophone'),
 (0.8217782378196716, 'speakerphone'),
 (0.812666654586792, 'saxophone'),
 (0.8052788376808167, 'hone'),
 (0.7950165271759033, 'telephone'),
 (0.7836975455284119, 'iphone'),
 (0.7821744084358215, 'ozone'),
 (0.7720203995704651, 'earphones'),
 (0.7682393193244934, 'telephones')]

### Testing tokens

#### Product types

In [42]:
model.get_nearest_neighbors('headphones')

[(0.9316138029098511, 'headphone'),
 (0.8996431827545166, 'earbud'),
 (0.853122353553772, 'ear'),
 (0.8447486758232117, 'earphones'),
 (0.7626540660858154, 'bud'),
 (0.7467065453529358, '2xl'),
 (0.7430037260055542, 'earbuds'),
 (0.7165864109992981, 'microphones'),
 (0.7073571681976318, 'behind'),
 (0.7004403471946716, 'ears')]

In [43]:
model.get_nearest_neighbors('laptop')

[(0.7322827577590942, 'laptops'),
 (0.7183035016059875, '156b'),
 (0.7051650881767273, '17r'),
 (0.6916542649269104, 'i15'),
 (0.6907384395599365, '178'),
 (0.6901426911354065, 's5919'),
 (0.6875993013381958, 'lapdesk'),
 (0.6870904564857483, 'durabook'),
 (0.6858730912208557, 't2390'),
 (0.685158908367157, '176')]

In [44]:
model.get_nearest_neighbors('freezer')

[(0.9246532917022705, 'freezers'),
 (0.8264536261558533, 'refrigerator'),
 (0.7996167540550232, 'refrigerators'),
 (0.7881417870521545, 'frost'),
 (0.7406349182128906, 'cu'),
 (0.7330312132835388, 'mug'),
 (0.7217652797698975, 'satina'),
 (0.7129825949668884, 'ft'),
 (0.7079470753669739, 'monochromatic'),
 (0.7043893337249756, 'cleansteel')]

#### Brands

In [45]:
model.get_nearest_neighbors('nintendo')

[(0.9760647416114807, 'nintendogs'),
 (0.9030267596244812, 'ds'),
 (0.8796069622039795, 'wii'),
 (0.8428035378456116, '3ds'),
 (0.77391117811203, 'gamecube'),
 (0.740477442741394, 'luigi'),
 (0.7338675260543823, 'zhu'),
 (0.7299030423164368, 'psp'),
 (0.723874568939209, 'ninjas'),
 (0.7236530184745789, 'wwii')]

In [46]:
model.get_nearest_neighbors('whirlpool')

[(0.8807986378669739, 'whirl'),
 (0.8394858837127686, 'frigidaire'),
 (0.8347935080528259, 'maytag'),
 (0.8339022397994995, 'biscuit'),
 (0.830653727054596, 'bisque'),
 (0.7638772130012512, 'gallery'),
 (0.7526147961616516, 'cleansteel'),
 (0.7462554574012756, 'nautilus'),
 (0.7413251399993896, 'inglis'),
 (0.7383004426956177, 'gas')]

In [47]:
model.get_nearest_neighbors('kodak')

[(0.8861657381057739, 'easyshare'),
 (0.8230230212211609, 'm863'),
 (0.8104571104049683, 'm763'),
 (0.8069292902946472, 'm893'),
 (0.7964211702346802, 'c813'),
 (0.7933155298233032, 'm1063'),
 (0.7896784543991089, 'm341'),
 (0.7787225246429443, 'm381'),
 (0.7619840502738953, 'm340'),
 (0.7482796311378479, 'playtouch')]

### Models

In [48]:
model.get_nearest_neighbors('ps2')

[(0.8773263096809387, 'ps3'),
 (0.8110273480415344, 'gba'),
 (0.8104366660118103, 'psp'),
 (0.8103382587432861, '2k5'),
 (0.8077419400215149, 'Â'),
 (0.8076554536819458, '2k3'),
 (0.8070772886276245, '2k8'),
 (0.8054624199867249, '2k7'),
 (0.7924477458000183, 'nhl'),
 (0.7918471097946167, 'xbox')]

In [49]:
model.get_nearest_neighbors('razr')

[(0.8866004347801208, 'a855'),
 (0.8838928937911987, 'krzr'),
 (0.8597919940948486, 'e71'),
 (0.8550978302955627, 'sgh'),
 (0.8387491106987, 't720'),
 (0.83646559715271, 'i90c'),
 (0.8319584727287292, 'a957'),
 (0.8305213451385498, 'kyocera'),
 (0.8299548029899597, '7v'),
 (0.8297775387763977, 'hd7')]

In [50]:
model.get_nearest_neighbors('stratocaster')

[(0.9143386483192444, 'telecaster'),
 (0.9054551124572754, 'starcaster'),
 (0.809802234172821, 'forecaster'),
 (0.8096722960472107, 'strat'),
 (0.8045992851257324, 'squier'),
 (0.7956680655479431, 'fender'),
 (0.776418149471283, 'synyster'),
 (0.7570697665214539, 'tremolo'),
 (0.7374893426895142, 'hss'),
 (0.7344993352890015, 'sunburst')]

#### Other

In [51]:
model.get_nearest_neighbors('holiday')

[(0.9784823060035706, 'holidays'),
 (0.8225429654121399, 'kwanzaa'),
 (0.8194125890731812, 'vibes'),
 (0.8188290596008301, 'día'),
 (0.8152854442596436, 'gracias'),
 (0.8113676905632019, 'congrats'),
 (0.8096888065338135, 'buy'),
 (0.8075286149978638, 'thanks'),
 (0.8040497899055481, 'hanukkah'),
 (0.8033755421638489, 'navidad')]

In [52]:
model.get_nearest_neighbors('plasma')

[(0.8559969067573547, '600hz'),
 (0.8138182759284973, '480hz'),
 (0.8134418725967407, '58'),
 (0.8023808002471924, 'hdtv'),
 (0.8004066944122314, 'hdtvs'),
 (0.7995370626449585, '63'),
 (0.7956316471099854, 'edtv'),
 (0.7939620614051819, '480p'),
 (0.781460702419281, 'regza'),
 (0.780785322189331, 'aquos')]

In [53]:
model.get_nearest_neighbors('leather')

[(0.9120498299598694, 'leatherskin'),
 (0.7080226540565491, 'recliner'),
 (0.6746559739112854, 'berkline'),
 (0.6687394976615906, 'hipcase'),
 (0.6561363935470581, 'magnolia'),
 (0.6516637206077576, 'curved'),
 (0.6459317207336426, 'seating'),
 (0.6455206871032715, 'sofa'),
 (0.6451421976089478, 'theaterseatstore'),
 (0.6373590230941772, 'executive')]

## Increasing epochs = 25

In [54]:
model = fasttext.train_unsupervised(model = 'skipgram',
                                    input = '/workspace/datasets/fasttext/normalized_titles.txt',
                                    epoch = 25)

Read 1M words
Number of words:  8903
Number of labels: 0
Progress: 100.0% words/sec/thread:   20200 lr:  0.000000 avg.loss:  1.137724 ETA:   0h 0m 0s


#### Product types

In [55]:
model.get_nearest_neighbors('headphones')

[(0.9157312512397766, 'earbud'),
 (0.8619622588157654, 'headphone'),
 (0.838262140750885, 'ear'),
 (0.6895906329154968, '2xl'),
 (0.6792140603065491, 'earphones'),
 (0.6706550121307373, 'over'),
 (0.6602503657341003, 'lowrider'),
 (0.6566639542579651, 'superbudz'),
 (0.6510161757469177, 'bud'),
 (0.6484337449073792, 'tunebuds')]

In [56]:
model.get_nearest_neighbors('laptop')

[(0.6585560441017151, 'netbook'),
 (0.6570000648498535, 'notebook'),
 (0.6380876302719116, '156b'),
 (0.6353214383125305, '176'),
 (0.6312924027442932, '178'),
 (0.6299413442611694, 's5919'),
 (0.6265237331390381, 'briefcase'),
 (0.613882839679718, 'zenbook'),
 (0.6091018915176392, '114'),
 (0.6087368130683899, '172')]

In [57]:
model.get_nearest_neighbors('freezer')

[(0.9040520191192627, 'freezers'),
 (0.7378726601600647, 'refrigerator'),
 (0.7015482783317566, 'mug'),
 (0.6819553375244141, 'cu'),
 (0.6666325330734253, 'ft'),
 (0.6508707404136658, 'refrigerators'),
 (0.6403297185897827, 'side'),
 (0.6242110133171082, 'ultrafinish'),
 (0.6213588118553162, 'cleansteel'),
 (0.6187864542007446, 'customstyle')]

#### Brands

In [58]:
model.get_nearest_neighbors('nintendo')

[(0.9618361592292786, 'ds'),
 (0.9464081525802612, 'wii'),
 (0.8481259346008301, 'nintendogs'),
 (0.7772269248962402, 'gamecube'),
 (0.7748440504074097, '3ds'),
 (0.7330852150917053, 'psp'),
 (0.7299896478652954, 'playstation'),
 (0.7098451852798462, 'advance'),
 (0.7001379728317261, 'xbox'),
 (0.6970630884170532, '360')]

In [59]:
model.get_nearest_neighbors('whirlpool')

[(0.7917198538780212, 'maytag'),
 (0.7578569054603577, 'biscuit'),
 (0.7387299537658691, 'frigidaire'),
 (0.724648118019104, 'ge'),
 (0.718706488609314, 'inglis'),
 (0.6954635381698608, 'bisque'),
 (0.6748238801956177, 'cleansteel'),
 (0.6696757078170776, 'hotpoint'),
 (0.6602770686149597, 'satina'),
 (0.6477392315864563, 'cabrio')]

In [60]:
model.get_nearest_neighbors('kodak')

[(0.8524447083473206, 'easyshare'),
 (0.6994882225990295, 'c813'),
 (0.6927104592323303, 'm763'),
 (0.682150661945343, 'm863'),
 (0.6723581552505493, 'm893'),
 (0.6625857949256897, 'playsport'),
 (0.6569780707359314, 'm530'),
 (0.6510109305381775, 'm341'),
 (0.6497992277145386, 'playtouch'),
 (0.6416168212890625, 'm1063')]

### Models

In [61]:
model.get_nearest_neighbors('ps2')

[(0.7692601680755615, 'playstation'),
 (0.7291615605354309, 'xbox'),
 (0.7283380031585693, 'ps3'),
 (0.7122669816017151, 'gamecube'),
 (0.6965801119804382, '360'),
 (0.6947784423828125, 'psp'),
 (0.6672265529632568, 'guide'),
 (0.6565883755683899, 'gba'),
 (0.6339210271835327, '2k7'),
 (0.6332845091819763, '2k5')]

In [62]:
model.get_nearest_neighbors('razr')

[(0.8181076049804688, 'motorola'),
 (0.7460159063339233, 'krzr'),
 (0.7260109782218933, 'droid'),
 (0.7123373746871948, 't720'),
 (0.7102821469306946, 'a855'),
 (0.6753193736076355, 'a957'),
 (0.6720890998840332, 'r225'),
 (0.668860912322998, 'cellsuit'),
 (0.6679417490959167, 'photon'),
 (0.6677144765853882, 'atrix')]

In [63]:
model.get_nearest_neighbors('stratocaster')

[(0.878308117389679, 'telecaster'),
 (0.8076531887054443, 'starcaster'),
 (0.7800866961479187, 'fretboard'),
 (0.7725774645805359, 'fender'),
 (0.7699156403541565, 'strat'),
 (0.7431489825248718, 'squier'),
 (0.7395989298820496, 'hss'),
 (0.6987872123718262, 'worn'),
 (0.6933623552322388, 'thinline'),
 (0.6825616955757141, 'rosewood')]

#### Other

In [64]:
model.get_nearest_neighbors('holiday')

[(0.9519236087799072, 'holidays'),
 (0.7104306817054749, 'vibes'),
 (0.6974205374717712, 'nobr'),
 (0.6747931241989136, 'stocking'),
 (0.6542304158210754, 'grab'),
 (0.6475882530212402, 'perfectly'),
 (0.6248204112052917, 'dreidel'),
 (0.6050228476524353, 'kwanzaa'),
 (0.6000368595123291, 'slaphappy'),
 (0.5962368845939636, 'gift')]

In [65]:
model.get_nearest_neighbors('plasma')

[(0.7891889214515686, '600hz'),
 (0.706960141658783, '480hz'),
 (0.6839281320571899, 'hdtv'),
 (0.682191014289856, 'purevision'),
 (0.6792814135551453, '42'),
 (0.6442890167236328, 'viera'),
 (0.626269519329071, '43'),
 (0.6233737468719482, 'ambilight'),
 (0.6149773597717285, 'edtv'),
 (0.6111401915550232, 'ultravision')]

In [66]:
model.get_nearest_neighbors('leather')

[(0.7888681888580322, 'leatherskin'),
 (0.6654601097106934, 'recliner'),
 (0.6512647271156311, 'armless'),
 (0.6354237794876099, 'sofa'),
 (0.6349744200706482, 'berkline'),
 (0.623550295829773, 'hipcase'),
 (0.6216261386871338, 'seating'),
 (0.6182665228843689, 'maccase'),
 (0.616023600101471, 'curved'),
 (0.5956719517707825, 'theaterseatstore')]

### Ignoring rare words: minCount = 20

In [67]:
model = fasttext.train_unsupervised(model = 'skipgram',
                                    input = '/workspace/datasets/fasttext/normalized_titles.txt',
                                    epoch = 25,
                                    minCount = 20)

Read 1M words
Number of words:  3861
Number of labels: 0
Progress: 100.0% words/sec/thread:   34771 lr:  0.000000 avg.loss:  1.217928 ETA:   0h 0m 0s


#### Product types

In [68]:
model.get_nearest_neighbors('headphones')

[(0.8929057121276855, 'earbud'),
 (0.8674094676971436, 'ear'),
 (0.8249027132987976, 'headphone'),
 (0.6970930099487305, 'lowrider'),
 (0.6865960955619812, 'earphones'),
 (0.6601977944374084, 'bud'),
 (0.6599555611610413, 'hesh'),
 (0.6537138223648071, 'canceling'),
 (0.6507418751716614, 'over'),
 (0.6371705532073975, '2xl')]

In [69]:
model.get_nearest_neighbors('laptop')

[(0.6927817463874817, 'notebook'),
 (0.6504510045051575, 'netbook'),
 (0.5768425464630127, 'biscotti'),
 (0.5515731573104858, 'mouse'),
 (0.5512543320655823, 'ultrabook'),
 (0.549037754535675, 'briefcase'),
 (0.5376833081245422, 'slip'),
 (0.5308789610862732, 'laptops'),
 (0.5232369899749756, 'macbook'),
 (0.5170558094978333, '6gb')]

In [70]:
model.get_nearest_neighbors('freezer')

[(0.7168809771537781, 'refrigerator'),
 (0.6610156893730164, 'mug'),
 (0.6522968411445618, 'cu'),
 (0.635196328163147, 'ft'),
 (0.6291569471359253, 'satina'),
 (0.6207199692726135, 'frost'),
 (0.613924503326416, 'side'),
 (0.5895519256591797, 'bottom'),
 (0.5890802145004272, 'refrigerators'),
 (0.5735819935798645, 'cleansteel')]

#### Brands

In [71]:
model.get_nearest_neighbors('nintendo')

[(0.9543527364730835, 'ds'),
 (0.9344239830970764, 'wii'),
 (0.7957473993301392, 'gamecube'),
 (0.7716276049613953, '3ds'),
 (0.734774112701416, 'psp'),
 (0.7294143438339233, 'advance'),
 (0.7243247032165527, 'boy'),
 (0.6953176259994507, 'playstation'),
 (0.6926416158676147, 'xbox'),
 (0.6734306812286377, '360')]

In [72]:
model.get_nearest_neighbors('whirlpool')

[(0.7950811982154846, 'maytag'),
 (0.7542214393615723, 'biscuit'),
 (0.7433401942253113, 'frigidaire'),
 (0.7261288166046143, 'ge'),
 (0.6821197271347046, 'inglis'),
 (0.6818539500236511, 'bisque'),
 (0.6801739931106567, 'cleansteel'),
 (0.6754910945892334, 'hotpoint'),
 (0.6274698972702026, 'bosch'),
 (0.6264026761054993, 'monochromatic')]

In [73]:
model.get_nearest_neighbors('kodak')

[(0.8262077569961548, 'easyshare'),
 (0.6823721528053284, 'm863'),
 (0.6517613530158997, 'm893'),
 (0.6382378339767456, 'm340'),
 (0.6323002576828003, 'playsport'),
 (0.6273201704025269, 'm1063'),
 (0.6023184061050415, 'playtouch'),
 (0.5708916187286377, 'canon'),
 (0.5704430937767029, 'fuji'),
 (0.5568867325782776, 'photosmart')]

### Models

In [74]:
model.get_nearest_neighbors('ps2')

[(0.7529356479644775, 'ps3'),
 (0.7486602663993835, 'playstation'),
 (0.747470498085022, 'gamecube'),
 (0.7361575961112976, 'xbox'),
 (0.7201453447341919, 'guide'),
 (0.7155444622039795, 'gba'),
 (0.6946600675582886, '360'),
 (0.6943628191947937, 'psp'),
 (0.6420919299125671, 'wii'),
 (0.6418797373771667, 'ds')]

In [75]:
model.get_nearest_neighbors('razr')

[(0.7917510867118835, 'motorola'),
 (0.7535035610198975, 'droid'),
 (0.6494126915931702, 'atrix'),
 (0.6455318331718445, 'nokia'),
 (0.6369015574455261, 'phones'),
 (0.6084145903587341, 'sph'),
 (0.595324695110321, 'mobile'),
 (0.5839547514915466, 'treo'),
 (0.582118570804596, '8530'),
 (0.5762964487075806, '8520')]

In [76]:
model.get_nearest_neighbors('stratocaster')

[(0.8569124937057495, 'telecaster'),
 (0.7751351594924927, 'squier'),
 (0.7537893056869507, 'strat'),
 (0.7519737482070923, 'fender'),
 (0.7368367910385132, 'fretboard'),
 (0.6284483671188354, 'rosewood'),
 (0.6078391075134277, 'sunburst'),
 (0.5804291367530823, 'jazz'),
 (0.5760371088981628, 'tele'),
 (0.5589219331741333, 'tobacco')]

#### Other

In [77]:
model.get_nearest_neighbors('holiday')

[(0.6082809567451477, 'kwanzaa'),
 (0.6040680408477783, 'hanukkah'),
 (0.5694826245307922, 'congratulations'),
 (0.5640594363212585, 'gift'),
 (0.5589464902877808, 'buy'),
 (0.5560659170150757, 'graduation'),
 (0.5488727688789368, 'happy'),
 (0.5465364456176758, 'connection'),
 (0.5430492758750916, 'merry'),
 (0.5429035425186157, 'navidad')]

In [78]:
model.get_nearest_neighbors('plasma')

[(0.7443578243255615, '600hz'),
 (0.6497722864151001, 'hdtv'),
 (0.6033835411071777, 'viera'),
 (0.5932702422142029, '46'),
 (0.5781123638153076, 'dlp'),
 (0.5730062127113342, '58'),
 (0.566292941570282, '42'),
 (0.5553087592124939, '63'),
 (0.5534052848815918, '720p'),
 (0.5509803295135498, 'hdtvs')]

In [79]:
model.get_nearest_neighbors('leather')

[(0.6836671233177185, 'recliner'),
 (0.6658450365066528, 'armless'),
 (0.632309079170227, 'berkline'),
 (0.6187480092048645, 'curved'),
 (0.6164443492889404, 'seating'),
 (0.592078685760498, 'sofa'),
 (0.585899829864502, 'theaterseatstore'),
 (0.5619029402732849, 'bodhi'),
 (0.5513429641723633, 'magnolia'),
 (0.5465265512466431, 'headliner')]

In [80]:
model.get_nearest_neighbors('leather')[0][1]

'recliner'

In [84]:
model.save_model('title_model.bin')

# Level 3: Integrating Synonyms with Search
`cat /workspace/datasets/fasttext/normalized_titles.txt | tr " " "\n" | grep "...." | sort | uniq -c | sort -nr | head -1000 | grep -oE '[^ ]+$' > /workspace/datasets/fasttext/top_words.txt`

The above does the following:

* Replace each space with a newline, so we get one word per line.
* Only keep words containing at least 4 characters.
* Sort the words in alphabetical order.
* Deduplicate the words and keep the counts, yielding a 2-columns file where each line is a count followed by the word.
* Sort the count-word pairs in descending order of count.
* Keep only the top 1,000 entries, i.e., the 1,000 most frequently occurring words.
* Remove the counts so we only output the words.
* Output the result of this process to /workspace/datasets/fasttext/top_words.txt.

In [85]:
skipgram_model = fasttext.load_model('title_model.bin')
top_words = pd.read_table('/workspace/datasets/fasttext/top_words.txt', header = None)
top_words.head()



Unnamed: 0,0
0,black
1,with
2,digital
3,white
4,case


In [86]:
model.get_nearest_neighbors(top_words[0][0])[0][0]

0.6813351511955261

In [87]:
synonyms_df = []
for i in range(0, len(top_words)):
   # synonyms_df.at[0, i] = top_words[0][i]
    nn = skipgram_model.get_nearest_neighbors(top_words[0][i])
    synonyms = []

    for j in range(0, len(nn)):
        similarity = nn[j][0]
        if similarity >= 0.75:
            synonyms.append(nn[j][1])
        else:
            pass
    
    if len(synonyms) > 0:
        synonyms.insert(0, top_words[0][i])
        synonyms_df.append(synonyms)
    else: 
        pass

In [88]:
synonyms_df = pd.DataFrame(synonyms_df)

In [89]:
synonyms_df.to_csv('/workspace/datasets/fasttext/synonyms.csv', sep = ',', header = None)

: 

Copy file to Docker container: `docker cp /workspace/datasets/fasttext/synonyms.csv opensearch-node1:/usr/share/opensearch/config/synonyms.csv`