# Playground
Much like my Notepad, this playground is a place for me to test things.

# Level 1

In [1]:
import pandas as pd
import numpy as np

In [38]:
labeled_products = pd.read_table('/workspace/datasets/fasttext/shuffled_labeled_products.txt',
    header = None)

In [3]:
labeled_products.head()

Unnamed: 0,0
0,__label__abcat0807005 HP - 15 Inkjet Cartridge...
1,__label__abcat0912020 Aroma - 8-Cup Rice Cooke...
2,__label__abcat0916013 Smart Choice - HEPA Filt...
3,__label__abcat0410003 Lowepro - Ridge 30 Pouch...
4,"__label__abcat0201009 8GB iPod touch® - Black,..."


In [4]:
labeled_products.shape

(115503, 1)

Doesn't look that shuffled to me...

In [5]:
labeled_products = labeled_products.sample(frac = 1).reset_index(drop = True)
labeled_products.head()

Unnamed: 0,0
0,__label__abcat0811004 Lenmar - Lithium-Ion Bat...
1,__label__pcmcat180400050006 Nikon D90 12.3MP D...
2,"__label__pcmcat164200050013 Gateway - 10.1"" Ne..."
3,__label__pcmcat226200050010 Tribeca - Oklahoma...
4,__label__abcat0707003 Little Red Riding Hood's...


In [7]:
training_data = labeled_products.loc[0:9999]
training_data.shape

(10000, 1)

In [8]:
training_data.head()

Unnamed: 0,0
0,__label__abcat0811004 Lenmar - Lithium-Ion Bat...
1,__label__pcmcat180400050006 Nikon D90 12.3MP D...
2,"__label__pcmcat164200050013 Gateway - 10.1"" Ne..."
3,__label__pcmcat226200050010 Tribeca - Oklahoma...
4,__label__abcat0707003 Little Red Riding Hood's...


In [9]:
training_data.to_csv('/workspace/datasets/fasttext/training_data.txt', header = None, index = None, mode = 'a')


In [10]:
test_data = labeled_products.tail(10000)
test_data.shape

(10000, 1)

In [17]:
test_data.to_csv('/workspace/datasets/fasttext/test_data.txt', header = None, index = None, mode = 'a')


## Train model!

In [12]:
import fasttext

In [20]:
# Train model
product_classifier = fasttext.train_supervised(input = '/workspace/datasets/fasttext/training_data.txt')

Read 0M words
Number of words:  11719
Number of labels: 1280
Progress: 100.0% words/sec/thread:    5921 lr:  0.000000 avg.loss: 13.500198 ETA:   0h 0m 0s


In [21]:
product_classifier.save_model("product_classifier")

### @1

In [22]:
product_classifier.test('/workspace/datasets/fasttext/test_data.txt')

(7737, 0.06759725991986558, 0.06759725991986558)

### @5

In [23]:
product_classifier.test('/workspace/datasets/fasttext/test_data.txt', k = 5)

(7737, 0.030528628667442163, 0.1526431433372108)

### @10

In [24]:
product_classifier.test('/workspace/datasets/fasttext/test_data.txt', k = 10)

(7737, 0.018779888845805868, 0.18779888845805867)

## Fine tunning - epoch = 25 and lr = 1.0

In [25]:
model = fasttext.train_supervised(input = '/workspace/datasets/fasttext/training_data.txt', 
                                  lr = 1, 
                                  epoch = 25)

Read 0M words
Number of words:  11719
Number of labels: 1280
Progress: 100.0% words/sec/thread:    8141 lr:  0.000000 avg.loss:  1.108078 ETA:   0h 0m 0s


In [26]:
model.test('/workspace/datasets/fasttext/test_data.txt')

(7737, 0.5929947007884193, 0.5929947007884193)

## Fine tunning - bigrams

In [27]:
model = fasttext.train_supervised(input = '/workspace/datasets/fasttext/training_data.txt', 
                                  lr = 1.0, 
                                  epoch = 25,
                                  wordNgrams = 2)

Read 0M words
Number of words:  11719
Number of labels: 1280
Progress: 100.0% words/sec/thread:   15572 lr:  0.000000 avg.loss:  1.446491 ETA:   0h 0m 0s


In [28]:
model.test('/workspace/datasets/fasttext/test_data.txt')

(7737, 0.5857567532635388, 0.5857567532635388)

## Using normalized data

`cat /workspace/datasets/fasttext/training_lite.txt |sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" | sed "s/[^[:alnum:]_]/ /g" | tr -s ' ' > /workspace/datasets/fasttext/normalized_training_lite.txt`

In [31]:
model = fasttext.train_supervised(input = '/workspace/datasets/fasttext/normalized_training_lite.txt', 
                                  lr = 1, 
                                  epoch = 25)

Read 0M words
Number of words:  8703
Number of labels: 1371
Progress: 100.0% words/sec/thread:   12713 lr:  0.000000 avg.loss:  0.910726 ETA:   0h 0m 0s


In [32]:
model.test('/workspace/datasets/fasttext/normalized_test_lite.txt')

(9683, 0.6198492202829702, 0.6198492202829702)

## Filtering out for at least N = 500 products

In [39]:
labeled_products.head()

Unnamed: 0,0
0,__label__abcat0807005 HP - 15 Inkjet Cartridge...
1,__label__abcat0912020 Aroma - 8-Cup Rice Cooke...
2,__label__abcat0916013 Smart Choice - HEPA Filt...
3,__label__abcat0410003 Lowepro - Ridge 30 Pouch...
4,"__label__abcat0201009 8GB iPod touch® - Black,..."


In [93]:
def create_pruned_labeled_products(df, min_products = 500):
    labels = []
    pruned_labeled_products = []
    count_products = 0

    print('Mapping labels')
    for row in range(0, len(df)):
        label = df[0][row].split()[0]
        labels.append(label)
        # product = df[0][row].split()[:1]
    
    labels = pd.Series(labels)
    label_counts = labels.value_counts()
    keep_labels = label_counts[label_counts >= min_products].index.tolist()

    print('Found labels to keep!')

    for row in range(0, len(df)):
        if labels[row] in keep_labels:
            pruned_labeled_products.append(df[0][row])
        else:
            pass
    
    pruned_labeled_products = pd.DataFrame(pruned_labeled_products)
    pruned_labeled_products.to_csv('/workspace/datasets/fasttext/pruned_labeled_products.txt', header = None, index = None, mode = 'a')
    print('Saved pruned_labeled_products.txt')

    return pruned_labeled_products

In [94]:
plp = create_pruned_labeled_products(labeled_products)

Mapping labels
Found labels to keep!
Saved pruned_labeled_products.txt


In [95]:
plp.head()

Unnamed: 0,0
0,__label__pcmcat165900050033 Metra - DIN Instal...
1,__label__abcat0106001 BDI - Mirage TV Stand fo...
2,"__label__abcat0904001 LG - 35"" Built-In Electr..."
3,__label__cat09000 Best Buy GC - $15 Birthday D...
4,__label__pcmcat227500050028 Nikon Coolpix L26 ...


In [96]:
plp.shape

(28921, 1)

## Shuffle and split this dataset!

In [97]:
pruned_df = plp.sample(frac = 1).reset_index(drop = True)
pruned_df.head()

Unnamed: 0,0
0,__label__pcmcat151600050006 Michael Kelly - Pa...
1,__label__abcat0901005 GE - Profile 24.6 Cu. Ft...
2,__label__abcat0706002 Baseball Blast - Nintend...
3,__label__pcmcat144700050004 Jawbone - Bluetoot...
4,__label__pcmcat171900050029 OtterBox - Impact ...


In [98]:
pruned_training_data = pruned_df.loc[0:9999]
pruned_training_data.shape

(10000, 1)

In [100]:
pruned_training_data.to_csv('/workspace/datasets/fasttext/pruned_training_data.txt', header = None, index = None, mode = 'a')

In [101]:
pruned_test_data = pruned_df.tail(10000)
pruned_test_data.shape

(10000, 1)

In [102]:
pruned_test_data.to_csv('/workspace/datasets/fasttext/pruned_test_data.txt', header = None, index = None, mode = 'a')

## Train model on pruned data

In [103]:
model = fasttext.train_supervised(input = '/workspace/datasets/fasttext/pruned_training_data.txt', 
                                  lr = 1, 
                                  epoch = 25)

Read 0M words
Number of words:  7020
Number of labels: 31
Progress: 100.0% words/sec/thread:  850783 lr:  0.000000 avg.loss:  0.054239 ETA:   0h 0m 0s


In [104]:
model.test('/workspace/datasets/fasttext/pruned_test_data.txt')

(6541, 0.9550527442287112, 0.9550527442287112)

# Level 2: Derive Synonyms from Content

`cut -d' ' -f2- /workspace/datasets/fasttext/shuffled_labeled_products.txt > /workspace/datasets/fasttext/titles.txt` I can't believe this gets the category labels.

In [107]:
model = fasttext.train_unsupervised(model = 'skipgram',
                                    input = '/workspace/datasets/fasttext/titles.txt')
model.save_model('/workspace/datasets/fasttext/title_model.bin')

Read 1M words
Number of words:  10873
Number of labels: 0
Progress: 100.0% words/sec/thread:   55620 lr:  0.000000 avg.loss:  1.364058 ETA:   0h 0m 0s100.0% words/sec/thread:   55622 lr: -0.000002 avg.loss:  1.364058 ETA:   0h 0m 0s


In [109]:
model.get_nearest_neighbors('iphone')

[(0.8572229743003845, 'Saxophone'),
 (0.846019446849823, 'Telephone'),
 (0.8430326581001282, 'Speakerphone'),
 (0.8397766351699829, 'Microphone'),
 (0.814883828163147, 'Telephones'),
 (0.8052800297737122, 'Earphones'),
 (0.7911036014556885, 'Headphone'),
 (0.7814058065414429, 'GoPhone'),
 (0.7770469784736633, 'Ozone'),
 (0.7752405405044556, 'Phone')]

In [110]:
model.get_nearest_neighbors('iPhone')

[(0.9494825005531311, 'iPhone®,'),
 (0.9345912933349609, 'iPhone®'),
 (0.8405899405479431, 'iPod®/iPhone'),
 (0.8238221406936646, 'iPod®/iPhone®'),
 (0.8042400479316711, '3GS'),
 (0.7997831702232361, '3GS,'),
 (0.79729163646698, '4S'),
 (0.7971472144126892, '3G/3GS'),
 (0.7886156439781189, '3G,'),
 (0.7885591387748718, '3G/3G')]

### Normalizing titles data
`cat /workspace/datasets/fasttext/titles.txt | sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" | sed "s/[^[:alnum:]]/ /g" | tr -s ' ' > /workspace/datasets/fasttext/normalized_titles.txt`

In [111]:
model = fasttext.train_unsupervised(model = 'skipgram',
                                    input = '/workspace/datasets/fasttext/normalized_titles.txt')

Read 1M words
Number of words:  8903
Number of labels: 0
Progress: 100.0% words/sec/thread:   20479 lr:  0.000000 avg.loss:  1.466515 ETA:   0h 0m 0s 30.1% words/sec/thread:   20762 lr:  0.034963 avg.loss:  1.959467 ETA:   0h 0m12s


In [112]:
model.get_nearest_neighbors('iphone')

[(0.8656516671180725, '4s'),
 (0.7738166451454163, 'apple'),
 (0.7621972560882568, '3gs'),
 (0.7245116829872131, 'ifrogz'),
 (0.7202589511871338, 'fabshell'),
 (0.7189897298812866, 'ipod'),
 (0.7106965184211731, 'ipadÂ'),
 (0.7083972096443176, 'appleÂ'),
 (0.6999031901359558, 'ozone'),
 (0.6925870776176453, 'ipodÂ')]

In [113]:
model.get_nearest_neighbors('iPhone')

[(0.9293962121009827, 'hone'),
 (0.8393517732620239, 'phone'),
 (0.8297415971755981, 'ozone'),
 (0.8249611854553223, 'gophone'),
 (0.7918494343757629, 'saxophone'),
 (0.7852948307991028, 'speakerphone'),
 (0.7640011310577393, 'jawbone'),
 (0.7615736126899719, 'iphone'),
 (0.7572123408317566, 'bone'),
 (0.7092849016189575, 'tecnozone')]

In [114]:
model.get_nearest_neighbors('Iphone')

[(0.9227175712585449, 'phone'),
 (0.9163171648979187, 'gophone'),
 (0.8423050045967102, 'speakerphone'),
 (0.8139309287071228, 'saxophone'),
 (0.8083683848381042, 'hone'),
 (0.7956296801567078, 'ozone'),
 (0.7944759726524353, 'telephone'),
 (0.783035933971405, 'iphone'),
 (0.755550742149353, 'earphones'),
 (0.7545182108879089, 'telephones')]

### Testing tokens

#### Product types

In [115]:
model.get_nearest_neighbors('headphones')

[(0.9264744520187378, 'headphone'),
 (0.8950048685073853, 'earbud'),
 (0.8435399532318115, 'ear'),
 (0.8406202793121338, 'earphones'),
 (0.7602767944335938, 'earbuds'),
 (0.7437191009521484, 'bud'),
 (0.7251234650611877, 'yurbuds'),
 (0.719900369644165, 'microphones'),
 (0.7193188667297363, 'ears'),
 (0.7171502709388733, 'behind')]

In [116]:
model.get_nearest_neighbors('laptop')

[(0.7285926342010498, 'durabook'),
 (0.71401047706604, '156b'),
 (0.7086651921272278, 'laptops'),
 (0.6971361041069031, '17r'),
 (0.6968522667884827, '176'),
 (0.696539580821991, 'ultrabook'),
 (0.6933403611183167, 'i15'),
 (0.6920029520988464, '179'),
 (0.6864805221557617, 's5919'),
 (0.6858465671539307, 'notebook')]

In [117]:
model.get_nearest_neighbors('freezer')

[(0.9226808547973633, 'freezers'),
 (0.826248049736023, 'refrigerator'),
 (0.8076303005218506, 'refrigerators'),
 (0.7914842963218689, 'frost'),
 (0.7268251180648804, 'cu'),
 (0.722983181476593, 'satina'),
 (0.7148715853691101, 'mug'),
 (0.7123090624809265, 'side'),
 (0.7089095711708069, 'ft'),
 (0.7036294937133789, 'cleansteel')]

#### Brands

In [118]:
model.get_nearest_neighbors('nintendo')

[(0.9755204319953918, 'nintendogs'),
 (0.9111528992652893, 'ds'),
 (0.8851426243782043, 'wii'),
 (0.810907781124115, '3ds'),
 (0.7600410580635071, 'gamecube'),
 (0.7530352473258972, 'wwii'),
 (0.7354763150215149, 'rabbids'),
 (0.730229377746582, 'ninjas'),
 (0.7279669642448425, 'psp'),
 (0.7278790473937988, 'luigi')]

In [119]:
model.get_nearest_neighbors('whirlpool')

[(0.8759189248085022, 'whirl'),
 (0.8506542444229126, 'biscuit'),
 (0.8235144019126892, 'frigidaire'),
 (0.8149738311767578, 'maytag'),
 (0.8135016560554504, 'bisque'),
 (0.7610523700714111, 'nautilus'),
 (0.7508520483970642, 'cleansteel'),
 (0.7477498054504395, 'gallery'),
 (0.7414782047271729, 'mediterranean'),
 (0.7263631820678711, 'bosch')]

In [120]:
model.get_nearest_neighbors('kodak')

[(0.8842921257019043, 'easyshare'),
 (0.8162667155265808, 'm863'),
 (0.7873838543891907, 'm763'),
 (0.7750740647315979, 'm341'),
 (0.7745869159698486, 'm893'),
 (0.7730166912078857, 'm1063'),
 (0.7658917903900146, 'c813'),
 (0.7515822052955627, 'm381'),
 (0.7489995956420898, 'm340'),
 (0.72740238904953, 'playtouch')]

### Models

In [121]:
model.get_nearest_neighbors('ps2')

[(0.8771463632583618, 'ps3'),
 (0.8322136998176575, '2k5'),
 (0.8280540704727173, '2k3'),
 (0.8255287408828735, '2k7'),
 (0.8206874132156372, '2k8'),
 (0.8190405964851379, '2k6'),
 (0.8187625408172607, '2k9'),
 (0.8139417171478271, '2k12'),
 (0.8130795955657959, 'psp'),
 (0.8069602251052856, 'Â')]

In [122]:
model.get_nearest_neighbors('razr')

[(0.8850070834159851, 'a855'),
 (0.8770463466644287, 'krzr'),
 (0.8685644865036011, 'e71'),
 (0.8587075471878052, 'i90c'),
 (0.8577662706375122, 'i60c'),
 (0.8528813719749451, 'sgh'),
 (0.8458672165870667, 'i55sr'),
 (0.8454095721244812, 'i95cl'),
 (0.8451084494590759, 'i50sx'),
 (0.8444380164146423, 'a957')]

In [123]:
model.get_nearest_neighbors('stratocaster')

[(0.9207808375358582, 'telecaster'),
 (0.9023835062980652, 'starcaster'),
 (0.8275824785232544, 'strat'),
 (0.8056992888450623, 'squier'),
 (0.7930323481559753, 'forecaster'),
 (0.7841600775718689, 'fender'),
 (0.7797226905822754, 'hss'),
 (0.7754448056221008, 'synyster'),
 (0.7649683356285095, 'sunburst'),
 (0.757795512676239, 'tremolo')]

#### Other

In [124]:
model.get_nearest_neighbors('holiday')

[(0.9776589870452881, 'holidays'),
 (0.8339588046073914, 'kwanzaa'),
 (0.8210069537162781, 'día'),
 (0.8169721364974976, 'hanukkah'),
 (0.8144399523735046, 'buy'),
 (0.8116978406906128, 'vibes'),
 (0.8087191581726074, 'cumpleaños'),
 (0.8067864179611206, 'gracias'),
 (0.8067839741706848, 'congrats'),
 (0.8001271486282349, 'navidad')]

In [125]:
model.get_nearest_neighbors('plasma')

[(0.8561182618141174, '600hz'),
 (0.8143249154090881, '480hz'),
 (0.8096691966056824, 'hdtvs'),
 (0.7964454293251038, 'edtv'),
 (0.7928016185760498, '58'),
 (0.7918755412101746, '63'),
 (0.7911931276321411, 'hdtv'),
 (0.790882408618927, '480p'),
 (0.7820399403572083, 'aquos'),
 (0.7800616025924683, '720p')]

In [126]:
model.get_nearest_neighbors('leather')

[(0.9126930832862854, 'leatherskin'),
 (0.7073385715484619, 'recliner'),
 (0.6906315684318542, 'hipcase'),
 (0.6888184547424316, 'berkline'),
 (0.6798998117446899, 'curved'),
 (0.6694930791854858, 'theaterseatstore'),
 (0.6683762669563293, 'magnolia'),
 (0.6619722843170166, 'seating'),
 (0.6597440838813782, 'armless'),
 (0.652084231376648, 'slipcase')]

## Increasing epochs = 25

In [128]:
model = fasttext.train_unsupervised(model = 'skipgram',
                                    input = '/workspace/datasets/fasttext/normalized_titles.txt',
                                    epoch = 25)

Read 1M words
Number of words:  8903
Number of labels: 0
Progress: 100.0% words/sec/thread:   19277 lr:  0.000000 avg.loss:  1.076012 ETA:   0h 0m 0s


#### Product types

In [129]:
model.get_nearest_neighbors('headphones')

[(0.9101961851119995, 'earbud'),
 (0.868574321269989, 'headphone'),
 (0.832120954990387, 'ear'),
 (0.6951853036880493, 'earphones'),
 (0.6881842613220215, 'superbudz'),
 (0.6869868636131287, '2xl'),
 (0.680428683757782, 'lowrider'),
 (0.6717972755432129, 'bud'),
 (0.6688070893287659, 'backbeat'),
 (0.6511826515197754, 'behind')]

In [130]:
model.get_nearest_neighbors('laptop')

[(0.6615343689918518, 'notebook'),
 (0.6596641540527344, '176'),
 (0.6523770093917847, 'netbook'),
 (0.6413260698318481, '178'),
 (0.6207646727561951, 'zenbook'),
 (0.6180056929588318, '179'),
 (0.6166394948959351, '156b'),
 (0.6124466061592102, 's5919'),
 (0.6123392581939697, '177'),
 (0.6059682965278625, 'ultrabook')]

In [131]:
model.get_nearest_neighbors('freezer')

[(0.9144962430000305, 'freezers'),
 (0.7456715703010559, 'refrigerator'),
 (0.6942262649536133, 'mug'),
 (0.6694881916046143, 'refrigerators'),
 (0.6641954779624939, 'cu'),
 (0.6561124324798584, 'ft'),
 (0.6533827185630798, 'side'),
 (0.6421054601669312, 'bottom'),
 (0.6369955539703369, 'satina'),
 (0.6289098262786865, 'ice2o')]

#### Brands

In [132]:
model.get_nearest_neighbors('nintendo')

[(0.9488509893417358, 'ds'),
 (0.9385260939598083, 'wii'),
 (0.8369333744049072, 'nintendogs'),
 (0.7798648476600647, '3ds'),
 (0.7521668076515198, 'gamecube'),
 (0.7356975674629211, 'psp'),
 (0.7232993841171265, 'playstation'),
 (0.7175878882408142, 'advance'),
 (0.705117404460907, 'boy'),
 (0.6871598362922668, 'xbox')]

In [133]:
model.get_nearest_neighbors('whirlpool')

[(0.7844407558441162, 'maytag'),
 (0.7664726972579956, 'biscuit'),
 (0.7518386244773865, 'frigidaire'),
 (0.7381290197372437, 'ge'),
 (0.7075919508934021, 'inglis'),
 (0.6973232626914978, 'bisque'),
 (0.67243492603302, 'hotpoint'),
 (0.6674540042877197, 'cleansteel'),
 (0.650240957736969, 'satina'),
 (0.6423548460006714, 'gas')]

In [134]:
model.get_nearest_neighbors('kodak')

[(0.856397807598114, 'easyshare'),
 (0.6832489967346191, 'c813'),
 (0.6769503355026245, 'm863'),
 (0.6702114939689636, 'm893'),
 (0.6549410223960876, 'playsport'),
 (0.6524822115898132, 'm763'),
 (0.6497923135757446, 'm1063'),
 (0.6411676406860352, 'm381'),
 (0.6410190463066101, 'playtouch'),
 (0.628979504108429, 'm341')]

### Models

In [135]:
model.get_nearest_neighbors('ps2')

[(0.743659257888794, 'ps3'),
 (0.7289877533912659, 'playstation'),
 (0.7154685258865356, 'gamecube'),
 (0.705649197101593, 'xbox'),
 (0.6996206641197205, 'psp'),
 (0.6672667264938354, 'gba'),
 (0.6635426878929138, 'guide'),
 (0.6567842364311218, '360'),
 (0.628485918045044, '2k5'),
 (0.626676082611084, 'codes')]

In [136]:
model.get_nearest_neighbors('razr')

[(0.7872809767723083, 'motorola'),
 (0.7501631379127502, 'krzr'),
 (0.726353645324707, 't720'),
 (0.7113621234893799, 'droid'),
 (0.7106636762619019, 'a855'),
 (0.6878895163536072, 'backflip'),
 (0.6824169158935547, 'captivate'),
 (0.671724259853363, 'r225'),
 (0.6716495752334595, 'katana'),
 (0.6691858768463135, 'photon')]

In [137]:
model.get_nearest_neighbors('stratocaster')

[(0.8802252411842346, 'telecaster'),
 (0.791517972946167, 'fender'),
 (0.7821407318115234, 'fretboard'),
 (0.7768542766571045, 'strat'),
 (0.7744718790054321, 'starcaster'),
 (0.7549101114273071, 'squier'),
 (0.7400326132774353, 'hss'),
 (0.7013980150222778, 'worn'),
 (0.680253267288208, 'thinline'),
 (0.6756905913352966, 'tremolo')]

#### Other

In [138]:
model.get_nearest_neighbors('holiday')

[(0.9434413909912109, 'holidays'),
 (0.7283152937889099, 'vibes'),
 (0.7019318342208862, 'nobr'),
 (0.6571457386016846, 'stocking'),
 (0.6431208848953247, 'grab'),
 (0.6324362754821777, 'dreidel'),
 (0.6274356245994568, 'reuse'),
 (0.6199308037757874, 'despicable'),
 (0.6103247404098511, 'kwanzaa'),
 (0.6097993850708008, 'hanukkah')]

In [139]:
model.get_nearest_neighbors('plasma')

[(0.7716155052185059, '600hz'),
 (0.7149290442466736, 'hdtv'),
 (0.7122379541397095, '480hz'),
 (0.6807251572608948, 'ambilight'),
 (0.656670331954956, 'viera'),
 (0.6427713632583618, 'kuro'),
 (0.6397998929023743, 'edtv'),
 (0.638934850692749, 'purevision'),
 (0.622340738773346, '58'),
 (0.6158860921859741, 'hdtvs')]

In [140]:
model.get_nearest_neighbors('leather')

[(0.7891209721565247, 'leatherskin'),
 (0.6668113470077515, 'armless'),
 (0.6656706929206848, 'recliner'),
 (0.651976466178894, 'sofa'),
 (0.6494664549827576, 'berkline'),
 (0.5985611081123352, 'curved'),
 (0.5938190221786499, 'executive'),
 (0.5930474400520325, 'maccase'),
 (0.592458188533783, 'dolan'),
 (0.5848910808563232, 'theaterseatstore')]

### Ignoring rare words: minCount = 20

In [158]:
model = fasttext.train_unsupervised(model = 'skipgram',
                                    input = '/workspace/datasets/fasttext/normalized_titles.txt',
                                    epoch = 25,
                                    minCount = 20)

Read 1M words
Number of words:  3861
Number of labels: 0
Progress: 100.0% words/sec/thread:   29796 lr:  0.000000 avg.loss:  1.194604 ETA:   0h 0m 0s


#### Product types

In [142]:
model.get_nearest_neighbors('headphones')

[(0.9077590703964233, 'earbud'),
 (0.8633512258529663, 'ear'),
 (0.8367388248443604, 'headphone'),
 (0.6919692754745483, 'earphones'),
 (0.6578373908996582, 'lowrider'),
 (0.632967472076416, 'bud'),
 (0.6313893795013428, 'canceling'),
 (0.6285480260848999, 'behind'),
 (0.6254199743270874, 'isolating'),
 (0.6225119829177856, '2xl')]

In [143]:
model.get_nearest_neighbors('laptop')

[(0.696130633354187, 'notebook'),
 (0.6253318190574646, 'netbook'),
 (0.5776479244232178, 'ultrabook'),
 (0.5627023577690125, 'laptops'),
 (0.5543901324272156, 'biscotti'),
 (0.5523476004600525, 'briefcase'),
 (0.5378716588020325, '6gb'),
 (0.5368289947509766, 'mouse'),
 (0.527636706829071, 'notebooks'),
 (0.5220662355422974, 'dell')]

In [144]:
model.get_nearest_neighbors('freezer')

[(0.7267799973487854, 'refrigerator'),
 (0.6866878867149353, 'mug'),
 (0.6628516912460327, 'cu'),
 (0.6429688334465027, 'ft'),
 (0.6273143291473389, 'satina'),
 (0.6246259808540344, 'bottom'),
 (0.6219789385795593, 'frost'),
 (0.618651270866394, 'refrigerators'),
 (0.6148020625114441, 'side'),
 (0.5872531533241272, 'customstyle')]

#### Brands

In [145]:
model.get_nearest_neighbors('nintendo')

[(0.951678991317749, 'ds'),
 (0.9297282099723816, 'wii'),
 (0.772692859172821, 'gamecube'),
 (0.7635993361473083, '3ds'),
 (0.7447153329849243, 'advance'),
 (0.7401416301727295, 'psp'),
 (0.722452700138092, 'boy'),
 (0.7012931108474731, 'playstation'),
 (0.6968538761138916, 'xbox'),
 (0.6683907508850098, '360')]

In [146]:
model.get_nearest_neighbors('whirlpool')

[(0.7700554132461548, 'maytag'),
 (0.7523990869522095, 'biscuit'),
 (0.736309826374054, 'frigidaire'),
 (0.7147598266601562, 'ge'),
 (0.708550214767456, 'inglis'),
 (0.6801213026046753, 'bisque'),
 (0.6548628807067871, 'hotpoint'),
 (0.642379641532898, 'cleansteel'),
 (0.6310959458351135, 'accubake'),
 (0.6188372373580933, 'lg')]

In [147]:
model.get_nearest_neighbors('kodak')

[(0.8246306777000427, 'easyshare'),
 (0.6692595481872559, 'm863'),
 (0.6459836959838867, 'playsport'),
 (0.6437517404556274, 'm893'),
 (0.6318531632423401, 'playtouch'),
 (0.6304375529289246, 'm1063'),
 (0.6267918944358826, 'm340'),
 (0.5941880345344543, 'esp'),
 (0.5790358185768127, 'packard'),
 (0.5651094913482666, 'canon')]

### Models

In [148]:
model.get_nearest_neighbors('ps2')

[(0.7710532546043396, 'gamecube'),
 (0.7530308961868286, 'ps3'),
 (0.7519135475158691, 'xbox'),
 (0.735420823097229, 'guide'),
 (0.7233341932296753, 'playstation'),
 (0.7192734479904175, 'gba'),
 (0.6991167664527893, '360'),
 (0.6877713799476624, 'psp'),
 (0.6404472589492798, 'wii'),
 (0.6385394334793091, 'game')]

In [149]:
model.get_nearest_neighbors('razr')

[(0.8113381862640381, 'motorola'),
 (0.7371107935905457, 'droid'),
 (0.6878838539123535, 'nokia'),
 (0.6594678163528442, 'atrix'),
 (0.621213972568512, 'sph'),
 (0.6165000200271606, 'phones'),
 (0.595081090927124, '9700'),
 (0.5907431244850159, 'treo'),
 (0.5896117687225342, 'cell'),
 (0.5837921500205994, '8530')]

In [150]:
model.get_nearest_neighbors('stratocaster')

[(0.8738535642623901, 'telecaster'),
 (0.7752670645713806, 'strat'),
 (0.7598609924316406, 'fender'),
 (0.7562196254730225, 'squier'),
 (0.7503111362457275, 'fretboard'),
 (0.6443420052528381, 'sunburst'),
 (0.6072401404380798, 'tele'),
 (0.606623649597168, 'rosewood'),
 (0.5614346265792847, 'jazz'),
 (0.5499764680862427, 'burst')]

#### Other

In [151]:
model.get_nearest_neighbors('holiday')

[(0.623079776763916, 'hanukkah'),
 (0.6095627546310425, 'kwanzaa'),
 (0.5833070874214172, 'congratulations'),
 (0.5734252333641052, 'graduation'),
 (0.5693238973617554, 'buy'),
 (0.5664883852005005, 'merry'),
 (0.5622283816337585, 'navidad'),
 (0.5601946115493774, 'gift'),
 (0.5568721890449524, 'happy'),
 (0.547725260257721, 'cumpleaños')]

In [152]:
model.get_nearest_neighbors('plasma')

[(0.7620000243186951, '600hz'),
 (0.6796069145202637, 'hdtv'),
 (0.6122992038726807, 'viera'),
 (0.5990015864372253, 'dlp'),
 (0.5792747735977173, '58'),
 (0.5751847624778748, 'hdtvs'),
 (0.5733826756477356, '63'),
 (0.5717700123786926, 'vivitek'),
 (0.5701650381088257, '46'),
 (0.5644906163215637, 'tvs')]

In [186]:
model.get_nearest_neighbors('leather')

[(0.6537390947341919, 'armless'),
 (0.6365533471107483, 'recliner'),
 (0.5998279452323914, 'berkline'),
 (0.5972900986671448, 'seating'),
 (0.5934311747550964, 'curved'),
 (0.5850229263305664, 'theaterseatstore'),
 (0.5704469680786133, 'magnolia'),
 (0.5590958595275879, 'bodhi'),
 (0.5548452138900757, 'sofa'),
 (0.5489217042922974, 'headliner')]

In [188]:
model.get_nearest_neighbors('leather')[0][1]

'armless'

In [159]:
model.save_model('workspace/datasets/fasttext//title_model.bin')

ValueError: workspace/datasets/fasttext/title_model.bin cannot be opened for saving!

# Level 3: Integrating Synonyms with Search
`cat /workspace/datasets/fasttext/normalized_titles.txt | tr " " "\n" | grep "...." | sort | uniq -c | sort -nr | head -1000 | grep -oE '[^ ]+$' > /workspace/datasets/fasttext/top_words.txt`

The above does the following:

* Replace each space with a newline, so we get one word per line.
* Only keep words containing at least 4 characters.
* Sort the words in alphabetical order.
* Deduplicate the words and keep the counts, yielding a 2-columns file where each line is a count followed by the word.
* Sort the count-word pairs in descending order of count.
* Keep only the top 1,000 entries, i.e., the 1,000 most frequently occurring words.
* Remove the counts so we only output the words.
* Output the result of this process to /workspace/datasets/fasttext/top_words.txt.

In [160]:
skipgram_model = model# fastText.load_model('workspace/datasets/fasttext/title_model.bin')
top_words = pd.read_table('/workspace/datasets/fasttext/top_words.txt', header = None)
top_words.head()

Unnamed: 0,0
0,black
1,with
2,digital
3,white
4,case


In [208]:
model.get_nearest_neighbors(top_words[0][0])[0][0]

0.6925215721130371

In [211]:
synonyms_df = []
for i in range(0, len(top_words)):
   # synonyms_df.at[0, i] = top_words[0][i]
    nn = skipgram_model.get_nearest_neighbors(top_words[0][i])
    synonyms = []

    for j in range(0, len(nn)):
        similarity = nn[j][0]
        if similarity >= 0.75:
            synonyms.append(nn[j][1])
        else:
            pass
    
    if len(synonyms) > 0:
        synonyms.insert(0, top_words[0][i])
        synonyms_df.append(synonyms)
    else: 
        pass

In [214]:
synonyms_df = pd.DataFrame(synonyms_df)

In [215]:
synonyms_df.to_csv('/workspace/datasets/fasttext/synonyms.csv', sep = ',', header = None)

: 

Copy file to Docker container: `docker cp /workspace/datasets/fasttext/synonyms.csv opensearch-node1:/usr/share/opensearch/config/synonyms.csv`