# Playground
Much like my Notepad, this playground is a place for me to test things.

In [27]:
import pandas as pd
import numpy as np

In [28]:
labeled_products = pd.read_table('/workspace/datasets/fasttext/shuffled_labeled_products.txt',
    header = None)

In [29]:
labeled_products.head()

Unnamed: 0,0
0,__label__abcat0102006 Insignia® - Progressive-...
1,__label__pcmcat247400050000 Sony - VAIO Laptop...
2,__label__pcmcat180400050006 NIKON D3000 Digita...
3,"__label__pcmcat200900050014 LG - 27"" Widescree..."
4,__label__pcmcat167300050040 Samsung - 1000W 5....


In [30]:
labeled_products.shape

(115503, 1)

In [31]:
training_data = labeled_products.loc[0:9999]
training_data.shape

(10000, 1)

In [32]:
training_data.head()

Unnamed: 0,0
0,__label__abcat0102006 Insignia® - Progressive-...
1,__label__pcmcat247400050000 Sony - VAIO Laptop...
2,__label__pcmcat180400050006 NIKON D3000 Digita...
3,"__label__pcmcat200900050014 LG - 27"" Widescree..."
4,__label__pcmcat167300050040 Samsung - 1000W 5....


In [33]:
training_data.to_csv('/workspace/datasets/fasttext/training_data.txt', header = None, index = None, mode = 'a')


In [34]:
test_data = labeled_products.tail(10000)
test_data.shape

(10000, 1)

In [35]:
test_data.to_csv('/workspace/datasets/fasttext/test_data.txt', header = None, index = None, mode = 'a')


## Train model!

In [36]:
import fasttext

In [37]:
# Train model
product_classifier = fasttext.train_supervised(input = '/workspace/datasets/fasttext/training_data.txt')

Read 4M words
Number of words:  38798
Number of labels: 1897
Progress: 100.0% words/sec/thread:   12092 lr:  0.000000 avg.loss:  2.152814 ETA:   0h 0m 0s


In [38]:
product_classifier.save_model("product_classifier")

### @1

In [39]:
product_classifier.test('/workspace/datasets/fasttext/test_data.txt')

(480640, 0.7850282956058589, 0.7850282956058589)

### @5

In [40]:
product_classifier.test('/workspace/datasets/fasttext/test_data.txt', k = 5)

(480640, 0.18833305592543276, 0.9416652796271637)

### @10

In [41]:
product_classifier.test('/workspace/datasets/fasttext/test_data.txt', k = 10)

(480640, 0.0962970206391478, 0.962970206391478)

## Fine tunning - epoch = 25 and lr = 1.0

In [42]:
model = fasttext.train_supervised(input = '/workspace/datasets/fasttext/training_data.txt', 
                                  lr = 1, 
                                  epoch = 25)

Read 4M words
Number of words:  38798
Number of labels: 1897
Progress:   1.5% words/sec/thread:   11792 lr:  0.984695 avg.loss:  4.290653 ETA:   0h 9m22s

RuntimeError: Encountered NaN.

In [107]:
model.test('/workspace/datasets/fasttext/test_data.txt')

(378757, 0.9202575793978726, 0.9202575793978726)

## Fine tunning - bigrams

In [108]:
model = fasttext.train_supervised(input = '/workspace/datasets/fasttext/training_data.txt', 
                                  lr = 1.0, 
                                  epoch = 25,
                                  wordNgrams = 2)

Read 3M words
Number of words:  37232
Number of labels: 1882
Progress: 100.0% words/sec/thread:   10814 lr:  0.000000 avg.loss:  0.193674 ETA:   0h 0m 0s 37.4% words/sec/thread:   10929 lr:  0.626164 avg.loss:  0.417298 ETA:   0h 5m42s 0.476706 avg.loss:  0.325601 ETA:   0h 4m24s


In [109]:
model.test('/workspace/datasets/fasttext/test_data.txt')

(378757, 0.9307603555841872, 0.9307603555841872)

## Using normalized data

In [110]:
model = fasttext.train_supervised(input = '/workspace/datasets/fasttext/normalized_training.txt', 
                                  lr = 0.8, 
                                  epoch = 25)

Read 1M words
Number of words:  23322
Number of labels: 1939
Progress: 100.0% words/sec/thread:    8073 lr:  0.000000 avg.loss:  0.801934 ETA:   0h 0m 0s 0.429383 avg.loss:  1.441951 ETA:   0h 2m19sh 1m25s


In [111]:
model.test('/workspace/datasets/fasttext/normalized_test.txt')

(240950, 0.9425274953309816, 0.9425274953309816)

## Filtering out for at least N = 500 products

In [None]:
labeled_products.head()

Unnamed: 0,0
0,__label__pcmcat180700050008 Bosch - 500 Series...
1,__label__abcat0701003 Bayonetta - Xbox 360
2,"__label__abcat0205003 Sony - 8"" 3-Way Floor Sp..."
3,__label__pcmcat251500050009 Alfred - Advanced ...
4,__label__pcmcat226200050016 Tribeca - Detroit ...


In [None]:
N = 500

