## Data Extraction & Analysis

In [5]:
# https://www.kaggle.com/rmisra/news-category-dataset -> archive.zip
# https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz -> cooking.stackexchange.txt

!unzip data/archive.zip -d data/
!mv data/News_Category_Dataset_v2.json data/news-articles.jsonl

Archive:  data/archive.zip
  inflating: data/News_Category_Dataset_v2.json  


In [17]:
!head -2 data/news-articles.jsonl

{"category": "CRIME", "headline": "There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV", "authors": "Melissa Jeltsen", "link": "https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89", "short_description": "She left her husband. He killed their children. Just another day in America.", "date": "2018-05-26"}
{"category": "ENTERTAINMENT", "headline": "Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song", "authors": "Andy McDonald", "link": "https://www.huffingtonpost.com/entry/will-smith-joins-diplo-and-nicky-jam-for-the-official-2018-world-cup-song_us_5b09726fe4b0fdb2aa541201", "short_description": "Of course it has a song.", "date": "2018-05-26"}


In [12]:
!head -5 data/cooking.stackexchange.txt

__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?
__label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments
__label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?
__label__restaurant Michelin Three Star Restaurant; but if the chef is not there
__label__knife-skills __label__dicing Without knife skills, how can I quickly and accurately dice vegetables?


## Data Preparation & Standardization

In [28]:
import json
fileReader = open("data/news-articles.jsonl", "r")
fileWriter = open("data/news-articles.txt", "w")
for line in fileReader:
    news = dict(json.loads(line))
    fileWriter.write("__label__"+news["category"].lower()+" "+news["headline"].lower()+"\n")

In [30]:
!head -5 data/news-articles.txt

__label__crime there were 2 mass shootings in texas last week, but only 1 on tv
__label__entertainment will smith joins diplo and nicky jam for the 2018 world cup's official song
__label__entertainment hugh grant marries for the first time at age 57
__label__entertainment jim carrey blasts 'castrato' adam schiff and democrats in new artwork
__label__entertainment julianna margulies uses donald trump poop bags to pick up after her dog


## Splitting dataset into training & testing

In [32]:
!wc data/news-articles.txt

  200832  2189821 15670354 data/news-articles.txt


In [34]:
!head -n 160000 data/news-articles.txt > data/news-articles.train
!tail -n 40832 data/news-articles.txt > data/news-articles.valid

## Usage FastText basic

In [35]:
import fasttext

model = fasttext.train_supervised(input="data/news-articles.train")

In [36]:
model.save_model("model/news-classifier-v1.bin")

In [39]:
modelLoaded = fasttext.load_model("model/news-classifier-v1.bin")



In [40]:
model.predict("Roger Federer wins US Grand Slam Men's final")

(('__label__sports',), array([0.91453463]))

In [41]:
model.predict("North Korea threatens Japan with back to back 4 nuclear tests")

(('__label__politics',), array([0.88016534]))

In [42]:
model.predict("Avengers becomes the highest grossing film")

(('__label__entertainment',), array([0.87965882]))

In [44]:
model.test("data/news-articles.valid")

(40832, 0.5363685344827587, 0.5363685344827587)

In [50]:
model.test("data/news-articles.valid", k=5)

(40832, 0.1458170062695925, 0.7290850313479624)

In [48]:
model.predict("Britain exit from the European Union confirmed", k=5)

(('__label__politics',
  '__label__worldpost',
  '__label__impact',
  '__label__business',
  '__label__religion'),
 array([0.41946396, 0.15596035, 0.13890333, 0.09830396, 0.02962857]))

# Tweaking parameters in FastText

In [71]:
model.predict("india won by 42 runs in the final of ICC cricket world cup", k=5)

(('__label__sports',
  '__label__worldpost',
  '__label__travel',
  '__label__religion',
  '__label__media'),
 array([0.57772529, 0.15812632, 0.05788837, 0.02945946, 0.02899473]))

In [57]:
modelv2 = fasttext.train_supervised(input="data/news-articles.train", epoch=25)

In [58]:
modelv2.test("data/news-articles.valid")

(40832, 0.617701802507837, 0.617701802507837)

In [70]:
modelv2.predict("india won by 42 runs in the final of ICC cricket world cup", k=5)

(('__label__sports',
  '__label__worldpost',
  '__label__travel',
  '__label__politics',
  '__label__impact'),
 array([0.72516674, 0.16883804, 0.05571109, 0.03433479, 0.00498041]))

In [72]:
modelv3 = fasttext.train_supervised(input="data/news-articles.train", lr=1.0)

In [73]:
modelv3.test("data/news-articles.valid")

(40832, 0.698226880877743, 0.698226880877743)

In [74]:
modelv3.predict("india won by 42 runs in the final of ICC cricket world cup", k=5)

(('__label__sports',
  '__label__worldpost',
  '__label__religion',
  '__label__impact',
  '__label__taste'),
 array([0.64999002, 0.09997158, 0.06355541, 0.04865587, 0.04250089]))

In [75]:
modelv4 = fasttext.train_supervised(input="data/news-articles.train", epoch=25, lr=1.0)

In [76]:
modelv4.test("data/news-articles.valid")

(40832, 0.5116085423197492, 0.5116085423197492)

In [91]:
modelv4.predict("narendra modi aquitted for gujarat riots by the court", k=5)

(('__label__politics',
  '__label__healthy',
  '__label__style',
  '__label__money',
  '__label__culture'),
 array([1.00001001e+00, 1.00000034e-05, 1.00000034e-05, 1.00000034e-05,
        1.00000034e-05]))

In [92]:
modelv5 = fasttext.train_supervised(input="data/news-articles.train", epoch=25, wordNgrams=2)

In [93]:
modelv5.test("data/news-articles.valid")

(40832, 0.694283894984326, 0.694283894984326)

In [94]:
modelv6 = fasttext.train_supervised(input="data/news-articles.train", epoch=25, lr=1.0, wordNgrams=2)

In [95]:
modelv6.test("data/news-articles.valid")

(40832, 0.6719974529780565, 0.6719974529780565)

In [96]:
modelv7 = fasttext.train_supervised(input="data/news-articles.train", lr=1.0, wordNgrams=2)

In [97]:
modelv7.test("data/news-articles.valid")

(40832, 0.6640380094043887, 0.6640380094043887)

In [104]:
news = "narendra modi aquitted for gujarat riots by the court"
print(modelv5.predict(news, k=-1, threshold=0.1))
print(modelv6.predict(news, k=-1, threshold=0.1))
print(modelv7.predict(news, k=-1, threshold=0.1))

(('__label__worldpost', '__label__crime', '__label__politics'), array([0.30460522, 0.25598988, 0.14045343]))
(('__label__politics', '__label__crime', '__label__worldpost'), array([0.55176157, 0.24307342, 0.18037586]))
(('__label__politics', '__label__worldpost'), array([0.6828118 , 0.20768003]))


### Tricks for production scalability

In [105]:
modelv8 = fasttext.train_supervised(input="data/news-articles.train", lr=1.0, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='hs')

In [106]:
modelv8.test("data/news-articles.valid")

(40832, 0.6167221786833855, 0.6167221786833855)

In [107]:
modelv9 = fasttext.train_supervised(input="data/news-articles.train", lr=1.0, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='ova')

In [108]:
modelv9.test("data/news-articles.valid")

(40832, 0.6457435344827587, 0.6457435344827587)

In [109]:
news = "justin beiber and selena gomez splits after 4 years of relationship"
print(modelv8.predict(news, k=-1, threshold=0.1))
print(modelv9.predict(news, k=-1, threshold=0.1))

(('__label__entertainment',), array([1.00003827]))
(('__label__entertainment',), array([0.99663341]))
