## Data Extraction & Analysis

In [None]:
# https://www.kaggle.com/rmisra/news-category-dataset -> archive.zip
# https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz -> cooking.stackexchange.txt

!unzip data/archive.zip -d data/
!mv data/News_Category_Dataset_v2.json data/news-articles.jsonl

In [1]:
!head -2 data/news-articles.jsonl

{"category": "CRIME", "headline": "There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV", "authors": "Melissa Jeltsen", "link": "https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89", "short_description": "She left her husband. He killed their children. Just another day in America.", "date": "2018-05-26"}
{"category": "ENTERTAINMENT", "headline": "Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song", "authors": "Andy McDonald", "link": "https://www.huffingtonpost.com/entry/will-smith-joins-diplo-and-nicky-jam-for-the-official-2018-world-cup-song_us_5b09726fe4b0fdb2aa541201", "short_description": "Of course it has a song.", "date": "2018-05-26"}


In [2]:
!head -5 data/cooking.stackexchange.txt

__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?
__label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments
__label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?
__label__restaurant Michelin Three Star Restaurant; but if the chef is not there
__label__knife-skills __label__dicing Without knife skills, how can I quickly and accurately dice vegetables?


## Data Preparation & Standardization

In [3]:
import json
fileReader = open("data/news-articles.jsonl", "r")
fileWriter = open("data/news-articles.txt", "w")
for line in fileReader:
    news = dict(json.loads(line))
    fileWriter.write("__label__"+news["category"].lower()+" "+news["headline"].lower()+"\n")

In [4]:
!head -5 data/news-articles.txt

__label__crime there were 2 mass shootings in texas last week, but only 1 on tv
__label__entertainment will smith joins diplo and nicky jam for the 2018 world cup's official song
__label__entertainment hugh grant marries for the first time at age 57
__label__entertainment jim carrey blasts 'castrato' adam schiff and democrats in new artwork
__label__entertainment julianna margulies uses donald trump poop bags to pick up after her dog


## Splitting dataset into training & testing

In [5]:
!wc data/news-articles.txt

  200832  2189821 15670354 data/news-articles.txt


In [6]:
!head -n 160000 data/news-articles.txt > data/news-articles.train
!tail -n 40832 data/news-articles.txt > data/news-articles.valid

## Usage FastText basic

In [7]:
import fasttext

model = fasttext.train_supervised(input="data/news-articles.train")

In [8]:
model.save_model("model/news-classifier-v1.bin")

In [9]:
modelLoaded = fasttext.load_model("model/news-classifier-v1.bin")



In [10]:
model.predict("Roger Federer wins US Grand Slam Men's final")

(('__label__sports',), array([0.98920584]))

In [11]:
model.predict("North Korea threatens Japan with back to back 4 nuclear tests")

(('__label__politics',), array([0.87190288]))

In [12]:
model.predict("Avengers becomes the highest grossing film")

(('__label__entertainment',), array([0.94130796]))

In [13]:
model.test("data/news-articles.valid")

(40832, 0.7142437304075235, 0.7142437304075235)

In [14]:
model.test("data/news-articles.valid", k=5)

(40832, 0.1792662617554859, 0.8963313087774295)

In [15]:
model.predict("finance minister proposed union budget for 2020-21", k=5)

(('__label__politics',
  '__label__worldpost',
  '__label__business',
  '__label__green',
  '__label__media'),
 array([0.9599784 , 0.02671684, 0.00617307, 0.00214208, 0.00102552]))

# Tweaking parameters in FastText

In [16]:
modelv2 = fasttext.train_supervised(input="data/news-articles.train", epoch=25)
modelv2.test("data/news-articles.valid")

(40832, 0.6333512931034483, 0.6333512931034483)

In [17]:
modelv3 = fasttext.train_supervised(input="data/news-articles.train", lr=1.0)
modelv3.test("data/news-articles.valid")

(40832, 0.6725852272727273, 0.6725852272727273)

In [18]:
modelv4 = fasttext.train_supervised(input="data/news-articles.train", epoch=25, lr=1.0)
modelv4.test("data/news-articles.valid")

(40832, 0.5824843260188087, 0.5824843260188087)

In [19]:
modelv2.predict("india won by 42 runs in the final of ICC cricket world cup", k=5)

(('__label__worldpost',
  '__label__sports',
  '__label__travel',
  '__label__politics',
  '__label__taste'),
 array([0.44614202, 0.41810945, 0.06781565, 0.04007414, 0.00905261]))

In [20]:
modelv3.predict("india won by 42 runs in the final of ICC cricket world cup", k=5)

(('__label__sports',
  '__label__travel',
  '__label__worldpost',
  '__label__impact',
  '__label__taste'),
 array([0.73859113, 0.13595396, 0.05320239, 0.02036113, 0.01442484]))

In [21]:
modelv4.predict("narendra modi aquitted for gujarat riots by the court", k=5)

(('__label__politics',
  '__label__healthy',
  '__label__style',
  '__label__money',
  '__label__culture'),
 array([1.00001001e+00, 1.00000034e-05, 1.00000034e-05, 1.00000034e-05,
        1.00000034e-05]))

## wordNgrams usage

In [22]:
modelv5 = fasttext.train_supervised(input="data/news-articles.train", epoch=25, wordNgrams=2)
modelv5.test("data/news-articles.valid")

(40832, 0.6777282523510971, 0.6777282523510971)

In [23]:
modelv6 = fasttext.train_supervised(input="data/news-articles.train", epoch=25, lr=1.0, wordNgrams=2)
modelv6.test("data/news-articles.valid")

(40832, 0.670846394984326, 0.670846394984326)

In [24]:
modelv7 = fasttext.train_supervised(input="data/news-articles.train", lr=1.0, wordNgrams=2)
modelv7.test("data/news-articles.valid")

(40832, 0.6840958072100314, 0.6840958072100314)

In [25]:
news = "narendra modi aquitted for gujarat riots by the court"
print(modelv5.predict(news, k=-1, threshold=0.1))
print(modelv6.predict(news, k=-1, threshold=0.1))
print(modelv7.predict(news, k=-1, threshold=0.1))

(('__label__worldpost', '__label__crime'), array([0.32675183, 0.29385334]))
(('__label__politics', '__label__worldpost'), array([0.52305132, 0.39108673]))
(('__label__crime', '__label__politics', '__label__worldpost'), array([0.42876673, 0.31186277, 0.14133641]))


## Tricks for production scalability & multi label classification

In [26]:
modelv8 = fasttext.train_supervised(input="data/news-articles.train", lr=1.0, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='hs')
modelv8.test("data/news-articles.valid")

(40832, 0.6174568965517241, 0.6174568965517241)

In [27]:
modelv9 = fasttext.train_supervised(input="data/news-articles.train", lr=1.0, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='ova')
modelv9.test("data/news-articles.valid")

(40832, 0.655907131661442, 0.655907131661442)

In [28]:
news = "justin beiber and selena gomez splits after 4 years of relationship"
print(modelv8.predict(news, k=-1, threshold=0.1))
print(modelv9.predict(news, k=-1, threshold=0.1))

(('__label__entertainment',), array([0.99964833]))
(('__label__entertainment',), array([0.95397609]))


## Using Autotuning feature for hyperparameters

In [29]:
modelv10 = fasttext.train_supervised(input='data/news-articles.train', autotuneValidationFile='data/news-articles.valid')
modelv10.test("data/news-articles.valid")

(40832, 0.70579447492163, 0.70579447492163)

In [30]:
modelv11 = fasttext.train_supervised(input='data/news-articles.train', autotuneValidationFile='data/news-articles.valid', autotuneDuration=600)
modelv11.test("data/news-articles.valid")

(40832, 0.6934022335423198, 0.6934022335423198)

In [31]:
modelv12 = fasttext.train_supervised(input='data/news-articles.train', autotuneValidationFile='data/news-articles.valid', autotuneModelSize="2M")
modelv12.test("data/news-articles.valid")

(40832, 0.6831406739811913, 0.6831406739811913)