# Dataset load

In [85]:
import pandas as pd

df = pd.read_json('News_Category_Dataset_v3_balanced.json', lines=True)
display(df)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,SCIENCE & TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


In [86]:
text = df.iloc[1]['short_description']
text

"He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles."

In [87]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state = 1)
train = train.reset_index()
test = test.reset_index()

# Naive_bayes

In [88]:
import nltk
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/iknow/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/iknow/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/iknow/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/iknow/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/iknow/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/iknow/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!

True

In [89]:
from nltk.tokenize import word_tokenize

In [90]:
print(word_tokenize(text))

['He', 'was', 'subdued', 'by', 'passengers', 'and', 'crew', 'when', 'he', 'fled', 'to', 'the', 'back', 'of', 'the', 'aircraft', 'after', 'the', 'confrontation', ',', 'according', 'to', 'the', 'U.S.', 'attorney', "'s", 'office', 'in', 'Los', 'Angeles', '.']


In [91]:
import sys, math

In [111]:
class Naive_bayes: 
    def __init__(self):
        self.words = set() # list of word 
        self.word_freq = {} # frequency of word
        self.category_freq = {} # frequency of category

    def word_count(self, word, category):
        if not category in self.word_freq: # If category doesn't exist in word_freq, add category
            self.word_freq[category] = {}
        if not word in self.word_freq[category]: # If word doesn't exist in word_freq[category], add word
            self.word_freq[category][word] = 0
        
        self.word_freq[category][word] += 1
        self.words.add(word)
    
    def category_count(self, category):
        if not category in self.category_freq:
            self.category_freq[category] = 0
        self.category_freq[category] += 1
    
    def train(self, text, category):
        word_list = word_tokenize(text)
        for word in word_list:
            self.word_count(word, category)
        self.category_count(category)

    def score(self, words, category):
        prob = math.log(self.category_prob(category))
        for word in words:
            prob *= math.log(self.word_prob(word, category))
        
        return prob

    def predict(self, text):
        most_category = None
        high_score = -sys.maxsize
        words = word_tokenize(text)
        score_list = []

        for category in self.category_freq.keys():
            score = self.score(words, category)
            score_list.append((category, score))
            if score > high_score:
                high_score = score
                most_category = category
        
        return most_category, score_list

    def get_word_count(self, word, category):
        if word in self.word_freq[category]:
            return self.word_freq[category][word]
        else:
            return 0
        
    def category_prob(self, category):
        category_sum = sum(self.category_freq.values())
        category_num = self.category_freq[category]
        return category_num / category_sum

    def word_prob(self, word, category):
        # smoothing
        word_num = self.get_word_count(word, category) + 1
        word_sum = sum(self.word_freq[category].values()) + len(self.words)
        return word_num / word_sum

In [112]:
from tqdm import tqdm

In [113]:
Nb = Naive_bayes()

for i in tqdm(range(len(train))):
    Nb.train(train.iloc[i]['headline'] + train.iloc[i]['short_description'], train.iloc[i]['category'])


100%|██████████| 167621/167621 [01:23<00:00, 2006.08it/s]


In [116]:
pred, scorelist = Nb.predict(test.iloc[0]['headline'] + test.iloc[0]['short_description'])
print("result: ", pred)
print("ground_truth: " + test.iloc[0]['headline'] + test.iloc[0]['short_description'] + test.iloc[0]['category'] )
print(scorelist)

result:  U.S. NEWS
ground_truth: As Pay Cards Replace Paychecks, Bank Fees Hurt Workers - NYTimes.comA growing number of American workers are confronting a frustrating predicament on payday: to get their wages, they must firstBUSINESS & FINANCES
[('COMEDY', 2.5796653553333055e+32), ('WORLD NEWS', 1.3981169414712611e+31), ('WOMEN', 1.9692069591544758e+32), ('PARENTING', 1.382275949125197e+31), ('SCIENCE & TECH', 1.279938911787744e+32), ('EDUCATION', 2.2609709477411015e+32), ('WELLNESS', 5.196671828144507e+30), ('MEDIA', 5.837809070643403e+32), ('TRAVEL', 1.264597136731212e+31), ('IMPACT', 4.402181620358873e+31), ('POLITICS', 2.198328108740048e+30), ('CRIME', 3.404395600742321e+32), ('BUSINESS & FINANCES', 1.435128783349627e+30), ('ENTERTAINMENT', 4.183287226579558e+31), ('FOOD & DRINK', 7.780630034910895e+31), ('STYLE & BEAUTY', 4.441194284951859e+31), ('RELIGION', 3.870496157457652e+32), ('DIVORCE', 8.550594039389763e+31), ('ARTS & CULTURE', 1.6480965735584998e+32), ('WEIRD NEWS', 1.76

In [117]:
test

Unnamed: 0,index,link,headline,category,short_description,authors,date
0,161038,https://www.huffingtonpost.comhttp://www.nytim...,"As Pay Cards Replace Paychecks, Bank Fees Hurt...",BUSINESS & FINANCES,A growing number of American workers are confr...,,2013-06-30
1,24711,https://www.huffingtonpost.com/entry/aung-san-...,Aung San Suu Kyi's Speech On The Rohingya Cris...,WORLD NEWS,Imagine there were an ongoing humanitarian cri...,"Michael Shammas, ContributorWriter & Lawyer",2017-09-20
2,54649,https://www.huffingtonpost.com/entry/calls-to-...,Calls To Decriminalize Sex Work Are Growing Lo...,WORLD NEWS,As South Africa’s government debates whether o...,"Jen Thorpe, Women & Girls Hub",2016-10-07
3,199823,https://www.huffingtonpost.com/entry/fico-8-co...,"FICO 8, Not FICO, Is Used In Recent Credit Com...",BUSINESS & FINANCES,And the credit industry is not helping their o...,Catherine New,2012-05-11
4,19088,https://www.huffingtonpost.com/entry/18-year-o...,18-Year-Old Confesses To Molesting 'Upwards Of...,CRIME,Joseph Hayden Boston said he was 10 years old ...,Dominique Mosbergen,2017-12-04
...,...,...,...,...,...,...,...
41901,28017,https://www.huffingtonpost.com/entry/birth-con...,Birth Control Now Available Without Doctor Vis...,WELLNESS,Women in Colorado can now buy birth control wi...,"David Pakman, ContributorHost & Executive Prod...",2017-08-09
41902,20417,https://www.huffingtonpost.com/entry/larry-dav...,Larry David Shows His Enthusiasm Reading Mean ...,COMEDY,"This is pretty, pretty funny.",Lee Moran,2017-11-15
41903,130323,https://www.huffingtonpost.com/entry/let-the-c...,Let The Controversy Commence!,EDUCATION,"History shows us that critics--outliers, whist...","Timothy Patrick McCarthy, Contributor",2014-05-28
41904,69773,https://www.huffingtonpost.com/entry/duke-univ...,Duke University Urges Repeal Of North Carolina...,POLITICS,The renowned university says HB 2 is causing p...,Tyler Kingkade,2016-04-18


In [122]:
category_list = {}

for k in tqdm(range(len(test))):
    category_list[test.iloc[k]['headline'] + test.iloc[k]['short_description']] = test.iloc[k]['category']

100%|██████████| 41906/41906 [00:09<00:00, 4250.01it/s]


In [123]:
len(test)

41906

In [124]:
len(category_list)

41852

In [126]:
correct = 0

for key, value in tqdm(category_list.items()):
    pred, scorelist = Nb.predict(key)
    
    if(pred == value):
        correct += 1

acc = correct/len(test)

100%|██████████| 41852/41852 [1:50:43<00:00,  6.30it/s]  


In [127]:
correct

1937

In [128]:
print("Acc: ", acc)

Acc:  0.04622249797165084


In [81]:
test['category'].value_counts()

POLITICS               7173
WELLNESS               4874
ENTERTAINMENT          3512
PARENTING              2631
STYLE & BEAUTY         2452
GROUPS VOICES          2407
TRAVEL                 1967
WORLD NEWS             1842
FOOD & DRINK           1702
BUSINESS & FINANCES    1518
SPORTS                 1072
COMEDY                 1055
SCIENCE & TECH          882
HOME & LIVING           869
ENVIRONMENT             841
ARTS & CULTURE          750
WEDDINGS                734
WOMEN                   712
DIVORCE                 704
CRIME                   687
IMPACT                  678
MISCELLANEOUS           554
WEIRD NEWS              552
MEDIA                   549
RELIGION                513
EDUCATION               410
U.S. NEWS               266
Name: category, dtype: int64

# Evaluation

In [None]:
def true_positive(y_true, y_pred):
    
    tp = 0
    
    for yt, yp in zip(y_true, y_pred):
        
        if yt == 1 and yp == 1:
            tp += 1
    
    return tp

def true_negative(y_true, y_pred):
    
    tn = 0
    
    for yt, yp in zip(y_true, y_pred):
        
        if yt == 0 and yp == 0:
            tn += 1
            
    return tn

def false_positive(y_true, y_pred):
    
    fp = 0
    
    for yt, yp in zip(y_true, y_pred):
        
        if yt == 0 and yp == 1:
            fp += 1
            
    return fp

def false_negative(y_true, y_pred):
    
    fn = 0
    
    for yt, yp in zip(y_true, y_pred):
        
        if yt == 1 and yp == 0:
            fn += 1
            
    return fn

In [None]:
def macro_precision(y_true, y_pred):

    num_classes = len(y_true.unique())
    
    precision = 0
    
    for classes in tqdm(list(y_true.unique())):
        
        temp_true = [1 if p == classes else 0 for p in y_true]
        temp_pred = [1 if p == classes else 0 for p in y_pred]
        
        tp = true_positive(temp_true, temp_pred)
        
        fp = false_positive(temp_true, temp_pred)
        
        temp_precision = tp / (tp + fp + 1e-6)
        precision += temp_precision
    
    precision /= num_classes
    
    return precision

In [None]:
def macro_recall(y_true, y_pred):

    num_classes = len(y_true.unique())

    recall = 0

    for classes in tqdm(list(y_true.unique())):
        
        temp_true = [1 if p == classes else 0 for p in y_true]
        temp_pred = [1 if p == classes else 0 for p in y_pred]
        
        
        tp = true_positive(temp_true, temp_pred)
        
        fn = false_negative(temp_true, temp_pred)
        
        
        temp_recall = tp / (tp + fn + 1e-6)
        
        recall += temp_recall
        
    recall /= num_classes
    
    return recall