In [228]:
import json
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import numpy as np

# Load the line-delimited JSON dataset, then keep a random 1% sample of its rows
print("Loading file...")
f = pd.read_json('News_Category_Dataset_v2.json', lines=True)
df = f.sample(frac=0.01)

# Resources for the cleaning step below:
#   * stop_words - NLTK's English stop-word list plus "said", which is common in this
#     data but missing from the NLTK corpus
#   * ENGLISH_RE - matches a run of lowercase ASCII letters
stop_words = [*stopwords.words("english"), 'said']
ENGLISH_RE = re.compile(r'[a-z]+')

# Concatenate the headline and short description, then keep only the columns we care about.
# Missing text is treated as an empty string so the concatenation never produces NaN.
print("Formatting content...")
df = df.assign(content=(df["headline"].fillna("") + " " + df["short_description"].fillna("")))
df = df[["category", "content"]]

# Remove stop words, numbers, and punctuation.
# Note: hyphens and dashes are turned into spaces before the remaining punctuation is
# stripped; removing them outright would glue the words on either side together.
_DASH_RE = re.compile(r'[-\u2013\u2014]+')    # ASCII hyphen, en dash, em dash
_NON_LETTER_RE = re.compile(r'[^a-z\s]+')     # anything that is not a-z or whitespace
_stop_word_set = set(stop_words)              # set membership instead of a list scan per word


def _clean_content(text):
    """Return ``text`` lowercased, with stop words, numbers and punctuation removed.

    A whitespace-separated word is kept only if it starts with a lowercase
    ASCII letter once lowercased. Stop words are filtered twice: once on the
    word with surrounding punctuation trimmed (so "the," and "don't." are
    recognised), and once on every piece left after hyphen splitting and
    punctuation removal (so the "of" in "state-of-the-art" is dropped too).

    Args:
        text: Raw article text (headline plus short description).

    Returns:
        The surviving tokens joined by single spaces; possibly an empty string.
    """
    kept = []
    for raw_word in text.split():
        lowered = raw_word.lower()
        if not ENGLISH_RE.match(lowered):
            continue
        if lowered.strip(string.punctuation) in _stop_word_set:
            continue
        cleaned = _NON_LETTER_RE.sub('', _DASH_RE.sub(' ', lowered))
        kept.extend(piece for piece in cleaned.split() if piece not in _stop_word_set)
    return ' '.join(kept)


# assign() builds a new frame, avoiding a write into the column slice taken above
df = df.assign(content=df['content'].apply(_clean_content))

# Tokenize words
print("Tokenizing words...")
# One row per (article, token). str.split() without a separator splits on any run of
# whitespace, so doubled spaces in the cleaned text never yield empty-string "words".
_tokens = (
    df[['category']]
    .assign(word=df['content'].str.split())
    .explode('word')
    .dropna(subset=['word'])  # articles with no tokens explode to a single NaN row
)
# Number of occurrences of each word within each category
_counts = _tokens.groupby(['category', 'word']).size()

# Fill remaining words with zero values: every category gets a row for every word of
# the vocabulary, including words that never occur in that category.
_all_pairs = pd.MultiIndex.from_product(
    [sorted(df['category'].unique()), sorted(_tokens['word'].unique())],
    names=['category', 'word'],
)
# Result columns: category, word, freq - ordered by category, then word
word_freq = _counts.reindex(_all_pairs, fill_value=0).reset_index(name='freq')

# Laplacian Smoothing
word_freq['freq'] += 1

# Predict Categories for Articles
def p_word_given_cat(category_filter, word):
    """Look up the stored probability of ``word`` in a per-category frame.

    Args:
        category_filter: DataFrame with at least a ``word`` column and a
            ``p_given_cat`` column.
        word: The word to look up.

    Returns:
        The ``p_given_cat`` value of the first row whose ``word`` equals
        ``word``.

    Raises:
        IndexError: If no row matches ``word``.
    """
    is_match = category_filter['word'].eq(word)
    first_hit = category_filter[is_match].iloc[0]
    return first_hit['p_given_cat']

print("Predicting Categories (this is going to be a while)...")
df['content'] = df['content'].str.split()
df = df.reset_index(drop = True)

# P(word | category): the smoothed count of the word divided by the smoothed count of
# ALL words in that category, so the values of each category sum to 1. Dividing by the
# number of articles instead would not give probabilities and would favour categories
# that have few articles.
category_totals = word_freq.groupby('category')['freq'].sum()
word_freq['p_given_cat'] = word_freq['freq'] / word_freq['category'].map(category_totals)

categories = df['category'].unique()
# P(category): the share of articles that belong to each category
category_priors = df['category'].value_counts(normalize=True)

# One row per (article, word). Each distinct word of an article is scored once, so
# repeated (article, word) pairs are dropped. The index of article_words is the
# article's row label in df.
articles_expanded = df.explode('content')
article_words = (
    articles_expanded['content']
    .dropna()  # articles with no words explode to a single NaN row
    .rename_axis('article')
    .reset_index()
    .drop_duplicates()
    .set_index('article')['content']
)

# Scores are accumulated as log-probabilities: a product of many small probabilities
# can underflow to 0.0, while the sum of their logarithms keeps the same ordering.
scores = {}
for curr, category in enumerate(categories, start=1):
    print(f"Calculating category probability: {curr}/{categories.size} - {category}")
    category_filter = word_freq.loc[word_freq['category'] == category]
    log_p_word = np.log(category_filter.set_index('word')['p_given_cat'])
    # Words missing from the table map to NaN, which sum() skips
    log_likelihood = article_words.map(log_p_word).groupby(level=0).sum()
    # Articles without any word get a likelihood of log(1) = 0, i.e. only the prior counts
    scores[category] = (
        log_likelihood.reindex(df.index, fill_value=0.0) + np.log(category_priors[category])
    )

# Pick the best-scoring category per article; columns are sorted so ties are broken
# alphabetically by category name
scores = pd.DataFrame(scores).sort_index(axis=1)
df['predicted_category'] = scores.idxmax(axis=1)
df = df[['category', 'content', 'predicted_category']].reset_index(drop = True)

print(df.head(50))

Loading file...
Formatting content...
Tokenizing words...
Predicting Categories (this is going to be a while)...
Calculating category probability: 1/41 - PARENTING
Calculating category probability: 2/41 - TECH
Calculating category probability: 3/41 - BUSINESS
Calculating category probability: 4/41 - POLITICS
Calculating category probability: 5/41 - FOOD & DRINK
Calculating category probability: 6/41 - WELLNESS
Calculating category probability: 7/41 - HEALTHY LIVING
Calculating category probability: 8/41 - ENTERTAINMENT
Calculating category probability: 9/41 - MEDIA
Calculating category probability: 10/41 - FIFTY
Calculating category probability: 11/41 - PARENTS
Calculating category probability: 12/41 - STYLE
Calculating category probability: 13/41 - COMEDY
Calculating category probability: 14/41 - SPORTS
Calculating category probability: 15/41 - STYLE & BEAUTY
Calculating category probability: 16/41 - WORLDPOST
Calculating category probability: 17/41 - TASTE
Calculating category probab