# <center> News Classification with NLP and Neural Networks </center>

### Imports

In [1]:
import warnings
import numpy as np
import pandas as pd
import re
import plotly.express as px
from sklearn import preprocessing
import plotly.graph_objects as go
warnings.filterwarnings('ignore')

### Data
- Import JSON file
- Limit to top 5 categories

In [2]:
# Read in json file to DataFrame
df = pd.read_json('News_Category_Dataset_v2.json', lines=True)

# Limit to the top 5 categories
df = df[df['category'].isin(['POLITICS','WELLNESS','ENTERTAINMENT','TRAVEL','STYLE & BEAUTY'])]

# View Results
print(df.shape)
df.sample(5)

(86160, 6)


Unnamed: 0,category,headline,authors,link,short_description,date
6825,ENTERTAINMENT,Try Not To Cry At This Surprise Military Reuni...,Lee Moran,https://www.huffingtonpost.com/entry/ellen-mil...,Seriously. This gets emotional.,2018-01-26
74102,ENTERTAINMENT,Kylie Jenner's Snapchats May Be A Message To T...,Bill Bradley,https://www.huffingtonpost.com/entry/kylie-jen...,"Kylie is moving on, and she wants Tyga to know...",2015-11-21
31126,POLITICS,Senate Republicans Vote To Overturn Internet P...,"David Shepardson, Reuters",https://www.huffingtonpost.com/entry/senate-re...,The vote was a victory for internet providers ...,2017-03-23
29566,POLITICS,Nancy Pelosi Calls For Sean Spicer's Ouster Am...,Igor Bobic,https://www.huffingtonpost.com/entry/nancy-pel...,"""Either he is speaking for the president, or t...",2017-04-11
60426,POLITICS,Women Claim They Were Kicked Out Of A Cafe For...,Christopher Mathias,https://www.huffingtonpost.com/entry/muslim-wo...,"""Beautiful location, mediocre boba, all served...",2016-04-26


# Cleaning

#### Check data types

#### Check NaNs 

#### Check Duplicates

In [3]:
print(df.duplicated().sum())
df = df.drop_duplicates()
df.duplicated().sum()

7


0

### Dates

In [4]:
df['date'] = df['date'].dt.year

#### Authors 
- The `authors` field is a list containing:
    - Name(s)
    - Titles
    - Organizations
    - Misc comments
- It also contains many NaNs in the form of empty strings
- Approach:
    - Replace NaNs with 'unknown'
    - Extract author names from the field, create new field named `author_names` to be used as a bigram
    - Leave the rest of the information in a new field named `author_notes`

### Author Names

In [5]:
# Replace missing authors with 'unknown'
df['authors'] = df['authors'].apply(lambda x: x.replace('','unknown') if x == '' else x)

In [6]:
# Get author names from list
# Split, get first, title(), split 
df['author_names'] = df['authors'].apply(lambda x: x.replace('By','').strip().split(',')[0].title().split(' And '))

In [7]:
# Store the rest of the authors field without the names into a new field
df['author_notes'] = df['authors'].apply(lambda x: ''.join(x.replace('By','').replace('\n','').replace('Contributor','Contributor ').strip().split(',')[1:]).strip())

# Type cast from list to string
df['author_notes'] = df['author_notes'].astype(str)

### Links
- The links are not helpful in their current form, need to extract keyworks from them

In [8]:
# Replace charaters we want to split on with commas, then split, only get the relevant entries from the resulting list
df['link_keywords'] = df['link'].apply(lambda x: x.replace('-',',').replace("_",',').replace("entry/",',').split(',')[1:-2])

# Typecast from list to string
df['link_keywords'] = df['link_keywords'].apply(lambda x: ' '.join(x))

# View Sample
df['link_keywords'].sample(5)

47511                 donald trump skittle photo refugee
21034       donald trump marshall mcluhan and healthcare
165224                                             lying
70862     benedict cumberbatch doctor strange first look
125437                         beyonce solange coachella
Name: link_keywords, dtype: object

### Join all text columns
- Leave `author_names` until after tokenization to preserve full name

In [10]:
df['text'] = df['headline'] + ' ' + df['short_description'] + ' ' + df['link_keywords'] + ' ' + df['author_notes'] + ' ' + df['date'].astype(str)

### Drop unwanted features

In [11]:
# Drop the feature if it appear in the colums (done this way so the cell can be re-run)
df = df.drop(columns=[col for col in df.columns if col in ['link','authors','headline','short_description','date','link_keywords','author_notes']])

# View Sample
df.sample(5)

Unnamed: 0,category,author_names,text
140168,WELLNESS,[Dawn Jackson Blatner],The Great Pumpkin Guide Although I am a huge f...
187211,WELLNESS,[Gangaji],"Freedom in Prison: Grace Is Here, Too We can e..."
26983,POLITICS,[Lydia O'Connor],ACLU Issues Travel Warning For Texas Amid New ...
115350,ENTERTAINMENT,[Leigh Blickley],The Worst Onscreen Portrayals Of Famous Book C...
177325,WELLNESS,[Pavel Somov],A Reinvented Meal: First Course -- Relaxation ...


# <center> -------------------------------------------------------------------- </center>

# EDA

#### View categories

In [None]:
print(f"There are {len(df['category'].value_counts())} unique categories including the following:")
px.bar(df['category'].value_counts(), 
       title='Unique New Categories', 
       labels = {"value": "Number of Articles","index": "Category"},
       width = 800, height = 450)

#### View length of texts

In [None]:
headline_lengths = df['text'].apply(lambda x: len(x))
px.histogram(df, 
             x = headline_lengths, 
             marginal = 'box', 
             title = 'Text Lengths',
             labels = {'x':'Number of Characters'},
             color = 'category',
             height = 500, width = 800)

### Author Activity

In [None]:
unique_authors_vc = df['author_names'].explode().value_counts()

print(f"There are {len(unique_authors_vc)} unique authors, {unique_authors_vc[0]} ({round(unique_authors_vc[0]/len(unique_authors_vc),2)}%) are unknown." )

px.bar(unique_authors_vc[1:25], 
       title='Unique Authors', 
       labels = {"value": "Number of Articles Written","index": "Author"},
       width = 1200, height = 600)

In [11]:
df

Unnamed: 0,category,author_names,text
1,ENTERTAINMENT,[Andy Mcdonald],Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,[Ron Dicker],Hugh Grant Marries For The First Time At Age 5...
3,ENTERTAINMENT,[Ron Dicker],Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,[Ron Dicker],Julianna Margulies Uses Donald Trump Poop Bags...
5,ENTERTAINMENT,[Ron Dicker],Morgan Freeman 'Devastated' That Sexual Harass...
...,...,...,...
200804,TRAVEL,[Stefani Jackenthal],Tea Time Beneath The Western Ghats (PHOTOS) As...
200805,WELLNESS,[Ellie Knaus],This Is Only the Beginning: Surprising Advice ...
200806,STYLE & BEAUTY,[Sarah Leon],"Cheryl Tiegs In A Sauna: A Look Back ""A Look B..."
200838,ENTERTAINMENT,[Courtney Garcia],"Sundance, Ice-T, and Shades of the American Ra..."


# <center> -------------------------------------------------------------------- </center>

# Preprocessing Text Data
- Lower all words
- Handle misspellings?
- Stop words
- Stem / Lemmatize text
- Tokenization or specialized regex?
- Use all words or just most frequent?
- Use bigrams, POS taggins, Mutual information Scores?
- What sort of vectorization? (Boolean / Count / TF-IDF / Word2Vec)

In [15]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer

### Encode Targets

In [17]:
le = preprocessing.LabelEncoder()

df['class_label'] = le.fit_transform(df['category'])

### Get Stopwords

In [18]:
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``']

### Tokenize
- Create function then use apply on DataFrame
- The function will
    - Accept an input text
    - Tokenize using nltk
    - Lower the words
    - Remove stop words

In [19]:
def tokenize_text(input_text):
    tokens = nltk.word_tokenize(input_text)
    lower_and_stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return lower_and_stopwords_removed

In [20]:
%%time
df['tokens'] = df['text'].apply(tokenize_text)
df.sample(5)

CPU times: user 28.4 s, sys: 63.3 ms, total: 28.5 s
Wall time: 28.6 s


Unnamed: 0,category,author_names,text,class_label,tokens
193502,WELLNESS,[Janice Van Dyck],"When Your Loved Ones Hold Your Life in Their Hands National Healthcare Decisions Day (NHDD) falls conveniently every April 16 so you can deal with the difficult matters of death and taxes all in one week. NHDD is one day when we're asked to put our own discomfort aside and think about the loved ones we leave behind. advanced care directive Contributor Author ""Finding Frances"" 2012",4,"[loved, ones, hold, life, hands, national, healthcare, decisions, day, nhdd, falls, conveniently, every, april, 16, deal, difficult, matters, death, taxes, one, week, nhdd, one, day, 're, asked, put, discomfort, aside, think, loved, ones, leave, behind, advanced, care, directive, contributor, author, finding, frances, 2012]"
157637,ENTERTAINMENT,[Unknown],"Mark Hamill On The 'Star Wars' Franchise At CapeTown Film Fest This year, EW celebrated May the Fourth with a full day of ""Return of the Jedi,"" screening the end of the first (and definitely wars mark hamill 2013",0,"[mark, hamill, 'star, wars, franchise, capetown, film, fest, year, ew, celebrated, may, fourth, full, day, return, jedi, screening, end, first, definitely, wars, mark, hamill, 2013]"
180458,WELLNESS,[Unknown],"Yoga At The Huffington Post's Oasis: Attendees Encouraged To Take A Class (PHOTOS) The benefits of yoga are plenty: from managing anxiety to combatting back pain, it's no wonder attendees of The Huffington yoga oasis off the mat 2012",4,"[yoga, huffington, post, 's, oasis, attendees, encouraged, take, class, photos, benefits, yoga, plenty, managing, anxiety, combatting, back, pain, 's, wonder, attendees, huffington, yoga, oasis, mat, 2012]"
77988,ENTERTAINMENT,[Carly Ledbetter],"The Backstreet Boys And The Spice Girls Might Reunite For An Epic Tour ""Oh my God, they're back again."" backstreet boy spice girls tour 2015",0,"[backstreet, boys, spice, girls, might, reunite, epic, tour, oh, god, 're, back, backstreet, boy, spice, girls, tour, 2015]"
142625,STYLE & BEAUTY,[Chris Formosa],"Add New Facets to Your Personality With Perfumes Obviously, there are plenty of perfumes characterized by an incredible versatility, but sometimes it's better to focus on the ones that do a single thing. add new facets to your pe Contributor Video Blogger FragranceTalk.com 2013",2,"[add, new, facets, personality, perfumes, obviously, plenty, perfumes, characterized, incredible, versatility, sometimes, 's, better, focus, ones, single, thing, add, new, facets, pe, contributor, video, blogger, fragrancetalk.com, 2013]"


### Add in author names to tokens

In [22]:
df['tokens'] = df['tokens'] + df['author_names']
df = df.drop(columns=['author_names'])
df = df[['category','class_label','text','tokens']]
df.sample(5)

Unnamed: 0,category,class_label,text,tokens
29862,POLITICS,1,CNN Analyst Compares Syria Strike To Kentucky Basketball Apparently they're two different things! james marks cnn syria basketball 2017,"[cnn, analyst, compares, syria, strike, kentucky, basketball, apparently, 're, two, different, things, james, marks, cnn, syria, basketball, 2017, Sebastian Murdock]"
5940,ENTERTAINMENT,0,"What Exactly Is 'The Bachelor Winter Games'? Contestant Lesley Murphy gives the inside scoop on this mysterious new ""Bachelor"" spinoff. the bachelor winter games Emma Gray and Nick Offenberg 2018","[exactly, 'the, bachelor, winter, games, contestant, lesley, murphy, gives, inside, scoop, mysterious, new, bachelor, spinoff, bachelor, winter, games, emma, gray, nick, offenberg, 2018, Claire Fallon]"
48590,POLITICS,1,"Harry Reid: 'Donald Trump Is An American Nightmare' ""He’s a bigot, he’s a scam artist, he won’t show us his tax return, and Sen. Grassley is holding the Supreme Court vacant for this man,"" Reid said. harry reid donald trump american nightmare 2016","[harry, reid, 'donald, trump, american, nightmare, ’, bigot, ’, scam, artist, ’, show, us, tax, return, sen., grassley, holding, supreme, court, vacant, man, reid, said, harry, reid, donald, trump, american, nightmare, 2016, Michael Mcauliff]"
87216,POLITICS,1,Even Ralph Nader And Grover Norquist Agree On Open Government open government 2015,"[even, ralph, nader, grover, norquist, agree, open, government, open, government, 2015, Alexander Howard]"
40380,POLITICS,1,"Senate Democrats Give Up On Coal Miner Health Benefits, Averting Government Shutdown (UPDATE) Sen. Joe Manchin said he hopes to enlist President-elect Donald Trump's help for the miners. government shutdown coal miners 2016","[senate, democrats, give, coal, miner, health, benefits, averting, government, shutdown, update, sen., joe, manchin, said, hopes, enlist, president-elect, donald, trump, 's, help, miners, government, shutdown, coal, miners, 2016, Laura Barrón-López]"


### View total vocabulary

In [31]:
all_words = [item for sublist in df['tokens'] for item in sublist]
unique_words = set(all_words)
print(f'There are {len(all_words)} total words and {len(unique_words)} unique words')

There are 2249440 total words and 91787 unique words


### Frequencies

In [34]:
articles_freqdist = FreqDist(all_words)
articles_freqdist.most_common(25)

[('contributor', 29753),
 ('trump', 21792),
 ('2012', 17807),
 ('2013', 17117),
 ('2016', 15152),
 ('2017', 14457),
 ('Unknown', 14204),
 ('2014', 11742),
 ('new', 10994),
 ('2015', 10895),
 ('photos', 8325),
 ('donald', 8096),
 ('one', 7004),
 ('author', 6154),
 ('health', 5716),
 ('time', 5561),
 ('people', 5402),
 ('travel', 5160),
 ('2018', 5104),
 ('us', 5090),
 ('like', 5084),
 ('life', 5067),
 ('president', 4847),
 ('day', 4810),
 ('world', 4630)]

- Many of these tokens such as "'s" are not meaningful
- Create a function to remove them

### Create Function to remove meaningless tokens

In [29]:
tokens_to_remove = ["'s","n't","’","--","'re","“","'ve"]

df['tokens'] = df['tokens'].apply(lambda x: [i for i in x if i not in tokens_to_remove])

### Bag of Words Counts (Count Vectorization)
- The count vectorizer performs tokenization on its own (why do we do it also?)

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def c_vectorize(tokens):
    count_vectorizer = CountVectorizer()

    results = count_vectorizer.fit_transform(tokens)

    return results, count_vectorizer

X = df["text"].tolist()
y = df["class_label"].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

X_train_counts, count_vectorizer = c_vectorize(X_train)
X_test_counts = count_vectorizer.transform(X_test)

### Model: Multinomial Bayes

### Evaluation

In [69]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)
y_predicted_counts = clf.predict(X_test_counts)

In [70]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):  
    # true positives / (true positives+false positives)
    precision = precision_score(y_test, y_predicted, pos_label=None,
                                    average='weighted')             
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, pos_label=None,
                              average='weighted')
    
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted')
    
    # true positives + true negatives/ total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_counts)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.936, precision = 0.936, recall = 0.936, f1 = 0.936
