# <center> News Classification with NLP and Neural Networks </center>

### Imports

In [1]:
import warnings
import numpy as np
import pandas as pd
import re
import plotly.express as px
import plotly.graph_objects as go
warnings.filterwarnings('ignore')

### Data

In [2]:
# The dataset is stored as JSON Lines (one article record per line),
# which is why `lines=True` is required.
DATA_PATH = 'News_Category_Dataset_v2.json'
df = pd.read_json(DATA_PATH, lines=True)

# Peek at a handful of random rows.
df.sample(5)

Unnamed: 0,category,headline,authors,link,short_description,date
27988,PARENTS,Why I Called 911 On My Autistic Son (And I’d D...,"Teresa Cooper, ContributorTeresa is a wife, m...",https://www.huffingtonpost.com/entry/why-i-cal...,It was not a beautiful day in the neighborhood...,2017-04-29
41044,LATINO VOICES,What Mexicans Have To Say To Americans Worried...,Carolina Moreno,https://www.huffingtonpost.com/entry/what-mexi...,Vox's Liz Plank visited Puebla to get some fre...,2016-12-01
30169,HEALTHY LIVING,Online Therapy Necessary To Address Growing Me...,,https://www.huffingtonpost.comhttp://www.reute...,"As of now, the global mental health system is ...",2017-04-04
110154,GOOD NEWS,Hong Kong Protester Proposes To Girlfriend Ami...,Sarah Barness,https://www.huffingtonpost.com/entry/hong-kong...,,2014-10-06
152267,HOME & LIVING,Weekly Roundup of eBay Vintage Home Finds (PHO...,"Mary Kincaid, Contributor\nFounder and Editor ...",https://www.huffingtonpost.com/entry/weekly-ro...,Want to add some fab eco-friendly style to you...,2013-07-01


### --- Data Thoughts ---
- Should I combine the headline + authors + short_description?
    - If not, build 3 different models and combine their probabilities?
- Should I use date?

# Cleaning

#### Check data types

#### Check NaNs 

#### Check Duplicates

In [3]:
# Report how many fully identical rows exist before removing them.
n_duplicate_rows = df.duplicated().sum()
print(n_duplicate_rows)

# Keep the first occurrence of each duplicated row.
df = df.drop_duplicates()

# Sanity check: the count shown as the cell output should now be 0.
df.duplicated().sum()

13


0

### Dates

In [None]:
# Reduce the publication timestamp to its calendar year.
# The dtype guard keeps this cell re-runnable: after the first execution
# `date` already holds integer years and no longer exposes the `.dt`
# accessor, so an unguarded second run would raise AttributeError.
if pd.api.types.is_datetime64_any_dtype(df['date']):
    df['date'] = df['date'].dt.year

#### Authors 
- The `authors` field is a comma-separated string containing:
    - Name(s)
    - Titles
    - Organizations
    - Misc comments
- It also contains many NaNs in the form of empty strings
- Approach:
    - Replace NaNs with 'unknown'
    - Extract author names from the field, create new field named `author_names`
    - Leave the rest of the information in a new field named `author_notes`

### Author Names

In [6]:
# Articles without a byline store an empty string in `authors`;
# label exactly those rows 'unknown' and leave every other value as-is.
has_no_author = df['authors'] == ''
df['authors'] = df['authors'].mask(has_no_author, 'unknown')

In [7]:
def _extract_author_names(authors):
    """Return the list of author names found in a raw `authors` string.

    Steps:
      1. Remove a leading "By" byline prefix. Only the prefix is removed;
         a blanket ``replace('By', '')`` would also mangle names that
         merely contain those letters (e.g. "Byron" -> "ron").
      2. Keep the text before the first comma (the names segment).
      3. Title-case it and split co-authors on " And ".

    Parameters
    ----------
    authors : str
        Raw value of the `authors` column.

    Returns
    -------
    list of str
        One entry per author name, whitespace-trimmed.
    """
    without_byline = re.sub(r'^\s*By\b\s*', '', authors).strip()
    names_segment = without_byline.split(',')[0].title()
    return [name.strip() for name in names_segment.split(' And ')]


# Get author names from the raw `authors` string (one list per article).
df['author_names'] = df['authors'].apply(_extract_author_names)

In [8]:
def _extract_author_notes(authors):
    """Return everything after the first comma of a raw `authors` string.

    The text before the first comma is treated as the author name(s); the
    remaining comma-separated segments are concatenated into one notes string.

    Only a leading "By" byline prefix is removed. A blanket
    ``replace('By', '')`` would also delete those letters from inside the
    notes text itself.

    Parameters
    ----------
    authors : str
        Raw value of the `authors` column.

    Returns
    -------
    str
        The notes text, or an empty string when there is no comma.
    """
    without_byline = re.sub(r'^\s*By\b\s*', '', authors)
    # Newlines are removed, then a space is re-inserted after "Contributor"
    # so that "Contributor\nFounder" does not collapse into one word.
    normalized = (
        without_byline
        .replace('\n', '')
        .replace('Contributor', 'Contributor ')
        .strip()
    )
    trailing_segments = normalized.split(',')[1:]
    return ''.join(trailing_segments).strip()


df['author_notes'] = df['authors'].apply(_extract_author_notes)

### Links
- The links are not helpful in their current form; we need to extract keywords from them

In [9]:
# Hyphens and underscores both act as word separators inside the URL slug.
_LINK_SEPARATORS = str.maketrans({'-': ',', '_': ','})


def _link_to_keywords(link):
    """Split an article URL into its slug words.

    Hyphens, underscores and the "entry/" path marker are all turned into
    commas before splitting. The first piece (everything before the slug)
    and the last two pieces are discarded.
    """
    pieces = link.translate(_LINK_SEPARATORS).replace("entry/", ',').split(',')
    return pieces[1:-2]


df['link_keywords'] = df['link'].apply(_link_to_keywords)
df['link_keywords'].sample(5)

72551     [police, officer, comforts, dying, horse]
35404                      [drunk, feminist, films]
31696         [donald, trump, refugee, ban, ruling]
185711                 [princess, names, fictional]
192188                      [technology, parenting]
Name: link_keywords, dtype: object

### Drop unwanted features

In [10]:
# `link` and `authors` have been replaced by derived columns.
# errors='ignore' skips any column that is already gone, so re-running
# this cell is harmless.
df = df.drop(columns=['link', 'authors'], errors='ignore')
df.sample(5)

Unnamed: 0,category,headline,short_description,date,author_names,author_notes,link_keywords
167925,WELLNESS,"Mental Illness, Patient Confidentiality and Gu...",The requirement to discretely disclose individ...,2013,[Dj Jaffe],Contributor Exec. Dir. Mental Illness Policy O...,"[mental, illness, gun, control]"
98600,RELIGION,Episcopal Church Takes A Hard Look At Alcohol ...,,2015,[Unknown],,"[episcopal, church, alcohol]"
150441,TECH,Sprint and IBM Create Platform for Car Apps,"If you hate the idea of red-light cameras, wai...",2013,[Larry Magid],Contributor Technology journalist,"[sprint, and, ibm, create, pla]"
113171,ARTS,Conor Walton: Contemplating Higher Things,"Conor Walton, one of Ireland's leading represe...",2014,[John Seed],Contributor Professor of Art and Art History M...,"[conor, walton, contemplatin]"
132044,DIVORCE,8 Cocktails To Toast Your Terrible Ex (NSFW),Keep in touch! Check out HuffPost Divorce on F...,2014,[Unknown],,"[divorce, party]"


# <center> -------------------------------------------------------------------- </center>

# EDA

#### View categories

In [None]:
# Number of articles per category, most frequent first.
category_counts = df['category'].value_counts()

print(f"There are {len(category_counts)} unique categories including the following:")

# Bar chart of articles per category.
px.bar(
    category_counts,
    title='Unique New Categories',
    labels={"value": "Number of Articles", "index": "Category"},
    width=1700,
    height=600,
)

#### View length of headlines

In [None]:
# Character count of every headline.
headline_lengths = df['headline'].map(len)

# Histogram of headline lengths with a box plot in the margin.
px.histogram(
    df,
    x=headline_lengths,
    marginal='box',
    title='Headline Lengths',
    labels={'x': 'Number of Characters'},
    height=400,
    width=700,
)

### Author Activity

In [None]:
# One row per (article, author) pair, then count articles per author.
unique_authors_vc = df['author_names'].explode().value_counts()

# Look the placeholder up by label rather than assuming it is the first
# (most frequent) entry; integer-position lookups on a label-indexed Series
# are also deprecated in pandas.
unknown_articles = int(unique_authors_vc.get('Unknown', 0))
known_authors_vc = unique_authors_vc.drop('Unknown', errors='ignore')

# Share of *articles* with no known author, as a real percentage.
total_articles = len(df)
unknown_share = 100 * unknown_articles / total_articles if total_articles else 0.0

print(
    f"There are {len(known_authors_vc)} unique known authors; "
    f"{unknown_articles} articles ({unknown_share:.2f}%) have an unknown author."
)

# Show the 24 most active known authors.
px.bar(known_authors_vc.head(24),
       title='Unique Authors',
       labels = {"value": "Number of Articles Written","index": "Author"},
       width = 1200, height = 600)

# <center> -------------------------------------------------------------------- </center>

# Preprocessing Text Data
- Lower all words
- Handle misspellings?
- Stop words
- Stem / Lemmatize text
- Tokenization or specialized regex?
- Use all words or just most frequent?
- Use bigrams, POS tagging, mutual information scores?
- What sort of vectorization? (Boolean / Count / TF-IDF / Word2Vec)

In [11]:
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer

### Copy DataFrame to Preserve Original

In [12]:
# Work on a copy so the cleaned `df` stays available with its raw text.
# The copy must be bound to `df2`; the tokenization cells below write to it.
df2 = df.copy()
df2.head()

Unnamed: 0,category,headline,short_description,date,author_names,author_notes,link_keywords
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,She left her husband. He killed their children...,2018,[Melissa Jeltsen],,"[texas, amanda, painter, mass, shooting]"
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Of course it has a song.,2018,[Andy Mcdonald],,"[will, smith, joins, diplo, and, nicky, jam, f..."
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,The actor and his longtime girlfriend Anna Ebe...,2018,[Ron Dicker],,"[hugh, grant, marries]"
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,The actor gives Dems an ass-kicking for not fi...,2018,[Ron Dicker],,"[jim, carrey, adam, schiff, democrats]"
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,"The ""Dietland"" actress said using the bags is ...",2018,[Ron Dicker],,"[julianna, margulies, trump, poop, bag]"


### Get Stopwords

In [13]:
# Quote and ellipsis tokens to filter out in addition to single
# punctuation characters.
extra_noise_tokens = ["''", '""', '...', '``']

# English stopwords + every single punctuation character + the extras above.
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    *extra_noise_tokens,
]

### Tokenize
- Create function then use apply on DataFrame

In [14]:
def tokenize_text(input_text):
    """Tokenize a string and return its lower-cased non-stopword tokens.

    The text is split with ``nltk.word_tokenize``. Each token is lower-cased
    and kept only if it does not appear in the module-level
    ``stopwords_list``.

    Parameters
    ----------
    input_text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order, stopwords removed.
    """
    kept_tokens = []
    for raw_token in nltk.word_tokenize(input_text):
        lowered = raw_token.lower()
        if lowered not in stopwords_list:
            kept_tokens.append(lowered)
    return kept_tokens

In [15]:
%%time
df2['headline'] = df['headline'].apply(tokenize_text)
df2['short_description'] = df['short_description'].apply(tokenize_text)
df2['author_notes'] = df['author_notes'].apply(tokenize_text)
df2.head()

CPU times: user 1min 7s, sys: 394 ms, total: 1min 7s
Wall time: 1min 8s


Unnamed: 0,category,headline,short_description,date,author_names,author_notes,link_keywords
0,CRIME,"[2, mass, shootings, texas, last, week, 1, tv]","[left, husband, killed, children, another, day...",2018,[Melissa Jeltsen],[],"[texas, amanda, painter, mass, shooting]"
1,ENTERTAINMENT,"[smith, joins, diplo, nicky, jam, 2018, world,...","[course, song]",2018,[Andy Mcdonald],[],"[will, smith, joins, diplo, and, nicky, jam, f..."
2,ENTERTAINMENT,"[hugh, grant, marries, first, time, age, 57]","[actor, longtime, girlfriend, anna, eberstein,...",2018,[Ron Dicker],[],"[hugh, grant, marries]"
3,ENTERTAINMENT,"[jim, carrey, blasts, 'castrato, adam, schiff,...","[actor, gives, dems, ass-kicking, fighting, ha...",2018,[Ron Dicker],[],"[jim, carrey, adam, schiff, democrats]"
4,ENTERTAINMENT,"[julianna, margulies, uses, donald, trump, poo...","[dietland, actress, said, using, bags, really,...",2018,[Ron Dicker],[],"[julianna, margulies, trump, poop, bag]"


### View total vocabulary

In [16]:
# Flatten the per-headline token lists into one long list of words.
all_words = []
for headline_tokens in df2['headline']:
    all_words.extend(headline_tokens)

# Distinct vocabulary across all headlines.
unique_words = set(all_words)
print(f'There are {len(all_words)} total words and {len(unique_words)} unique words')

There are 1396311 total words and 71377 unique words


In [17]:
# Columns holding token lists, in the order they are concatenated.
token_columns = ['headline', 'short_description', 'author_names',
                 'author_notes', 'link_keywords']

df3 = df2.copy()

# Row-wise list concatenation: `+` on two Series of lists joins the lists.
combined_tokens = df3[token_columns[0]]
for column_name in token_columns[1:]:
    combined_tokens = combined_tokens + df3[column_name]
df3['text'] = combined_tokens

# Keep only the label and the merged token list.
df3 = df3[['category', 'text']]
df3.sample(5)

Unnamed: 0,category,text
153955,WELLNESS,"[statins, cancer, questions, answers, present,..."
59312,POLITICS,"[trump, ’, think, republican, party, ‘, unifie..."
145580,PARENTING,"[study, shows, dangers, teens, face, using, te..."
38735,TASTE,"[one, resolution, could, help, save, money, ea..."
145011,WELLNESS,"[knocking, heaven, 's, door, near-death, exper..."
