# <center> News Classification with NLP and Neural Networks </center>

### Imports

In [1]:
import warnings
import numpy as np
import pandas as pd
import re
import plotly.express as px
warnings.filterwarnings('ignore')

### Data

In [38]:
df = pd.read_json('News_Category_Dataset_v2.json', lines=True)
df.sample(5)

Unnamed: 0,category,headline,authors,link,short_description,date
74161,POLITICS,Loretta Lynch: Tone Of Refugee Debate Goes Aga...,Ryan J. Reilly,https://www.huffingtonpost.com/entry/syrian-re...,"""Our values are not secondary considerations i...",2015-11-20
17976,GREEN,"After Harvey, Threat Of 'Dramatic Flooding' Co...","David Lohr, Kate Sheppard, Paige Lavender, Roq...",https://www.huffingtonpost.com/entry/harvey-sa...,One meteorologist estimated 20 trillion gallon...,2017-08-26
50697,SPORTS,Ryan Lochte Robbed At Gunpoint In Rio With 3 O...,Nina Golgowski,https://www.huffingtonpost.com/entry/ryan-loch...,"Lochte, who was held up with teammates Gunnar ...",2016-08-14
169966,HOME & LIVING,6 Weird Craigslist Ads From Across The Country...,Amy Marturana,https://www.huffingtonpost.com/entry/weird-cra...,And 5 other totally strange things you can buy...,2012-12-26
48847,POLITICS,"Donald Trump, The Most Dangerous Salesman In T...","Sydney Cicourel, ContributorI'm a blogger, a p...",https://www.huffingtonpost.com/entry/donald-tr...,He is just throwing things out there.,2016-09-04


# Cleaning

#### Check data types

#### Check NaNs 

#### Check Duplicates

In [39]:
print(df.duplicated().sum())
df = df.drop_duplicates()
df.duplicated().sum()

13


0

#### Authors 
- The `authors` field is a list containing:
    - Name(s)
    - Titles
    - Organizations
    - Misc comments
- It also contains many NaNs in the form of empty strings
- Approach:
    - Replace NaNs with 'unknown'
    - Extract author names from the field, create new field named `author_names`
    - Leave the rest of the information in a new field named `author_notes`

### Author Names

In [40]:
# Replace missing authors with 'unknown'
df['authors'] = df['authors'].apply(lambda x: x.replace('','unknown') if x == '' else x)

In [41]:
# Get author names from list
# Split, get first, title(), split 
df['author_names'] = df['authors'].apply(lambda x: x.replace('By','').strip().split(',')[0].title().split(' And '))

In [42]:
df['author_notes'] = df['authors'].apply(lambda x: ''.join(x.replace('By','').replace('\n','').replace('Contributor','Contributor ').strip().split(',')[1:]).strip())

### Links
- The links are not helpful in their current form, need to extract keyworks from them

In [45]:
df['link_keywords'] = df['link'].apply(lambda x: x.replace('-',',').replace("_",',').replace("entry/",',').split(',')[1:-2])
df['link_keywords'].sample(5)

14033                     [diversity, in, fashion]
30302         [dunkin, donuts, butter, settlement]
123403                     [john, kasich, primary]
91105              [dont, call, it, flesheating, ]
114276    [egypt, government, criticism, ferguson]
Name: link_keywords, dtype: object

In [46]:
df = df.drop(columns=[col for col in df.columns if col in ['link','authors']])
df.sample(5)

Unnamed: 0,category,headline,short_description,date,author_names,author_notes,link_keywords
200053,WELLNESS,A Secret to More Happiness and Energy? Give Yo...,"As a result of my happiness project, I've beco...",2012-02-06,[Gretchen Rubin],Contributor Writer The Happiness Project,"[adult, bedtime]"
196850,COMEDY,'Comedy Bang! Bang!' IFC Show 10-Minute Promo ...,The premise for the podcast is simple: Scott p...,2012-03-11,[Ross Luippold],,"[comedy, bang, bang, ifc, show]"
80137,BUSINESS,3 Tools To Make You Feel Better At Work,Because your office doesn't have to suck the l...,2015-09-15,[Alexander C. Kaufman],,"[workplace, wellness, tips]"
60939,POLITICS,"With His New York Walkover, Donald Trump Takes...",The GOP primary calendar is now working to the...,2016-04-20,[Scott Conroy],,"[new, york, donald, trump, gop, nomination]"
190313,WELLNESS,Preventing Degenerative Brain Disease in Our C...,If the experts in traumatic brain injury think...,2012-05-21,"[Anne Wojcicki, Linda Avey]",Contributor Contributor,"[brain, injury]"


# EDA

#### View categories

#### View length of headlines

### Author Activity

In [None]:
unique_authors_vc = df['author_names'].explode().value_counts()

print(f"There are {len(unique_authors_vc)} unique authors, {unique_authors_vc[0]} ({round(unique_authors_vc[0]/len(unique_authors_vc),2)}%) are unknown." )

px.bar(unique_authors_vc[1:50], 
       title='Unique Authors', 
       labels = {"value": "Number of Articles Written","index": "Author"},
       width = 1700, height = 600)

### Category Prevalence Over Time

In [62]:
df['year'] = df['date'].dt.year
df_year = df.groupby(['year','category'])['category'].count()
df_year.head()

year  category      
2012  BLACK VOICES      307
      BUSINESS          679
      COMEDY            604
      CRIME             199
      CULTURE & ARTS    343
Name: category, dtype: int64

In [74]:
for year in df_year.index:
    print(year,df_year.loc[year])

(2012, 'BLACK VOICES') 307
(2012, 'BUSINESS') 679
(2012, 'COMEDY') 604
(2012, 'CRIME') 199
(2012, 'CULTURE & ARTS') 343
(2012, 'DIVORCE') 1256
(2012, 'ENTERTAINMENT') 562
(2012, 'ENVIRONMENT') 482
(2012, 'FOOD & DRINK') 1823
(2012, 'HOME & LIVING') 1620
(2012, 'IMPACT') 332
(2012, 'MONEY') 1434
(2012, 'PARENTING') 3127
(2012, 'QUEER VOICES') 474
(2012, 'SCIENCE') 347
(2012, 'SPORTS') 271
(2012, 'STYLE & BEAUTY') 5190
(2012, 'TECH') 440
(2012, 'TRAVEL') 3229
(2012, 'WEDDINGS') 1493
(2012, 'WELLNESS') 7137
(2013, 'BLACK VOICES') 320
(2013, 'BUSINESS') 846
(2013, 'COMEDY') 487
(2013, 'CRIME') 236
(2013, 'CULTURE & ARTS') 558
(2013, 'DIVORCE') 1772
(2013, 'ENTERTAINMENT') 1015
(2013, 'ENVIRONMENT') 654
(2013, 'FOOD & DRINK') 3420
(2013, 'HOME & LIVING') 2306
(2013, 'IMPACT') 394
(2013, 'MONEY') 81
(2013, 'PARENTING') 4200
(2013, 'QUEER VOICES') 681
(2013, 'SCIENCE') 360
(2013, 'SPORTS') 269
(2013, 'STYLE & BEAUTY') 3482
(2013, 'TECH') 332
(2013, 'TRAVEL') 3384
(2013, 'WEDDINGS') 1755
(2013