# <center> News Classification with NLP and Neural Networks </center>

### Imports

In [1]:
import warnings
import numpy as np
import pandas as pd
import re
import plotly.express as px
from sklearn import preprocessing
import plotly.graph_objects as go
warnings.filterwarnings('ignore')

### Data
- Import JSON file
- Limit to top categories

In [2]:
# Load the line-delimited JSON export into a DataFrame
df = pd.read_json('News_Category_Dataset_v2.json', lines=True)

# The five categories with the most articles
category_counts = df['category'].value_counts()
top_5_categories = category_counts.head(5).index

# Keep only the rows that belong to those categories
in_top_categories = df['category'].isin(top_5_categories)
df = df.loc[in_top_categories]

# Show the resulting shape and a random sample of rows
print(df.shape)
df.sample(5)

(86160, 6)


Unnamed: 0,category,headline,authors,link,short_description,date
21734,POLITICS,"News Roundup for July 12, 2017","Outspeak, ContributorOutspeak™ is the opinion ...",https://www.huffingtonpost.com/entry/news-roun...,But his emails. 1. The saga of the health care...,2017-07-12
174026,TRAVEL,Do You Need A Suncation?,"Vanessa Van Edwards , Contributor\nModern-day ...",https://www.huffingtonpost.com/entry/do-you-ne...,After the torment of Hurricane Sandy and the l...,2012-11-12
143670,STYLE & BEAUTY,Whitney Thompson Of 'Top Model' Fame Lands Plu...,,https://www.huffingtonpost.com/entry/whitney-t...,More plus-size role models: Last we heard from...,2013-09-30
123426,TRAVEL,3 Ways to Make Your Trip Gay -- Even If You're...,"Mark Chesnut, ContributorFounder and editor, L...",https://www.huffingtonpost.com/entry/3-ways-to...,,2014-05-07
67854,POLITICS,Bernie Sanders Rocks Out With Vampire Weekend ...,Kim Bellware,https://www.huffingtonpost.com/entry/bernie-sa...,The song is actually part of the Democratic pr...,2016-01-31


# Cleaning

#### Check data types

#### Check NaNs 

#### Check Duplicates

In [3]:
# Count fully duplicated rows before removing them
n_duplicates_before = df.duplicated().sum()
print(f"There are {n_duplicates_before} duplicated rows")

# Keep only the first occurrence of each duplicated row
df = df.drop_duplicates()

# Confirm that no duplicated rows remain
n_duplicates_after = df.duplicated().sum()
print(f"There are now {n_duplicates_after} duplicated rows")

There are 7 duplicated rows
There are now 0 duplicated rows


### Dates

In [4]:
# Reduce the publication timestamp to its year.
# Guarded so the cell can be re-run: after the first run the column holds
# plain integers, which have no `.dt` accessor and would raise here.
if pd.api.types.is_datetime64_any_dtype(df['date']):
    df['date'] = df['date'].dt.year

#### Authors 
- The `authors` field is a list containing:
    - Name(s)
    - Titles
    - Organizations
    - Misc comments
- It also contains many NaNs in the form of empty strings
- Approach:
    - Replace NaNs with 'unknown'
    - Extract author names from the field, create new field named `author_names` to be used as a bigram
    - Leave the rest of the information in a new field named `author_notes`

### Author Names
- Replace missing
- Get Names
- Clean non-name details
- Cast as strings

In [5]:
# Missing authors are stored as empty strings; label them 'unknown'
authors_missing = df['authors'] == ''
df['authors'] = df['authors'].mask(authors_missing, 'unknown')

In [6]:
def _extract_author_names(authors):
    """Return the lower-cased author name(s) found before the first comma.

    A leading "By" byline prefix is dropped, and co-authors joined with
    "and" are split into separate list entries.

    Parameters
    ----------
    authors : str
        Raw value of the `authors` field.

    Returns
    -------
    list of str
        One lower-cased, whitespace-trimmed entry per author name.
    """
    # Strip "By" only as a leading word; removing it everywhere would also
    # mangle names that merely contain those letters (e.g. "Byron").
    without_byline = re.sub(r'^\s*By\b\s*', '', authors)
    names_part = without_byline.split(',')[0].strip().lower()
    # The text is already lower-cased here, so the separator must be too
    # (splitting on ' And ' after lower-casing can never match).
    return [name.strip() for name in names_part.split(' and ')]


# Get author names from the authors field as a list of lower-cased names
df['author_names'] = df['authors'].apply(_extract_author_names)

In [7]:
def _author_notes(authors):
    """Return everything after the first comma of `authors` as one string."""
    cleaned = (
        authors
        .replace('By', '')
        .replace('\n', '')
        .replace('Contributor', 'Contributor ')
        .strip()
    )
    # Discard the first comma-separated piece (the author names); the
    # remaining pieces are concatenated without a separator.
    _, *details = cleaned.split(',')
    return ''.join(details).strip()


# Keep the non-name part of the authors field in its own column, as strings
df['author_notes'] = df['authors'].apply(_author_notes).astype(str)

### Links
- The links are not helpful in their current form; we need to extract keywords from them

In [8]:
# Hyphens and underscores in the URL are treated the same as commas
_LINK_SEPARATORS = str.maketrans({'-': ',', '_': ','})


def _link_keywords(link):
    """Return the space-joined pieces of `link` between its first and last two separators."""
    pieces = link.translate(_LINK_SEPARATORS).replace("entry/", ',').split(',')
    # Drop the first piece and the last two pieces, keep the rest
    return ' '.join(pieces[1:-2])


# Build the keyword string for every link
df['link_keywords'] = df['link'].map(_link_keywords)

# View a sample
df['link_keywords'].sample(5)

122221                            what the media miss about
194966                fashionably late style news this week
10064     franken to address senate amid groping and for...
186692                            a supposedly fun thing th
116609                            miles teller cant find th
Name: link_keywords, dtype: object

### Join all text columns

In [9]:
# Concatenate every text-bearing column, left to right, separated by single spaces
text_parts = [
    df['headline'],
    df['short_description'],
    df['link_keywords'],
    df['author_names'].astype(str),
    df['author_notes'],
    df['date'].astype(str),
]
text = text_parts[0]
for part in text_parts[1:]:
    text = text + ' ' + part
df['text'] = text

### Drop unwanted features

In [10]:
# Columns that are no longer needed as separate features
unwanted_columns = ['link', 'authors', 'headline', 'short_description',
                    'date', 'link_keywords', 'author_notes', 'author_names']

# errors='ignore' skips columns that are already gone, so the cell can be re-run
df = df.drop(columns=unwanted_columns, errors='ignore')

# View a sample
df.sample(5)

Unnamed: 0,category,text
86575,POLITICS,"Obama Honors Military Families, Cheers On Team..."
191849,TRAVEL,Alaska Airlines Flight Attendants Say New Unif...
3782,ENTERTAINMENT,Billy Bush Reveals Why He Didn’t Shut Down Don...
29278,ENTERTAINMENT,Jay Pharoah Says 'Saturday Night Live' Cast Me...
37652,ENTERTAINMENT,Bella Thorne Says She Received Death Threats A...


# <center> -------------------------------------------------------------------- </center>

# EDA

#### View categories

#### View length of texts

### Author Activity

# <center> -------------------------------------------------------------------- </center>

# Preprocessing Text Data
- Lower all words
- Handle misspellings?
- Stop words
- Stem / Lemmatize text
- Tokenization or specialized regex?
- Use all words or just most frequent?
- Use bigrams, POS tagging, mutual information scores?
- What sort of vectorization? (Boolean / Count / TF-IDF / Word2Vec)

In [11]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer

### Encode Target Variable

In [12]:
# Fit a label encoder on the category names
le = preprocessing.LabelEncoder()
le.fit(df['category'])

# Map every row's category to its integer label
df['class_label'] = le.transform(df['category'])

### Get Stopwords

In [13]:
# English stopwords, every single punctuation character, and a few
# multi-character punctuation tokens
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    "''", '""', '...', '``',
]

### Bag of Words Counts (Count Vectorization)
- The count vectorizer performs tokenization on its own (why do we do it also?)

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def c_vectorize(text):
    """Fit a bag-of-words CountVectorizer on `text`.

    Words in `stopwords_list` are excluded from the vocabulary.

    Returns
    -------
    tuple
        (document-term count matrix for `text`, the fitted vectorizer)
    """
    vectorizer = CountVectorizer(stop_words=stopwords_list)
    document_term_matrix = vectorizer.fit_transform(text)
    return document_term_matrix, vectorizer

# Features are the combined text; the target is the encoded category
X, y = df["text"], df["class_label"]

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=8,
)

# The vocabulary is fitted on the training split only;
# the test split is transformed with that same fitted vectorizer
X_train_tokenized, count_vectorizer = c_vectorize(X_train)
X_test_tokenized = count_vectorizer.transform(X_test)

### Model: Multinomial Naive Bayes

### Evaluation

In [22]:
from sklearn.naive_bayes import MultinomialNB
# Fit the classifier on the training counts (`fit` returns the estimator itself)
clf = MultinomialNB().fit(X_train_tokenized, y_train)

# Predict class labels for the held-out test counts
y_predicted_counts = clf.predict(X_test_tokenized)

In [23]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import confusion_matrix

def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Precision, recall and F1 use weighted averaging across classes.
    Every value is rounded to three decimal places.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1)
    """
    weighted_scorers = (precision_score, recall_score, f1_score)
    precision, recall, f1 = (
        round(scorer(y_test, y_predicted, average='weighted'), 3)
        for scorer in weighted_scorers
    )
    accuracy = round(accuracy_score(y_test, y_predicted), 3)

    return accuracy, precision, recall, f1

# Score the count-vectorizer model on the held-out test split
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_counts)

# Report one metric per line
print(
    f"Accuracy = {accuracy} \n"
    f"Precision = {precision} \n"
    f"Recall = {recall} \n"
    f"f1 = {f1}"
)

Accuracy = 0.954 
Precision = 0.954 
Recall = 0.954 
f1 = 0.954


In [None]:
def get_most_important_features(vectorizer, model, n=5):
    """Rank vocabulary words by their per-class model weight.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer exposing `vocabulary_`, a mapping of word -> column index.
    model : object
        Fitted classifier exposing a 2-D per-class weight array as `coef_`, or,
        when that attribute is absent (e.g. MultinomialNB in recent scikit-learn
        releases), as `feature_log_prob_`.
    n : int, default 5
        Number of words to report at each end of the ranking.

    Returns
    -------
    dict
        Maps each class index to a dict with two lists of (weight, word) pairs:
        'tops' holds the n highest-weighted words in ascending weight order,
        'bottom' holds the n lowest-weighted words in descending weight order.
    """
    index_to_word = {index: word for word, index in vectorizer.vocabulary_.items()}

    # Prefer `coef_` so existing callers are unaffected; fall back to the
    # per-class log-probabilities when the model no longer provides it.
    weights = getattr(model, 'coef_', None)
    if weights is None:
        weights = model.feature_log_prob_

    classes = {}
    for class_index in range(weights.shape[0]):
        word_importances = [
            (el, index_to_word[i]) for i, el in enumerate(weights[class_index])
        ]
        # Rank by signed weight, highest first. Ranking by absolute value
        # would put near-zero weights at the top for models with signed
        # coefficients; for all-negative log-probabilities both orders agree.
        sorted_coeff = sorted(word_importances, key=lambda x: x[0], reverse=True)
        tops = sorted(sorted_coeff[:n], key=lambda x: x[0])
        # `sorted_coeff[-0:]` is the whole list, so handle n == 0 explicitly
        bottom = sorted_coeff[-n:] if n else []
        classes[class_index] = {
            'tops': tops,
            'bottom': bottom
        }
    return classes

# Collect, for every class, the 10 'tops' and 10 'bottom' words according to
# the fitted count vectorizer and classifier
importance = get_most_important_features(count_vectorizer, clf, 10)

In [None]:
# Display the per-class word rankings
importance