# <center> News Classification with NLP and Neural Networks</center>

### Imports

In [1]:
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import preprocessing
import plotly.graph_objects as go
warnings.filterwarnings('ignore')

### Data
- Import JSON file
- Limit to top categories

In [2]:
# Load the line-delimited JSON dataset (one record per line) into a DataFrame
df = pd.read_json('News_Category_Dataset_v2.json', lines=True)

# Report the (rows, columns) shape, then preview five randomly chosen rows
print(df.shape)
df.sample(n=5)

(200853, 6)


Unnamed: 0,category,headline,authors,link,short_description,date
195206,WELLNESS,Farming 101: Growing Pains,"Leona Palmer, Contributor\nEx-model, current f...",https://www.huffingtonpost.com/entry/farming_u...,"Farming, I'm learning, is one part obstacle co...",2012-03-29
37669,WEIRD NEWS,Man Who Says He Changed 'Hollywood' Sign To 'H...,David Moye,https://www.huffingtonpost.com/entry/zachary-c...,“It was something to smile and laugh out loud ...,2017-01-10
23544,WORLD NEWS,"On Day One Of The Brexit Talks, EU Reminds Bri...",,https://www.huffingtonpost.com/entry/brexit-ta...,“I am neither optimistic nor pessimistic. I am...,2017-06-19
200508,WEDDINGS,6 Creative Ways To Cut Wedding Costs,,https://www.huffingtonpost.comhttp://money.usn...,Brides may dream of a wedding with an endless ...,2012-02-01
59814,TRAVEL,Japan's Huis Ten Bosch Theme Park Is Opening A...,Suzy Strutner,https://www.huffingtonpost.com/entry/huis-ten-...,...and that's not even the weirdest part.,2016-05-03


# Cleaning

#### Check data types

#### Check NaNs 

#### Check Duplicates

In [3]:
# Count rows that are exact copies of an earlier row
n_duplicates = df.duplicated().sum()
print(f"There are {n_duplicates} duplicated rows")

# Keep only the first occurrence of every repeated row
df = df.drop_duplicates(keep='first')

# Count again to confirm the copies are gone
n_duplicates = df.duplicated().sum()
print(f"There are now {n_duplicates} duplicated rows")

There are 13 duplicated rows
There are now 0 duplicated rows


### Dates

In [4]:
# Reduce each publication timestamp to its calendar year.
# The dtype guard keeps this cell re-runnable: once the column holds integer
# years it no longer has a `.dt` accessor, so running the cell a second time
# would otherwise raise AttributeError.
if pd.api.types.is_datetime64_any_dtype(df['date']):
    df['date'] = df['date'].dt.year

#### Authors 
- The `authors` field is a single string that can contain:
    - Name(s)
    - Titles
    - Organizations
    - Misc comments
- It also contains many missing values in the form of empty strings
- Approach:
    - Replace missing values (empty strings) with 'unknown'
    - Extract author names from the field, create new field named `author_names` to be used as a bigram
    - Leave the rest of the information in a new field named `author_notes`

### Author Names
- Replace missing
- Get Names
- Clean non-name details
- Cast as strings

In [5]:
# Substitute the placeholder 'unknown' wherever the author string is empty
df['authors'] = df['authors'].mask(df['authors'] == '', 'unknown')

In [6]:
# Get author names from the authors string:
#   1. drop the 'By' byline marker and surrounding whitespace
#   2. keep only the text before the first comma
#   3. lower-case it, then split co-authors into a list
# The separator must be the lower-case ' and ': the string has already been
# lower-cased at that point, so the previous ' And ' could never match and
# co-authored bylines were always left as a single list entry.
# NOTE(review): replace('By', '') also removes 'By' inside a name (e.g. 'Byron') - confirm this is acceptable.
df['author_names'] = df['authors'].apply(
    lambda x: x.replace('By', '').strip().split(',')[0].lower().split(' and ')
)

In [7]:
# Tidy recurring formatting artefacts in the author strings (all literal, non-regex replacements)
df['authors'] = (
    df['authors']
    .str.replace('By', '', regex=False)                        # drop the byline marker
    .str.replace('\n', '', regex=False)                        # drop embedded line breaks
    .str.replace('Contributor', 'Contributor ', regex=False)   # put a space after 'Contributor'
    .str.strip()                                               # trim leading/trailing whitespace
)

### Links
- The links are not helpful in their current form; we need to extract keywords from them

In [16]:
# Turn each URL into a space-separated keyword string:
#   - treat '-', '_' and 'entry/' as delimiters by rewriting them to commas
#   - split on commas and discard the first piece and the last two pieces
#   - join what is left with single spaces
df['link_keywords'] = [
    ' '.join(
        url.replace('-', ',').replace('_', ',').replace('entry/', ',').split(',')[1:-2]
    )
    for url in df['link']
]

### Join all text columns

In [17]:
# Build one free-text field per article: headline, description, link keywords,
# authors and the year (rendered as a string), separated by single spaces
df['text'] = df['headline'].str.cat(
    [
        df['short_description'],
        df['link_keywords'],
        df['authors'],
        df['date'].astype(str),
    ],
    sep=' ',
)

### Drop unwanted features

In [18]:
# Remove the intermediate columns; errors='ignore' skips any that are already
# gone, so the cell can be re-run safely
df = df.drop(
    columns=['link', 'authors', 'headline', 'short_description', 'date',
             'link_keywords', 'author_notes', 'author_names'],
    errors='ignore',
)

# Preview five random rows
df.sample(n=5)

Unnamed: 0,category,text
134521,COMEDY,"The Best Dogs-With-Human-Hands Music Video You'll See Today This music video by Brussels-based band Recorders is from early 2013, but it just caught our attention today on Reddit. Since dogs human hands music videos unknown 2014"
93486,HEALTHY LIVING,"From Dormitory to Culinary With the many followers and the attention that her website has garnered, Emily realizes the impact that college eating has during one's college years and beyond and has already had a very positive impact on public health. from dormitory to culinar Bruce Y. Lee, Contributor Director of the Global Obesity Prevention Center (GOPC) and As... 2015"
126723,SPORTS,"Bo Ryan Cutting Down The Nets To Celebrate His First Final Four Trip Is What Sports Is All About (PHOTOS) ""Today would have been my dad's 90th birthday,"" Ryan reminded reporters after his team's win over Arizona in the Elite Eight bo ryan final four cutting down nets Chris Greenberg 2014"
80710,BLACK VOICES,Is Reality TV Ready To Embrace Gay Black Men? There's potential to break down stereotypes -- but also reinforce them. is reality tv ready to embrace black gay men Zeba Blay 2015
146570,TRAVEL,"September Getaways for the Vino Lover Summer might be coming to an end but Californians are excited because September equals wine, and lots of it. september getaways for th FlipKey, Contributor Vacation rental marketplace 2013"


# <center> -------------------------------------------------------------------- </center>

# EDA

#### View categories

#### View length of texts

### Author Activity

# <center> -------------------------------------------------------------------- </center>

# Preprocessing Text Data
- Lower all words
- Handle misspellings?
- Stop words
- Stem / Lemmatize text
- Tokenization or specialized regex?
- Use all words or just most frequent?
- Use bigrams, POS tagging, mutual information scores?
- What sort of vectorization? (Boolean / Count / TF-IDF / Word2Vec)

In [19]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer

### Encode Target Variable
- Assign a unique value to each category

In [20]:
# Fit the label encoder on the category names, then map every row to its integer code
le = preprocessing.LabelEncoder()
le.fit(df['category'])
df['class_label'] = le.transform(df['category'])

# Lookup table from integer label back to category name
label_to_category = df.groupby('class_label')['category'].first().to_dict()

### Get Stopwords

In [21]:
# English stop words, every character of string.punctuation, and a few
# multi-character quote / ellipsis tokens
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    "''", '""', '...', '``',
]

### Test Train Split

In [22]:
from sklearn.model_selection import train_test_split

# Independent variable: the combined text; dependent variable: the encoded category
X, y = df["text"], df["class_label"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=8,
)

### Count Vectorization

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer configured with the custom stop-word list
count_vectorizer = CountVectorizer(stop_words=stopwords_list)

# Learn the vocabulary from the training text only and encode it in the same step
X_train_counts = count_vectorizer.fit_transform(X_train)

# Encode the test text with the vocabulary learned from the training text
X_test_counts = count_vectorizer.transform(X_test)

# View Results
n_observations, n_features = X_train_counts.shape
print(f'There are {n_observations} observations and {n_features} features')

There are 160672 observations and 102965 features


## Model: Neural Network

In [29]:
from keras import models
from keras import layers
from keras import optimizers
from keras.utils.np_utils import to_categorical

### OHE target

In [30]:
from keras.utils.np_utils import to_categorical

# One-hot encode the target for the neural network.
# num_classes is pinned to the number of categories the LabelEncoder learned.
# Without it, to_categorical infers the width from the largest label present
# in y_train, which would be too narrow if the highest-numbered class were
# missing from the training split.
y_train_ohe = to_categorical(y_train, num_classes=len(le.classes_))

### Create model
- Fully connected (dense) layer network with relu activation
- 2 hidden layers with 50 units in 1st and 25 in second
- Softmax classifier for multiclass problem

In [31]:
# Initialize a sequential model
model = models.Sequential()

# Two hidden layers with relu activation (50 units, then 25 units)
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dense(25, activation='relu'))

# Softmax output layer with one unit per column of the one-hot target.
# Deriving the width from y_train_ohe (instead of a hard-coded 41) keeps the
# output layer in step with the target if the set of categories changes.
model.add(layers.Dense(y_train_ohe.shape[1], activation='softmax'))

### Compiling the model
- Loss function = categorical crossentropy
- Optimizer = stochastic gradient descent

In [32]:
# Categorical cross-entropy loss, optimised with stochastic gradient descent;
# accuracy is tracked as the training metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='SGD',
    metrics=['acc'],
)

In [33]:
# Fit on the count-vectorized training text for 20 epochs and keep the
# returned training history
history = model.fit(x=X_train_counts, y=y_train_ohe, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Evaluation

In [34]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_model_metrics(y_test, y_predicted, full_report = False):
    """Print evaluation metrics for a multi-class classifier.

    Parameters
    ----------
    y_test : array-like
        True class labels, one per sample.
    y_predicted : array-like, 2-D
        Per-class scores with one row per sample; the predicted label for a
        row is the index of its largest value (argmax along axis 1).
    full_report : bool, default False
        If falsy, print accuracy plus weighted-average precision, recall and
        f1, each rounded to 3 decimals. If truthy, print scikit-learn's
        per-class ``classification_report`` instead.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Collapse the per-class scores to a single predicted label per sample
    y_pred_bool = np.argmax(y_predicted, axis=1)

    if full_report:
        print(classification_report(y_test, y_pred_bool))
        return

    # The summary metrics are only computed when they are actually printed
    precision = round(precision_score(y_test, y_pred_bool, average='weighted'), 3)
    recall = round(recall_score(y_test, y_pred_bool, average='weighted'), 3)
    f1 = round(f1_score(y_test, y_pred_bool, average='weighted'), 3)
    accuracy = round(accuracy_score(y_test, y_pred_bool), 3)

    print(f"Accuracy = {accuracy} \nPrecision = {precision} \nRecall = {recall} \nf1 = {f1}")

In [37]:
# Score the held-out test set and print the per-class classification report
get_model_metrics(y_test, model.predict(X_test_counts), full_report=True)

              precision    recall  f1-score   support

           0       0.69      0.60      0.64       315
           1       0.76      0.61      0.68       283
           2       0.61      0.52      0.56       909
           3       0.60      0.61      0.61      1242
           4       0.57      0.48      0.53       227
           5       0.66      0.65      0.65      1037
           6       0.58      0.57      0.58       676
           7       0.66      0.54      0.60       189
           8       0.82      0.81      0.81       637
           9       0.61      0.50      0.55       206
          10       0.74      0.81      0.77      3205
          11       0.63      0.54      0.58       283
          12       0.53      0.47      0.50       290
          13       0.84      0.84      0.84      1239
          14       0.57      0.48      0.52       275
          15       0.50      0.56      0.53       498
          16       0.59      0.69      0.64      1327
          17       0.83    