# <center> News Classification with NLP and Neural Networks</center>

### Imports

In [None]:
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import preprocessing
import plotly.graph_objects as go
warnings.filterwarnings('ignore')

### Data
- Import JSON file
- Limit to top categories

In [None]:
# Load the line-delimited JSON dataset (one record per line) into a DataFrame
df = pd.read_json('News_Category_Dataset_v2.json', lines=True)

# Show the (rows, columns) shape, then display five randomly sampled rows
print(df.shape)
df.sample(n=5)

# Cleaning

#### Check data types

#### Check NaNs 

#### Check Duplicates

In [None]:
# Report how many rows are exact duplicates of an earlier row
print(f"There are {df.duplicated().sum()} duplicated rows")

# Remove the duplicates, keeping the first occurrence of each row
df = df.drop_duplicates(keep='first')

# Confirm that no duplicated rows remain
print(f"There are now {df.duplicated().sum()} duplicated rows")

### Dates

In [None]:
# Keep only the year component of the datetime `date` column
df = df.assign(date=df['date'].dt.year)

#### Authors 
- The `authors` field is a list containing:
    - Name(s)
    - Titles
    - Organizations
    - Misc comments
- It also contains many NaNs in the form of empty strings
- Approach:
    - Replace NaNs with 'unknown'
    - Extract author names from the field, create new field named `author_names` to be used as a bigram
    - Leave the rest of the information in a new field named `author_notes`

### Author Names
- Replace missing
- Get Names
- Clean non-name details
- Cast as strings

In [None]:
# Missing authors are stored as empty strings; label them 'unknown' instead
df['authors'] = df['authors'].apply(lambda author: author if author != '' else 'unknown')

In [None]:
# Get author names from the first comma-separated entry of `authors`.
# The text is lower-cased before it is split on the conjunction, so the
# separator has to be the lower-case ' and ' (' And ' can never match after lower()).
df['author_names'] = df['authors'].apply(lambda x: x.replace('By','').strip().split(',')[0].lower().split(' and '))

In [None]:
# Tidy the raw author strings: remove 'By', remove newlines, pad 'Contributor'
# with a trailing space, then trim surrounding whitespace
df['authors'] = (
    df['authors']
    .str.replace('By', '', regex=False)
    .str.replace('\n', '', regex=False)
    .str.replace('Contributor', 'Contributor ', regex=False)
    .str.strip()
)

### Links
- The links are not helpful in their current form; we need to extract keywords from them

In [None]:
# Turn '-', '_' and 'entry/' into commas, split on commas, and keep every piece
# except the first one and the last two
df['link_keywords'] = (
    df['link']
    .str.replace('-', ',', regex=False)
    .str.replace("_", ',', regex=False)
    .str.replace("entry/", ',', regex=False)
    .str.split(',')
    .str[1:-2]
)

# Collapse each list of keywords into one space-separated string
df['link_keywords'] = df['link_keywords'].str.join(' ')

### Join all text columns

In [None]:
# Concatenate every text feature (and the year, cast to string) into a single
# space-separated `text` column
df['text'] = df['headline'].str.cat(
    [
        df['short_description'],
        df['link_keywords'],
        df['authors'],
        df['date'].astype(str),
    ],
    sep=' ',
)

### Drop unwanted features

In [None]:
# Drop each of these features only if it is still present in the columns,
# so the cell can be re-run without raising on already-removed columns
df = df.drop(
    columns=['link','authors','headline','short_description','date','link_keywords','author_notes','author_names'],
    errors='ignore',
)

# Display five randomly sampled rows
df.sample(n=5)

# <center> -------------------------------------------------------------------- </center>

# EDA

#### View categories

#### View length of texts

### Author Activity

# <center> -------------------------------------------------------------------- </center>

# Preprocessing Text Data
- Stop words
- Stem / Lemmatize text
- Tokenization
- Use all words or just most frequent?
- Use bigrams, POS tagging, mutual information scores?
- What sort of vectorization? (Boolean / Count / TF-IDF / Word2Vec)

In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer

### Encode Target Variable
- Assign a unique value to each category

In [None]:
# Create the encoder that maps category names to integer labels
le = preprocessing.LabelEncoder()

# Learn the mapping from the category column, then encode that same column
df['class_label'] = le.fit(df['category']).transform(df['category'])

# Lookup table from each integer label back to its category name
label_to_category = df.groupby('class_label')['category'].first().to_dict()

### Get Stopwords

In [None]:
# English stop words, every single punctuation character, and a few
# multi-character punctuation tokens, gathered into one list
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    "''", '""', '...', '``',
]

### Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

# Features are the combined text; the target is the integer-encoded category
X, y = df["text"], df["class_label"]

# Hold out 20% of all rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8, test_size=0.2)

# Split 20% of the remaining rows off as a validation set (16% of all rows)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=8, test_size=0.2)

### Count Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate CountVectorizer with custom list of stop words
count_vectorizer = CountVectorizer(stop_words= stopwords_list)

# Learn the vocabulary from the training data only
count_vectorizer.fit(X_train)

# Apply the fitted vectorizer to the training, testing and validation data
X_train_counts = count_vectorizer.transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)
X_val_counts = count_vectorizer.transform(X_val)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# View Results
print(f'There are {X_train_counts.shape[0]} observations and {X_train_counts.shape[1]} features')
pd.DataFrame(data= X_val_counts[:5].toarray(), columns = feature_names)

## Model: Neural Network

In [None]:
from keras import models
from keras import layers
from keras import optimizers
from keras.utils.np_utils import to_categorical

### OHE target

In [None]:
from keras.utils.np_utils import to_categorical

# One-hot encode the targets for the neural network.
# num_classes is passed explicitly (from the fitted label encoder) because
# to_categorical otherwise infers the width from the largest label present,
# which could give the train and validation targets different widths.
y_train_ohe = to_categorical(y_train, num_classes=len(le.classes_))
y_val_ohe = to_categorical(y_val, num_classes=len(le.classes_))

### Create model
- Fully connected (dense) layer network with relu activation
- 2 hidden layers with 50 units in 1st and 25 in second
- Softmax classifier for multiclass problem

In [None]:
# Initialize a sequential model
model = models.Sequential()

# Single dense output layer with softmax activation, one unit per class.
# The width comes from the one-hot encoded training targets instead of a
# hard-coded 41, so the layer keeps matching them if the category set changes.
model.add(layers.Dense(y_train_ohe.shape[1], activation='softmax'))

### Compiling the model
- Loss function = categorical crossentropy
- Optimizer = stochastic gradient descent

In [None]:
# Categorical cross-entropy loss, SGD optimizer, accuracy tracked under the key 'acc'
model.compile(
    loss='categorical_crossentropy',
    optimizer='SGD',
    metrics=['acc'],
)

In [None]:
# Fit on the training counts and score the validation set after every epoch
n_epochs = 10
history = model.fit(
    x=X_train_counts,
    y=y_train_ohe,
    epochs=n_epochs,
    validation_data=(X_val_counts, y_val_ohe),
)

### Evaluation

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_model_metrics(actuals, predictions, full_report = False, plot = False, training_history = None):
    """Print classification metrics and optionally plot the accuracy curves.

    Parameters
    ----------
    actuals : array-like
        True class labels, compared against the arg-max of `predictions`.
    predictions : array-like, 2-D
        Per-class scores, one row per observation; the predicted label of a
        row is the index of its largest value (`np.argmax(..., axis=1)`).
    full_report : bool, default False
        If truthy, print sklearn's per-class `classification_report`;
        otherwise print weighted precision, recall, f1 and the accuracy,
        each rounded to 3 decimals.
    plot : bool, default False
        If truthy, plot the 'acc' and 'val_acc' curves per epoch.
    training_history : object with a `.history` dict, optional
        Source of the 'acc' / 'val_acc' curves. When None, the module-level
        `history` variable is used.

    Returns
    -------
    None
        Results are printed (and shown as a figure when `plot` is truthy).
    """
    predicted_labels = np.argmax(predictions, axis=1)

    if full_report:
        print(classification_report(actuals, predicted_labels))
    else:
        precision = round(precision_score(actuals, predicted_labels, average='weighted'),3)
        recall = round(recall_score(actuals, predicted_labels, average='weighted'),3)
        f1 = round(f1_score(actuals, predicted_labels, average='weighted'),3)
        accuracy = round(accuracy_score(actuals, predicted_labels),3)
        print(f"Accuracy = {accuracy} \nPrecision = {precision} \nRecall = {recall} \nf1 = {f1}")

    if plot:
        # Fall back to the module-level `history` when no history is passed in
        fit_history = history if training_history is None else training_history
        curves = fit_history.history

        fig = go.Figure()

        # The x-axis is derived from the number of recorded epochs instead of a
        # hard-coded range(20), so x and y always have the same length
        for key, label in (('acc', 'Train Accuracy'), ('val_acc', 'Validation Accuracy')):
            values = curves[key]
            fig.add_trace(go.Scatter(x=list(range(len(values))), y=values,
                                mode='lines+markers', name=label))

        fig.update_layout(height= 500, width= 700)

        fig.show()

In [None]:
# Print summary metrics on the validation set and plot the accuracy curves
get_model_metrics(
    y_val,
    model.predict(X_val_counts),
    full_report=False,
    plot=True,
)