# <center> News Classification with NLP </center>

### Imports

In [None]:
import warnings
import numpy as np
import pandas as pd
import re
import plotly.express as px
from sklearn import preprocessing
import plotly.graph_objects as go
warnings.filterwarnings('ignore')

### Data
- Import JSON file
- Limit to top categories

In [None]:
# Read the line-delimited JSON file (lines=True: one record per line) into a DataFrame
df = pd.read_json('News_Category_Dataset_v2.json', lines=True)

# Get the top categories by article count
# NOTE: this filter is currently commented out, so no category filtering happens.
# NOTE(review): the name says "top_5" but the slice [:10] would select ten categories.
#top_5_categories = df['category'].value_counts()[:10].index

# Limit the DataFrame to the top categories
#df = df[df['category'].isin(top_5_categories)]

# View Results: print (rows, columns), then draw a random sample of five rows
print(df.shape)
df.sample(5)

# Cleaning

#### Check data types

#### Check NaNs 

#### Check Duplicates

In [None]:
# Count fully duplicated rows before cleaning
duplicates_before = df.duplicated().sum()
print(f"There are {duplicates_before} duplicated rows")

# Remove the duplicated rows
df = df.drop_duplicates()

# Recount to confirm the duplicates are gone
duplicates_after = df.duplicated().sum()
print(f"There are now {duplicates_after} duplicated rows")

### Dates

In [None]:
# Reduce the publication timestamp to its year.
# Guarded so the cell can be re-run: after the first run the column holds
# plain integers, and the `.dt` accessor would raise AttributeError.
if pd.api.types.is_datetime64_any_dtype(df['date']):
    df['date'] = df['date'].dt.year

#### Authors 
- The `authors` field is a list containing:
    - Name(s)
    - Titles
    - Organizations
    - Misc comments
- It also contains many NaNs in the form of empty strings
- Approach:
    - Replace NaNs with 'unknown'
    - Extract author names from the field, create new field named `author_names` to be used as a bigram
    - Leave the rest of the information in a new field named `author_notes`

### Author Names
- Replace missing
- Get Names
- Clean non-name details
- Cast as strings

In [None]:
# Empty author strings become the placeholder 'unknown'; other values are kept as-is
df['authors'] = df['authors'].apply(lambda name: 'unknown' if name == '' else name)

In [None]:
# Get author names from the raw authors string:
#   remove "By", keep the text before the first comma, lowercase it,
#   then split co-authors on " and ".
# The separator has to be lowercase because it is applied after .lower();
# the previous ' And ' separator could never match, so co-authors were
# never split into separate names.
# NOTE(review): replace('By', '') removes "By" anywhere in the string
# (e.g. inside "Byron") - confirm whether only a leading prefix is intended.
df['author_names'] = df['authors'].apply(
    lambda x: x.replace('By', '').strip().split(',')[0].lower().split(' and ')
)

In [None]:
def _author_notes(raw_authors):
    """Return the comma-separated pieces after the first one, concatenated."""
    cleaned = raw_authors.replace('By', '')
    cleaned = cleaned.replace('\n', '')
    # Add a space after 'Contributor' so it does not run into the following text
    cleaned = cleaned.replace('Contributor', 'Contributor ')
    pieces_after_name = cleaned.strip().split(',')[1:]
    return ''.join(pieces_after_name).strip()


# Keep everything in the authors field except the leading name part
df['author_notes'] = df['authors'].apply(_author_notes)

# Ensure the column holds strings
df['author_notes'] = df['author_notes'].astype(str)

### Links
- The links are not helpful in their current form; we need to extract keywords from them

In [None]:
def _link_keywords(url):
    """Return the keyword portion of an article link as a space-separated string."""
    # Turn every delimiter into a comma so a single split handles them all
    for delimiter in ('-', '_', 'entry/'):
        url = url.replace(delimiter, ',')
    # Drop the first piece and the last two pieces, keeping what lies between
    return ' '.join(url.split(',')[1:-2])


# Derive the keyword string for every link
df['link_keywords'] = df['link'].apply(_link_keywords)

# View Sample
df['link_keywords'].sample(5)

### Join all text columns

In [None]:
# Columns to merge, in order; non-string columns are cast to str first
text_parts = [
    df['headline'],
    df['short_description'],
    df['link_keywords'],
    df['author_names'].astype(str),
    df['author_notes'],
    df['date'].astype(str),
]

# Concatenate left to right with a single space between consecutive parts
combined = text_parts[0]
for part in text_parts[1:]:
    combined = combined + ' ' + part

df['text'] = combined

### Drop unwanted features

In [None]:
# Source and intermediate columns that should be removed
unwanted_columns = {
    'link', 'authors', 'headline', 'short_description',
    'date', 'link_keywords', 'author_notes', 'author_names',
}

# Only drop the ones still present, so the cell can be re-run
present_unwanted = [name for name in df.columns if name in unwanted_columns]
df = df.drop(columns=present_unwanted)

# View Sample
df.sample(5)

# <center> -------------------------------------------------------------------- </center>

# EDA

#### View categories

#### View length of texts

### Author Activity

# <center> -------------------------------------------------------------------- </center>

# Preprocessing Text Data
- Lower all words
- Handle misspellings?
- Stop words
- Stem / Lemmatize text
- Tokenization or specialized regex?
- Use all words or just most frequent?
- Use bigrams, POS tagging, mutual information scores?
- What sort of vectorization? (Boolean / Count / TF-IDF / Word2Vec)

In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer

### Encode Target Variable / OHE

In [None]:
# Create the encoder and turn each category name into an integer label
le = preprocessing.LabelEncoder()
encoded_labels = le.fit_transform(df['category'])
df['class_label'] = encoded_labels

# Lookup from integer label back to category name
# (first category seen within each label group)
first_category_per_label = df.groupby('class_label')['category'].first()
label_to_category = dict(first_category_per_label)

### Get Stopwords

In [None]:
# English stopwords, every punctuation character, and a few quote/ellipsis tokens
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    "''", '""', '...', '``',
]

### Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

# Independent variable is the combined text; dependent variable is the encoded label
X, y = df["text"], df["class_label"]

# Hold out 20% of the rows for testing, with a fixed random_state
split_parts = train_test_split(X, y, test_size=0.2, random_state=8)
X_train, X_test, y_train, y_test = split_parts

### OHE target for NN

In [None]:
# One-hot encode the training labels for the neural network.
# Imported here because this cell comes before the Keras import cell further
# down; without it the call below raises NameError on a top-to-bottom run.
from keras.utils.np_utils import to_categorical

# Fix the width to the number of encoded categories so it does not depend
# on which labels happen to land in the training split.
y_train_ohe = to_categorical(y_train, num_classes=len(le.classes_))

### Count Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vectorizer with the custom stop-word list and learn the
# vocabulary from the training texts only
count_vectorizer = CountVectorizer(stop_words=stopwords_list)
count_vectorizer.fit(X_train)

# Convert both splits to count matrices using that fitted vocabulary
X_train_tokenized, X_test_tokenized = (
    count_vectorizer.transform(texts) for texts in (X_train, X_test)
)

## Model: Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training count matrix
MNB_clf = MultinomialNB().fit(X_train_tokenized, y_train)

# Predict class labels for the test count matrix
y_predicted_counts = MNB_clf.predict(X_test_tokenized)

### Evaluation

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import confusion_matrix

def get_metrics(y_test, y_predicted):
    """Score predictions against true labels.

    Precision, recall and F1 use average='weighted'; every score is
    rounded to three decimal places.

    Returns:
        Tuple of (accuracy, precision, recall, f1).
    """
    # Same call signature for the three weighted scores, so compute them together
    weighted_scorers = (precision_score, recall_score, f1_score)
    precision, recall, f1 = (
        round(scorer(y_test, y_predicted, average='weighted'), 3)
        for scorer in weighted_scorers
    )
    accuracy = round(accuracy_score(y_test, y_predicted), 3)

    return accuracy, precision, recall, f1

# Score the Naive Bayes predictions on the test split
metric_values = get_metrics(y_test, y_predicted_counts)
accuracy, precision, recall, f1 = metric_values

# Report one metric per line
print(f"Accuracy = {accuracy} \nPrecision = {precision} \nRecall = {recall} \nf1 = {f1}")

## Model: Neural Network

In [None]:
from keras import models
from keras import layers
from keras import optimizers
from keras.utils.np_utils import to_categorical

### Create model and layers

In [None]:
# Show the dimensions of the vectorized training matrix
X_train_tokenized.shape

In [None]:
# Initialize a sequential model
model = models.Sequential()

# Size the network from the data instead of hard-coding it: the input width
# is the number of columns in the vectorized training matrix and the output
# width is the number of one-hot label columns, so the model keeps working
# if the vocabulary or the set of categories changes.
n_features = X_train_tokenized.shape[1]
n_classes = y_train_ohe.shape[1]

# Two hidden layers with relu activation
model.add(layers.Dense(50, activation='relu', input_shape=(n_features,)))
model.add(layers.Dense(25, activation='relu'))

# Output layer with softmax activation, one unit per class
model.add(layers.Dense(n_classes, activation='softmax'))

In [None]:
# Training configuration: SGD optimizer, categorical cross-entropy loss, accuracy metric
compile_settings = {
    'optimizer': 'SGD',
    'loss': 'categorical_crossentropy',
    'metrics': ['acc'],
}
model.compile(**compile_settings)

In [None]:
# Train for 20 epochs on the vectorized training data and one-hot labels,
# keeping the returned training history
history = model.fit(x=X_train_tokenized, y=y_train_ohe, epochs=20)