In [1]:
#The goal of this program is to continue my exploration into Natural Language Processing and try out additional pre-processing
#techniques. In my previous NLP project, a logistic regression model was built that classified reviews of prescription drugs.
#The reviews were placed into a TF-IDF vectorizer, and classified from there. In this project, I am building a neural net that
#takes portions of text from BBC articles and places them into one of three different classes based on the content. In contrast
#to my previous project, I will also be employing lemmatization and Principle Component Analysis within this code.

#Note: The original dataset can be found here: https://www.kaggle.com/datasets/shivamkushwaha/bbc-full-text-document-classification



In [2]:
#Importing all the modules used for this project

import pandas as pd
import tensorflow as tf
import keras
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import math
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import re

In [3]:
#Loading the articles labeled as business, entertainment, or politics. 

#Note: All the files used in this program originally consisted of hundreds of individual .txt files that loaded poorly into 
#dataframes and were difficult to pre-process. As such, I manually split the files into training and testing folders, and 
#merged each .txt file into a single massive document using a free tool online to save time.
#This tool can be found at the following link: https://it365.gitlab.io/txt-merge/

business = pd.read_fwf(r"C:\Users\hecto\OneDrive\Documents\Jupyter Notebook\BBC Text Data\training\merged_business.txt")
entertainment = pd.read_fwf(r"C:\Users\hecto\OneDrive\Documents\Jupyter Notebook\BBC Text Data\training\merged_entertainment.txt")
politics = pd.read_fwf(r"C:\Users\hecto\OneDrive\Documents\Jupyter Notebook\BBC Text Data\training\merged_politics.txt")


In [4]:
#Converting our dataframes to numpy arrays for convenience when building the master text array. 

business = business.to_numpy()
business = business.flatten()

entertainment = entertainment.to_numpy()
entertainment = entertainment.flatten()

politics = politics.to_numpy()
politics = politics.flatten()

#Building the master array
texts = np.array([])
texts = np.append(texts, business, axis=0)
texts = np.append(texts, entertainment, axis=0)
texts = np.append(texts, politics, axis=0)


#Let's verify the master dataset has the correct size by checking it against the sum of the component datasets
if (business.shape[0] + entertainment.shape[0] + politics.shape[0]) == texts.shape[0]:
    print("Success - Array sizes match")

Success - Array sizes match


In [5]:
#Here, we'll examine the content from our various text classes. As can be seen, many NaN values were imported into the
#dataframe, meaning we'll need to perform some substantial pre-processing.
print(business, "\n")
print(entertainment, "\n")
print(politics, "\n")

['Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.'
 nan nan ... nan nan nan] 

["A Christmas tree that can receive text messages has been unveiled at London's Tate Britain art gallery."
 nan nan ... nan nan nan] 

['Maternity pay for new mothers is to rise by £1,400 as part of new proposals announced by the Trade and Industry Secretary Patricia Hewitt.'
 nan nan ... nan nan nan] 



In [6]:
#Here we'll build the labels which correspond to the writing we've stored in the "texts" array. We'll create a second array
#where we append labels in a way that follows the same order in which the "texts" array was built, ensuring the label correctly 
#corresponds to the text.

labels = []

for i in range(business.shape[0]):
    labels.append("Business")
   

for i in range(entertainment.shape[0]):
    labels.append("Entertainment")
   

for i in range(politics.shape[0]):
    labels.append("Politics")
    
    
#We'll also check the array sizes to ensure a 1-1 correspondence between our predictors and labels.
if len(labels) == texts.shape[0]:
    print("Success - Array sizes match")
    


Success - Array sizes match


In [7]:
#With the target labels created, we'll append them to our predictor text, and build a new dataframe to facilitate preprocesing
labels = np.array(labels)
texts = np.append(texts.reshape(-1,1), labels.reshape(-1,1), axis = 1)

texts = pd.DataFrame(texts, columns = ['Text', "Label"])
texts = texts.dropna().reset_index(drop=True)
texts['Label'] = texts['Label'].astype('category')

In [8]:
#Here, we'll remove unnecessary tokens which act as noise in the text data to facilitate the learning process
print("Original Article:" + "\n" + texts['Text'][1] + "\n")


def clean_data(review):
    
    no_punc = re.sub(r'[^\w\s]', '', review)
    no_digits = ''.join([i for i in no_punc if not i.isdigit()])
    no_dbl_space = re.sub("  ", " ", no_digits)
    return(no_dbl_space)

texts['Text'] = texts['Text'].astype(str)
texts['Text'] = texts['Text'].apply(clean_data)

print("Cleaned Article:" + "\n" + texts['Text'][1])

#Success! Unnecessary noise has been removed from our text! Next, lets lemmatize our reviews for training

Original Article:
The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Cleaned Article:
The firm which is now one of the biggest investors in Google benefited from sales of highspeed internet connections and higher advert sales TimeWarner said fourth quarter sales rose to bn from bn Its profits were buoyed by oneoff gains which offset a profit dip at Warner Bros and less users for AOL


In [9]:
#Here, we lemmatize each token and recombine them into a full string for processing by TF-IDF vectorization 
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')

vectorized_texts = texts["Text"].to_numpy()
lemmatizer = WordNetLemmatizer()
lemmatized_texts = []

for i in range(len(vectorized_texts)):
    tokens = word_tokenize(vectorized_texts[i])
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatized_texts.append(lemmatized_tokens)

delimiter = " "
for i in range(len(lemmatized_texts)):
    lemmatized_texts[i] = delimiter.join(lemmatized_texts[i])
    
vectorizer = TfidfVectorizer()
lemmatized_texts = vectorizer.fit_transform(lemmatized_texts)
print("Lemmatized data shape before PCA: " + str(lemmatized_texts.shape))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hecto\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Lemmatized data shape before PCA: (8296, 20121)


In [10]:
#Finally, we'll reduce the dimensionality of the data to increase the efficiency of the training process, as well as enable 
#ourselves to use a simpler ML model. 
from sklearn.decomposition import IncrementalPCA
DimReducer = IncrementalPCA(n_components=100)

lemmatized_texts = DimReducer.fit_transform(lemmatized_texts)
print("Lemmatized data shape after PCA: " + str(lemmatized_texts.shape))
x_train, x_test, y_train, y_test = train_test_split(lemmatized_texts, texts['Label'], test_size = 0.15)



Lemmatized data shape after PCA: (8296, 100)


In [11]:
#At last we train our Neural Net, and observe the test accuracy to see how well the model generalizes to the test data
myNN = MLPClassifier(hidden_layer_sizes=(50,30,40), activation='tanh', alpha=0.001,max_iter=300)
myNN.fit(x_train, y_train)
predicted = myNN.predict(x_test)
print("Your model testing accuracy: " + str(accuracy_score(predicted, y_test)))


Your model testing accuracy: 0.8128514056224899


In [None]:
#This concludes this exploration in NLP. Whilst it isn't visble from this notebook, I spent many hours experimenting with 
#various model parameters, including the PCA process, model architecture, learning rate, activation functions, and more. 
#Throughout this testing, the testing accuracy did not rise much greater than 80%, so I suspect a more complex model
#architecture is required to achieve my goal of +90% accuracy. As such, I am planning to revisit this project by building a 
#recurrent neural net in Keras with activation functions that vary throughout the layers, and seeing how this effects 
#our predictive capabilities. As always, all suggestions are welcomed, and thanks for taking a look!