# News Classification with DistilBERT and comparison across mulitple models

# Acknowledgement
- https://www.kaggle.com/code/vbmokin/nlp-for-en-bert-cls-10-classifiers


# Description

The original Kaggle notebook applied these binary classification with DistilBERT word embeddings across every label. I'm repurposing this work to apply a multinomial classification for all labels.

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# models
from sklearn.linear_model import LinearRegression, Perceptron, RidgeClassifier, SGDClassifier, LassoCV
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier 
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics

import xgboost as xgb
from xgboost import XGBClassifier


# NN models
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier

import torch
import transformers as ppb

import warnings
warnings.filterwarnings('ignore')


2023-03-13 13:26:08.110546: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
torch.cuda.is_available()

True

In [4]:
df = pd.read_pickle('../../data/raw/raw.p')

In [5]:
df.head()

Unnamed: 0,title,description,published date,url,publisher,text,topic
0,Plight of homeless deepens as Turkey-Syria ear...,Plight of homeless deepens as Turkey-Syria ear...,"Thu, 09 Feb 2023 12:23:00 GMT",https://news.google.com/rss/articles/CBMiaGh0d...,"{'href': 'https://www.reuters.com', 'title': '...",Summary\n\nSummary Companies Death toll reache...,WORLD
1,Zelenskyy makes heartfelt call for Ukraine's E...,Zelenskyy makes heartfelt call for Ukraine's E...,"Thu, 09 Feb 2023 14:55:00 GMT",https://news.google.com/rss/articles/CBMiQGh0d...,"{'href': 'https://www.cnbc.com', 'title': 'CNBC'}",Ukrainian President Volodymyr Zelensky and Pre...,WORLD
2,Huge haul of cocaine floating at sea seized - CNN,Huge haul of cocaine floating at sea seized C...,"Thu, 09 Feb 2023 11:17:00 GMT",https://news.google.com/rss/articles/CBMiUGh0d...,"{'href': 'https://www.cnn.com', 'title': 'CNN'}",CNN —\n\nMore than 3 tons of cocaine floating ...,WORLD
3,"5 things to know for February 9: Earthquake, C...","5 things to know for February 9: Earthquake, C...","Thu, 09 Feb 2023 11:53:00 GMT",https://news.google.com/rss/articles/CBMiSGh0d...,"{'href': 'https://www.cnn.com', 'title': 'CNN'}",CNN —\n\nGet '5 Things' in your inbox If your ...,WORLD
4,Kremlin Endorses Report on U.S. Involvement in...,Kremlin Endorses Report on U.S. Involvement in...,"Thu, 09 Feb 2023 11:36:32 GMT",https://news.google.com/rss/articles/CBMicmh0d...,"{'href': 'https://www.themoscowtimes.com', 'ti...",The Kremlin on Thursday endorsed a blog post b...,WORLD


# Category Distribution

In [6]:
df.topic.value_counts()

ENTERTAINMENT    490
NATION           470
TECHNOLOGY       454
BUSINESS         437
HEALTH           404
WORLD            391
SCIENCE          388
Name: topic, dtype: int64

Luckily a pretty even distribution especially given the nature of the scraper.

# ML Task

Train a classifier for the categories.

In [7]:
df = df[['topic', 'title']]
# # Manual OneHotEncoding because sklearns was taking me a bit.
# for topic in df.topic.unique():
#     df.loc[:,topic] = (df.topic == topic).astype(int)
    
    
# df.drop(['topic'], axis=1, inplace=True)

le = LabelEncoder()
le.fit(df.topic)

df.topic = le.transform(df.topic)

## BERT: Data preparing and modeling

In [8]:
# For pre-trained DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Other models: https://huggingface.co/transformers/pretrained_models.html

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Tokenization the sentences - break them up into word and subwords in the format BERT is comfortable with
tokenized = df['title'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
np.array(padded).shape

(3034, 39)

In [10]:
# Creation variable to ignore (mask) the data padding
attention_mask = np.where(padded != 0, 1, 0)
print(attention_mask.shape)
attention_mask

(3034, 39)


array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [11]:
# Modeling
input_ids = torch.tensor(padded).to(torch.int64)
attention_mask = torch.tensor(attention_mask).to(torch.int64)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [12]:
# Last hidden states
features = last_hidden_states[0][:,0,:].numpy()

In [13]:
features

array([[-0.36741638, -0.08812499, -0.24700838, ..., -0.40141216,
         0.3454349 , -0.01795575],
       [-0.23706903, -0.07861285, -0.0465744 , ..., -0.30821347,
         0.40797278,  0.14277637],
       [-0.22179267, -0.3567314 , -0.11802878, ..., -0.3067955 ,
         0.53071135,  0.12337755],
       ...,
       [-0.42635   , -0.10425763, -0.02006005, ..., -0.16933376,
         0.6849675 ,  0.04143865],
       [ 0.01284773, -0.0919233 , -0.10722934, ..., -0.24347584,
         0.54869324,  0.25608742],
       [-0.30738497, -0.11996962,  0.07331155, ..., -0.26930004,
         0.5723503 ,  0.21982545]], dtype=float32)

## Text classification and prediction by many models

In [14]:
random_state = 0

In [15]:
# Set parameters of models
models = pd.DataFrame(columns = ['name', 'model', 'param_grid'])

# # Linear Regression
# n = 0
# models.loc[n, 'name'] = 'Linear Regression'
# models.at[n, 'model'] = LinearRegression()
# models.at[n, 'param_grid'] = {}

# # Logistic Regression
# n = 1
# models.loc[n, 'name'] = 'Logistic Regression'
# models.at[n, 'model'] = LogisticRegression()
# models.at[n, 'param_grid'] = {'C': np.linspace(0.0001, 100, 20)
#                              }

# Support Vector Machines
n = 2
models.loc[n, 'name'] = 'Support Vector Machines'
models.at[n, 'model'] = SVC()
models.at[n, 'param_grid'] = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                              'tol': [1e-3]
                             }

# Linear SVC
n = 3
models.loc[n, 'name'] = 'Linear SVC'
models.at[n, 'model'] = LinearSVC()
models.at[n, 'param_grid'] = {'dual':[False],
                              'C': np.linspace(1, 15, 15)
                             }

# Random Forest Classifier
n = 4
models.loc[n, 'name'] = 'Random Forest Classifier'
models.at[n, 'model'] = RandomForestClassifier()
models.at[n, 'param_grid'] = {'n_estimators': [40, 50, 60, 100, 500], 
                              'min_samples_split': [30, 40, 50, 100, 200], 
                              'min_samples_leaf': [10, 12, 15, 20, 50],
                              'max_features': ['auto'], 
                              'max_depth': [3, 4, 5, 6], 
                              'criterion': ['gini'], 
                              'bootstrap': [False]                              
                             }

# Bagging Classifier
n = 5
models.loc[n, 'name'] = 'Bagging Classifier'
models.at[n, 'model'] = BaggingClassifier()
models.at[n, 'param_grid'] = {'max_features': np.linspace(0.05, 0.8, 1),
                              'n_estimators': [3, 4, 5, 6],
                              'warm_start' : [False]
                             }

# XGB Classifier
n = 6
models.loc[n, 'name'] = 'XGB Classifier'
models.at[n, 'model'] = xgb.XGBClassifier(objective='binary:hinge')  # or binary:logistic
models.at[n, 'param_grid'] = {'n_estimators': [50, 70, 90], 
                              'learning_rate': [0.01, 0.05, 0.1, 0.2],
                              'max_depth': [3, 4, 5]
                             }

# LGBM Classifier
# n = 7
# models.loc[n, 'name'] = 'LGBM Classifier'
# models.at[n, 'model'] = lgb.LGBMClassifier(boosting_type='gbdt',  
#                                            objective='binary' 
#                                            )
# models.at[n, 'param_grid'] = {'n_estimators': [50, 100, 500, 1000], 
#                               'num_leaves': [30, 50, 100, 200], 
#                               'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5]
#                              }

# # MLP Classifier
# n = 8
# models.loc[n, 'name'] = 'MLPClassifier'
# models.at[n, 'model'] = MLPClassifier()
# models.at[n, 'param_grid'] = {'hidden_layer_sizes': [i for i in range(2,10)],
#                               'solver': ['sgd'],
#                               'learning_rate': ['adaptive'],
#                               'max_iter': [1000, 2000]
#                              }

# Avg values
models.loc[9, 'name'] = 'Mean values'

# Max values
models.loc[10, 'name'] = 'Max values'

In [16]:
models

Unnamed: 0,name,model,param_grid
2,Support Vector Machines,SVC(),"{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'..."
3,Linear SVC,LinearSVC(),"{'dual': [False], 'C': [1.0, 2.0, 3.0, 4.0, 5...."
4,Random Forest Classifier,RandomForestClassifier(),"{'n_estimators': [40, 50, 60, 100, 500], 'min_..."
5,Bagging Classifier,BaggingClassifier(),"{'max_features': [0.05], 'n_estimators': [3, 4..."
6,XGB Classifier,"XGBClassifier(base_score=None, booster=None, c...","{'n_estimators': [50, 70, 90], 'learning_rate'..."
9,Mean values,,
10,Max values,,


In [35]:
def model_prediction(models, features, labels, test_size, verbose):
    # Models training and data prediction for all models from DataFrame models
    
    # Splitting train data for model tuning with cross-validation
    #cv_train = ShuffleSplit(n_splits=cv_n_split, test_size=0.2, random_state=random_state)
    train_features, test_features, train_labels, test_labels = train_test_split(features, 
                                                                                labels, 
                                                                                test_size=test_size,
                                                                                random_state=random_state)
    # Total list of prediction by all models
    total_train_pred = []
    total_test_pred = []
    
    # Results
    results = models[['name']].copy()
    results['acc_train'] = results['acc_test'] = 0

    for i in models.index[:-2]:
        # Training
        model = GridSearchCV(models.at[i, 'model'], models.at[i, 'param_grid'])
        model.fit(train_features, train_labels)
        
        # Prediction
        train_pred = model.predict(train_features).round(0).astype('int')
        total_train_pred.append(train_pred)
        test_pred = model.predict(test_features).round(0).astype('int')
        total_test_pred.append(test_pred)
        
        # Scoring
        acc_train = accuracy_score(train_labels, train_pred)
        acc_test = accuracy_score(test_labels, test_pred)
        
        if verbose:
            print(f"Model - {models.loc[i, 'name']}")
            print("Best parameters:", model.best_params_)
            print("Accuracy for training data: %0.3f" % acc_train)
            print("Accuracy for test data: %0.3f" % acc_test)
            print('\n')
        
        # Save results
        results.loc[i,'acc_train'] = acc_train
        results.loc[i,'acc_test'] = acc_test

    # MEAN values
    # Calc prediction
    mean_train_pred = (np.mean(np.array(total_train_pred), axis=0)).astype(int)
    mean_test_pred = (np.mean(np.array(total_test_pred), axis=0)).astype(int)    
    
    # Scoring
    acc_train = accuracy_score(train_labels, mean_train_pred)
    acc_test = accuracy_score(test_labels, mean_test_pred)
    if verbose:
        print(f"Mean prediction values")
        print("Accuracy for training data: %0.3f" % acc_train)
        print("Accuracy for test data: %0.3f" % acc_test)
        
    # Save results
    n = len(results)-2
    results.loc[n,'acc_train'] = acc_train
    results.loc[n,'acc_test'] = acc_test

    # MAX values
    # Calc prediction
    max_train_pred = (np.max(np.array(total_train_pred), axis=0))
    max_test_pred = (np.max(np.array(total_test_pred), axis=0))
    
    # Scoring
    acc_train = accuracy_score(train_labels, max_train_pred)
    acc_test = accuracy_score(test_labels, max_test_pred)
    if verbose:
        print(f"Maximum prediction values")
        print("Accuracy for training data: %0.3f" % acc_train)
        print("Accuracy for test data: %0.3f" % acc_test)
    
    # Save results
    n = len(results)-1
    results.loc[n,'acc_train'] = acc_train
    results.loc[n,'acc_test'] = acc_test

    return results



In [36]:
def target_prediction(models, df, features, test_size=0.2, verbose=True):
    # Text classification model and prediction for given feature "target" (with labels) in df
    
    # Target
    labels = df.topic
    
#     # Extracting the number of examples of each class
#     Relevant_len = df[labels == 1].shape[0]
#     Not_len = df[df[target] == 0].shape[0]
    
#     # Draw bar plot
#     plt.rcParams['figure.figsize'] = (7, 5)
#     plt.bar(10, Relevant_len, 3, label="Relevant", color='green')
#     plt.bar(15, Not_len, 3, label="Not", color='red')
#     plt.legend(loc='upper center')
#     plt.ylabel('Number of examples')
#     plt.title('Proportion of examples for ' + target)
#     plt.show()
    
    # Models training, prediction and save results
    results = model_prediction(models, features, labels, test_size, verbose=verbose)
    results = results.sort_values(by=['acc_test', 'acc_train'], ascending=False)
    results.to_csv(f'models-scoring.csv', index=False)
    display(results)

In [37]:
# Solving NLP Classification tasks
print('Solving NLP Classification tasks')
target_prediction(models, df, features, test_size=0.4, verbose=True)

Solving NLP Classification tasks
Model - Support Vector Machines
Best parameters: {'kernel': 'linear', 'tol': 0.001}
Accuracy for training data: 0.999
Accuracy for test data: 0.984


Model - Linear SVC
Best parameters: {'C': 13.0, 'dual': False}
Accuracy for training data: 0.999
Accuracy for test data: 0.982


Model - Random Forest Classifier
Best parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto', 'min_samples_leaf': 15, 'min_samples_split': 30, 'n_estimators': 500}
Accuracy for training data: 0.966
Accuracy for test data: 0.942


Model - Bagging Classifier
Best parameters: {'max_features': 0.05, 'n_estimators': 5, 'warm_start': False}
Accuracy for training data: 0.997
Accuracy for test data: 0.954


Model - XGB Classifier
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 70}
Accuracy for training data: 0.999
Accuracy for test data: 0.979


Mean prediction values
Accuracy for training data: 0.979
Accuracy for test data: 0

Unnamed: 0,name,acc_train,acc_test
2,Support Vector Machines,0.999451,0.983526
3,Linear SVC,0.999451,0.981878
6,XGB Classifier,0.982967,0.948105
5,Bagging Classifier,0.979121,0.943163
4,Random Forest Classifier,0.966484,0.942339
9,Mean values,0.0,0.0
10,Max values,0.0,0.0
