## Vectorizing Approach

In [1]:
#### import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# feature_extraction is used for converting text into numerical features 
from sklearn import feature_extraction
# linear_model contains various linear models for regression and classification
from sklearn import linear_model
# model_selection provides train_test_split() and the cross-validation helpers
from sklearn import model_selection
# preprocessing includes utilities for scaling, normalization, etc.
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix

In [2]:
# read the competition's train and test splits into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

In [3]:
# gain basic structure of the train dataset: schema, shape and a few sample rows
print('*' * 100)
print('The concise summary of the train dataset:')
train_df.info()

print('*' * 100)
print(f'The shape the train dataset: {train_df.shape}')

print('*' * 100)
# start the frame on its own line; when it shares a line with the label the
# column header is shifted right and no longer lines up with the data rows
print(f'The first 5 samples of the dataset: \n{train_df.head()}')

print('*' * 100)
print('The text in first 5 samples of the dataset:')
# slice the text column once instead of materialising a whole row per lookup;
# slicing also copes with frames shorter than five rows
print({i: text for i, text in enumerate(train_df['text'].iloc[:5])})

****************************************************************************************************
The concise summary of the train dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
****************************************************************************************************
The shape the train dataset: (7613, 5)
****************************************************************************************************
The first 5 samples of the dataset:    id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1 

In [4]:
# create an instance of the CountVectorizer()
vectorizer = feature_extraction.text.CountVectorizer()

# learn the vocabulary from the training tweets and, in the same pass,
# encode them as a sparse matrix of word counts
# (a sparse matrix only stores non-zero values to save memory)
X_train = vectorizer.fit_transform(train_df['text'])

# print a slice of the vocabulary the vectorizer built from the tweets
print('*' * 100)
print(f'The vocabularies of tweets: \n{vectorizer.get_feature_names_out()[450:500]}')

# read the shape straight from the sparse matrix; calling toarray() here would
# allocate the full dense (tweets x vocabulary) array only to throw it away
print('*' * 100)
print(f'The shape of the numerical vectors of tweets: \n{X_train.shape}')

****************************************************************************************************
The vocabularies of tweets: 
['2fenu1syu6' '2fggzqn1v4' '2fibe2haxc' '2for1lapdances' '2fs649qdwx'
 '2gljhvead9' '2h0dpmv2ef' '2hocep41kh' '2hours' '2hv2y2m2oz' '2i4eoggo5j'
 '2iafpmqjep' '2ii3brc7nx' '2jbibeib9g' '2jgvhw7yzs' '2jhtlwuey0'
 '2jr3yo55dr' '2jxkmkpalp' '2k13' '2k15' '2kdq56xtws' '2lbtshxi3c'
 '2leezy' '2lgtzkwmqw' '2liwkjybe9' '2lqyxzq5dn' '2m1gneaifl' '2minutemix'
 '2mnqc73hfk' '2mwc9ywjzy' '2nd' '2nip3d15dx' '2nndbgwyei' '2ns5tfnxpa'
 '2o7eva1coe' '2okscwyohc' '2oqsgzqlbz' '2oroyunym2' '2pack' '2pcs'
 '2pimg9bice' '2pm' '2ppzgpxybi' '2q3fuerey5' '2racaivffq' '2rtq9qmgpb'
 '2sdmichb2z' '2sgdofsmrq' '2slow2report' '2snyghaivs']
****************************************************************************************************
The shape of the numerical vectors of tweets: 
(7613, 21637)


In [5]:
# encode the test tweets with the vocabulary learned from the training tweets;
# the vectorizer is deliberately NOT re-fitted here, because fitting on test
# data would leak information from outside the training set into the model
test_texts = test_df['text']
X_test = vectorizer.transform(test_texts)

In [6]:
# linear classifier with default settings
clf = linear_model.RidgeClassifier()

# score the classifier with 3-fold cross-validation using the F1 metric;
# several folds give a more reliable estimate than a single hold-out split
scores = model_selection.cross_val_score(
    estimator=clf,
    X=X_train,
    y=train_df["target"],
    scoring="f1",
    cv=3,
)
print(f'F1 scores for each fold: {scores}')

F1 scores for each fold: [0.59453669 0.5642787  0.64082434]


In [7]:
# fit on the complete training matrix
clf.fit(X_train, train_df["target"])

# fill the sample submission's target column with the test predictions and save it
sample_submission = pd.read_csv(
    "/kaggle/input/nlp-getting-started/sample_submission.csv"
).assign(target=clf.predict(X_test))
sample_submission.to_csv("submission.csv", index=False)

## Pretrained DistilBERT Approach

In [8]:
# upgrade the keras-core package to the latest version
# NOTE(review): no version is pinned, so a re-run may pull a different release
# than the one that produced the recorded outputs — consider pinning
!pip install keras-core --upgrade
# upgrade the keras-nlp package to the latest version with minimal output during the installation process
!pip install -q keras-nlp --upgrade
import os
# set the environment variable KERAS_BACKEND to 'tensorflow' to specify TensorFlow as the backend for Keras
# this is done here, before the cell that imports keras_core / keras_nlp
# (presumably the backend is read at import time — TODO confirm against the Keras docs)
os.environ['KERAS_BACKEND'] = 'tensorflow'



In [9]:
# TensorFlow is an open-source library for numerical computation and machine learning
import tensorflow as tf
import keras_core as keras
# keras_nlp is a library providing Natural Language Processing (NLP) tools and models for use with Keras
import keras_nlp
from tensorflow.keras.optimizers import Adam
# these are used to visualize and calculate a confusion matrix, 
# which is a table used to describe the performance of a classification model.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
# seaborn is a visualization tool based on matplotlib
import seaborn as sns

# report the library versions this run is using
print(f"TensorFlow version: {tf.__version__}")
print(f"KerasNLP version: {keras_nlp.__version__}")

2024-02-15 18:01:06.834620: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-15 18:01:06.834725: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-15 18:01:06.971666: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using TensorFlow backend
TensorFlow version: 2.15.0
KerasNLP version: 0.7.0


In [10]:
# character length of every tweet, computed with the vectorized string accessor
# (no per-row Python lambda; a missing text yields NaN instead of raising)
train_df["length"] = train_df["text"].str.len()
test_df["length"] = test_df["text"].str.len()

# summary statistics of the tweet lengths, for the train and test sets
print('*' * 100)
print("Train Length Stat")
print(train_df["length"].describe())

print('*' * 100)
print("Test Length Stat")
print(test_df["length"].describe())

****************************************************************************************************
Train Length Stat
count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: length, dtype: float64
****************************************************************************************************
Test Length Stat
count    3263.000000
mean      102.108183
std        33.972158
min         5.000000
25%        78.000000
50%       109.000000
75%       134.000000
max       151.000000
Name: length, dtype: float64


In [11]:
# training configuration shared by the DistilBERT cells
BATCH_SIZE = 32
NUM_TRAINING_EXAMPLES = train_df.shape[0]
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2
# number of full batches in the training split of one epoch
# truncate the split size to whole examples first so the floor division is done
# on integers and the result is an int (the previous form produced a float)
STEPS_PER_EPOCH = int(NUM_TRAINING_EXAMPLES * TRAIN_SPLIT) // BATCH_SIZE

EPOCHS = 5
# tf.data.AUTOTUNE is the stable name of the former tf.data.experimental.AUTOTUNE
AUTO = tf.data.AUTOTUNE

In [12]:
# features are the raw tweet texts, labels are the binary targets
X, y = train_df["text"], train_df["target"]

# hold out a VAL_SPLIT share of the labelled data for validation (fixed seed)
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X,
    y,
    test_size=VAL_SPLIT,
    random_state=42,
)

# unlabelled tweets to predict for the submission
X_test = test_df["text"]

In [13]:
# name of the pretrained DistilBERT preset to load
preset = "distil_bert_base_en_uncased"

# preprocessor matching the preset: tokenizes the text, maps tokens to IDs and
# builds the attention mask, with sequences set to 160 tokens
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    sequence_length=160,
    name="preprocessor_4_tweets",
)

# pretrained classifier with a 2-class head; it receives the preprocessor above,
# so raw strings can be fed directly to fit() / predict()
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    preprocessor=preprocessor,
    num_classes=2,
)

# show the layers and parameter counts
classifier.summary()

Attaching 'tokenizer.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.txt' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
  return id(getattr(self, attr)) not in self._functional_layer_ids
  return id(getattr(self, attr)) not in self._functional_layer_ids


In [14]:
# integer class labels + raw logit outputs -> sparse categorical cross-entropy on logits
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Adam with a very low learning rate (1e-5) for fine-tuning
optimizer = Adam(learning_rate=1e-5)

# configure the classifier for training; accuracy is tracked every epoch
classifier.compile(
    loss=loss_fn,
    optimizer=optimizer,
    metrics=["accuracy"],
)

# fine-tune on the training split and evaluate on the validation split after each epoch
history = classifier.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,  # samples per gradient update
    epochs=EPOCHS,  # full passes over the training split
    validation_data=(X_val, y_val),
)

Epoch 1/5


I0000 00:00:1708020111.618369      76 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
def displayConfusionMatrix(true_labels, predicted_labels, title):
    """Draw the confusion matrix of a set of predictions as an annotated heatmap.

    Args:
        true_labels: ground-truth class labels.
        predicted_labels: predicted class labels, aligned with ``true_labels``.
        title: text appended to the "Confusion Matrix: " figure title.
    """
    matrix = confusion_matrix(true_labels, predicted_labels)

    # explicit figure/axes pair; rows are actual classes, columns are predictions
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix: {title}')
    ax.set_ylabel('Actual Label')
    ax.set_xlabel('Predicted Label')
    fig.tight_layout()
    plt.show()

In [16]:
# the classifier outputs raw logits (one column per class), not labels, so take
# the arg-max over the class axis to get hard class predictions — the same
# conversion the submission cell applies to the test predictions
y_pred_train = np.argmax(classifier.predict(X_train), axis=1)

# compare the predicted labels with the ground truth on the training split
displayConfusionMatrix(y_train, y_pred_train, "Training")



In [17]:
# load the submission template
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

# predict the test tweets and turn the per-class outputs into class indices
test_logits = classifier.predict(X_test)
sample_submission["target"] = np.argmax(test_logits, axis=1)

# write the file in the competition's format
sample_submission.to_csv("submission.csv", index=False)
print('Completed!')

Completed!
