# TR Text Classification

## Naive Bayes - Baseline Model

## Import Packages

In [44]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from sklearn.model_selection import train_test_split
print(tf.__version__)

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score

2.5.0


## Import Data

In [2]:
df_model = pd.read_csv("./1-Title_Classification/train1.csv")

## EDA

In [3]:
# Drop the ID columns
df_model = df_model.drop(columns = ['ID'])

In [4]:
# distribution of topics

df_model['TOPIC'].value_counts()

0    3107
1    2406
2    2404
Name: TOPIC, dtype: int64

In [5]:
df_model.head()

Unnamed: 0,TITLE,TOPIC
0,RITE AID CORP <RAD> SETS DIVIDEND,0
1,DEL E. WEBB INVESTMENT <DWPA> 4TH QTR NET,0
2,GENERAL HOST CORP <GH> SETS QUARTERLY,0
3,PROFESSOR LIFTS BANC TEXAS <BTX> PREFERRED STAKE,1
4,WINCHELL'S DONUT <WDH> SETS INITIAL QUARTERLY,0


In [None]:
##

In [7]:
x = df_model['TITLE'].values

In [8]:
y = df_model['TOPIC'].values

## Data Prep — Tokenize and Pad Text Data

In [None]:
max_features = 20000

In [34]:
x_tokenizer = text.Tokenizer(max_features)
x_tokenizer.fit_on_texts(list(x))
x_tokenized = x_tokenizer.texts_to_sequences(x)

In [31]:
def longest(list1) :
    l = 0
    for item in list1:
        if len(item) > l:
            l = len(item)
    return l

In [32]:
longest(x_tokenized)

30

In [33]:
max_text_length = longest(x_tokenized)

In [36]:
x_train_val = sequence.pad_sequences(x_tokenized, maxlen=max_text_length)

In [37]:
x_train_val

array([[   0,    0,    0, ..., 2077,   20,   35],
       [   0,    0,    0, ...,    9,    2,    3],
       [   0,    0,    0, ..., 2839,   20,   98],
       ...,
       [   0,    0,    0, ...,   92,  351,   15],
       [   0,    0,    0, ...,  574,   29,  303],
       [   0,    0,    0, ...,   36,   26,    3]], dtype=int32)

## Prepare Embedding Matrix with Pre-trained GloVe Embeddings

In [38]:
# Download and extract GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2021-12-19 23:52:45--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-12-19 23:52:45--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-12-19 23:52:46--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-1

In [40]:
embedding_dims = 100
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((max_features, embedding_dims))
for word, index in x_tokenizer.word_index.items():
    if index > max_features -1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

## Create Embedding Layer

In [53]:
print('Build model...')
model = Sequential()
# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
#load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
#(we don't want to update them during training).
model.add(Embedding(max_features,
                    embedding_dims,
                    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                    trainable=False))
model.add(Dropout(0.2))

Build model...


## Build the Model

In [54]:
filters = 250
kernel_size = 3
hidden_dims = 250

In [55]:
# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(filters,
                 5,
                 padding='valid',
                 activation='relu'))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims, activation='relu'))
model.add(Dropout(0.2))

# We project onto 6 output layers, and squash it with a sigmoid:
model.add(Dense(3, activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         2000000   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 100)         0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, None, 250)         75250     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 250)         0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, None, 250)         312750    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 250)              

In [56]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

## Train Model

In [57]:
X_train, X_test, y_train, y_test  = train_test_split(x,
                                                     y,
                                                     test_size=0.2,
                                                     random_state=42)

In [58]:
batch_size = 32
epochs = 3

In [59]:
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_test, y_test))

Epoch 1/3


ValueError: in user code:

    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:855 train_function  *
        return step_function(self, iterator)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:845 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:3608 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:838 run_step  **
        outputs = model.train_step(data)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:795 train_step
        y_pred = self(x, training=True)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/keras/engine/base_layer.py:1030 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/keras/engine/sequential.py:380 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/keras/engine/functional.py:420 call
        return self._run_internal_graph(
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/keras/engine/functional.py:556 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/keras/engine/base_layer.py:1030 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/keras/layers/convolutional.py:249 call
        outputs = self._convolution_op(inputs, self.kernel)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/ops/nn_ops.py:1012 convolution_v2
        return convolution_internal(
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/ops/nn_ops.py:1142 convolution_internal
        return op(
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/util/deprecation.py:602 new_func
        return func(*args, **kwargs)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/util/deprecation.py:602 new_func
        return func(*args, **kwargs)
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/ops/nn_ops.py:1884 conv1d
        result = gen_nn_ops.conv2d(
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/ops/gen_nn_ops.py:969 conv2d
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/framework/op_def_library.py:748 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py:599 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:3557 _create_op_internal
        ret = Operation(
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:2041 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    /home/lawrence/miniconda3/envs/nlp/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:1883 _create_c_op
        raise ValueError(str(e))

    ValueError: Negative dimension size caused by subtracting 3 from 1 for '{{node sequential_1/conv1d_5/conv1d}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], explicit_paddings=[], padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true](sequential_1/conv1d_5/conv1d/ExpandDims, sequential_1/conv1d_5/conv1d/ExpandDims_1)' with input shapes: [?,1,1,100], [1,3,100,250].


## CREATE A PIPELINE TO REMOVE PUNCTUATIONS, STOPWORDS AND PERFORM COUNT VECTORIZATION

In [9]:
def message_cleaning(message):
    Test_punc_removed = [char for char in message if char not in string.punctuation]
    Test_punc_removed_join = ''.join(Test_punc_removed)
    Test_punc_removed_join_clean = [word for word in Test_punc_removed_join.split() if word.lower() not in stopwords.words('english')]
    return Test_punc_removed_join_clean

In [10]:
tweets_countvectorizer = CountVectorizer(analyzer = message_cleaning, dtype = 'uint8').fit_transform(df_model['TITLE']).toarray()

In [11]:
X = tweets_countvectorizer

In [12]:
y = df_model['TOPIC']

## TRAIN A NAIVE BAYES CLASSIFIER MODEL

In [13]:
X.shape

(7917, 10611)

In [14]:
y.shape

(7917,)

In [17]:
X_train, X_test, y_train, y_test  = train_test_split(X,
                                                     y,
                                                     test_size=0.2,
                                                     random_state=42)

In [18]:
del X
del y

In [20]:
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

MultinomialNB()

## ASSESS TRAINED MODEL PERFORMANCE 

In [21]:
# Predicting the Test set results
y_predict_test = NB_classifier.predict(X_test)

In [27]:
accuracy_score(y_test, y_predict_test)

0.8712121212121212

In [28]:
balanced_accuracy_score(y_test, y_predict_test)

0.8708694753052063

In [32]:
f1_score(y_test, y_predict_test, average='weighted')

0.8726671988362377