In [None]:
!pip3 install transformers sentencepiece hazm clean-text[gpl]
!pip install pyyaml==5.4.1

In [None]:
!gdown 1D3yt99D0GcCRCbdKbUQGxbqjkeh91hTg

In [None]:
!unrar x hamshahri.rar
!cp /content/hamshahriold/Corpus/Hamshahri-Categories.txt /content/
!unzip /content/hamshahriold/Corpus/Hamshahri-Corpus.zip
!unzip /content/hamshahriold/Corpus/PersianStopWords.zip

In [None]:
import os
import numpy as np
import pandas as pd
import hazm

import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from tqdm.notebook import tqdm
from transformers import AutoConfig, AutoTokenizer, TFAutoModel, AutoModel, DataCollatorWithPadding


# Save data to csv file

In [None]:
# [[DID value, Date value, CAT, text]]
corpus = []
tmp_text = " "
tmp_values = []
c = 0
with open('Hamshahri-Corpus.txt', "rb") as file:
  for line in file:
    line = line.decode("UTF-8")
    if ".DID" in line:
      # some news are abnormal lenght and they are low in number(about 1000)
      if len(tmp_text.split(' ')) < 2500:
        tmp_values.append(tmp_text)
        corpus.append(tmp_values)
      tmp_text = ""
      tmp_values = []
      tmp_values.append(line.replace(".DID\t", "").replace("\r\n",""))
    elif ".Date" in line:
      tmp_values.append(line.replace(".Date\t", "").replace("\r\n","").replace("\\", "/"))
    elif ".Cat" in line:
      tmp_values.append(line.replace(".Cat\t", "").replace("\r\n",""))
    else:
      tmp_text += (line.strip() + " ")
corpus.pop(0)
len(corpus)

In [None]:
df = pd.DataFrame(corpus, columns=['DID', 'date', 'cat', 'text'])
df

In [None]:
df.to_csv("dataset.csv", date_format='%Y%m%d')

#preprocessing

### stopwords

In [None]:
df = df[['text', 'cat']]

In [None]:
# stop word
stop_words_list = []
with open('PersianStopWords.txt', "rb") as file:
  for line in file:
    stop_words_list.append(line.decode("UTF-8").replace('\r\n', ""))

for idx, txt in enumerate(df["text"]):
  word_tokenized =  hazm.word_tokenize(txt)
  cps = ""
  for word in word_tokenized:
    if word not in stop_words_list:
      cps += word + " "
      
  df.loc[idx].at['text'] = cps
  if idx % 30000 == 0:
    print(idx, "numbers cleaned")

### Normalization
The text have different lengths based on words! Detecting the most normal range could help us find the maximum length of the sequences for the preprocessing step

In [None]:
# calculate the length of text based on their words
df['text_len_by_words'] = df['text'].apply(lambda t: len(hazm.word_tokenize(t)))
min_max_len = df["text_len_by_words"].min(), df["text_len_by_words"].max()
print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')

In [None]:
def data_gl_than(data, less_than=100.0, greater_than=0.0, col='text_len_by_words'):
    data_length = data[col].values
    data_glt = sum([1 for length in data_length if greater_than < length <= less_than])
    data_glt_rate = (data_glt / len(data_length)) * 100
    print(f'Texts with word length of greater than {greater_than} and less than {less_than} includes {data_glt_rate:.2f}% of the whole!')

In [None]:
minlim, maxlim = 10, 1000
data_gl_than(df, maxlim, minlim)

In [None]:
# remove text with the length of fewer than minlim words and more than maxlim
df['text_len_by_words'] = df['text_len_by_words'].apply(lambda len_t: len_t if minlim <= len_t else None)
df = df.dropna(subset=['text_len_by_words'])
df = df.reset_index(drop=True)

In [None]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=df['text_len_by_words']
))

fig.update_layout(
    title_text='Distribution of word counts within text',
    xaxis_title_text='Word Count',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

In [None]:
int(np.mean(df['text_len_by_words']))

In [None]:
fig = go.Figure()

groupby_cat = df.groupby('cat')['cat'].count()

fig.add_trace(go.Bar(
    x=list(groupby_cat.index),
    y=groupby_cat.tolist(),
    text=groupby_cat.tolist(),
    textposition='auto'
))

fig.update_layout(
    title_text='Distribution of rate within text',
    xaxis_title_text='Rate',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

### balance data which their cats are under 1000 instances

In [None]:
group_cats = list(groupby_cat.index)
group_values = list(groupby_cat.values)
remove_cats = []
for idx, cat in enumerate(group_cats):
  if group_values[idx] < 1000:
    remove_cats.append(cat)

In [None]:
df['cat'] = df['cat'].apply(lambda cat: None if cat in remove_cats else cat)
df = df.dropna(subset=['cat'])
df = df.reset_index(drop=True)

In [None]:
labels = list(sorted(df['cat'].unique()))
print(f'We have #{len(labels)}: {labels}')

### Train,Test split

In [None]:
df['cat_id'] = df['cat'].apply(lambda t: labels.index(t))
train, test = train_test_split(df, test_size=0.1, random_state=1, stratify=df['cat'])
train, valid = train_test_split(train, test_size=0.1, random_state=1, stratify=train['cat'])

train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

x_train, y_train = train['text'].values.tolist(), train['cat_id'].values.tolist()
x_valid, y_valid = valid['text'].values.tolist(), valid['cat_id'].values.tolist()
x_test, y_test = test['text'].values.tolist(), test['cat_id'].values.tolist()

print(len(x_train))
print(len(x_valid))
print(len(x_test))

#Model loading
the BERT model input is a combination of 3 embeddings.
- Token embeddings: WordPiece token vocabulary (WordPiece is another word segmentation algorithm, similar to BPE)
- Segment embeddings: for pair sentences [A-B] marked as $E_A$ or $E_B$ mean that it belongs to the first sentence or the second one.
- Position embeddings: specify the position of words in a sentence

In [None]:
from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification, BertForSequenceClassification
from transformers import glue_convert_examples_to_features

import tensorflow as tf

In [None]:
# general config
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

EPOCHS = 3
EEVERY_EPOCH = 1000
LEARNING_RATE = 5e-3
CLIP = 0.0

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/news_classification.bin'

os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
label2id = {label: i for i, label in enumerate(labels)}
id2label = {v: k for k, v in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

In [None]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

#Input Embeddings / Dataset

In [None]:
class InputExample:
    """ A single example for simple sequence classification. """

    def __init__(self, guid, text_a, text_b=None, label=None):
        """ Constructs a InputExample. """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


def make_examples(tokenizer, x, y=None, maxlen=MAX_LEN, output_mode="classification", is_tf_dataset=True):
    examples = []
    y = y if isinstance(y, list) or isinstance(y, np.ndarray) else [None] * len(x)

    for i, (_x, _y) in tqdm(enumerate(zip(x, y)), position=0, total=len(x)):
        guid = "%s" % i
        label = int(_y)
        
        if isinstance(_x, str):
            text_a = _x
            text_b = None
        else:
            assert len(_x) == 2
            text_a = _x[0]
            text_b = _x[1]
        
        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    
    features = glue_convert_examples_to_features(
        examples, 
        tokenizer, 
        maxlen, 
        output_mode=output_mode, 
        label_list=list(np.unique(y)))

    all_input_ids = []
    all_attention_masks = []
    all_token_type_ids = []
    all_labels = []

    for f in tqdm(features, position=0, total=len(examples)):
        if is_tf_dataset:
            all_input_ids.append(tf.constant(f.input_ids))
            all_attention_masks.append(tf.constant(f.attention_mask))
            all_token_type_ids.append(tf.constant(f.token_type_ids))
            all_labels.append(tf.constant(f.label))
        else:
            all_input_ids.append(f.input_ids)
            all_attention_masks.append(f.attention_mask)
            all_token_type_ids.append(f.token_type_ids)
            all_labels.append(f.label)

    if is_tf_dataset:
        dataset = tf.data.Dataset.from_tensor_slices(({
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'token_type_ids': all_token_type_ids
        }, all_labels))

        return dataset, features
    
    xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
    ydata = all_labels

    return [xdata, ydata], features

In [None]:
train_dataset_base, train_examples = make_examples(tokenizer, x_train, y_train, maxlen=MAX_LEN)
valid_dataset_base, valid_examples = make_examples(tokenizer, x_valid, y_valid, maxlen=MAX_LEN)

test_dataset_base, test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN)
[xtest, ytest], test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN, is_tf_dataset=False)

In [None]:
for value in train_dataset_base.take(1):
    print(f'     input_ids: {value[0]["input_ids"]}')
    print(f'attention_mask: {value[0]["attention_mask"]}')
    print(f'token_type_ids: {value[0]["token_type_ids"]}')
    print(f'        target: {value[1]}')

In [None]:
def get_training_dataset(dataset, batch_size, buffer_size):
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(batch_size)

    return dataset

def get_validation_dataset(dataset, batch_size):
    dataset = dataset.batch(batch_size)

    return dataset

In [None]:
train_dataset = get_training_dataset(train_dataset_base, TRAIN_BATCH_SIZE, len(train_examples))
valid_dataset = get_validation_dataset(valid_dataset_base, VALID_BATCH_SIZE)

train_steps = len(train_examples) // TRAIN_BATCH_SIZE
valid_steps = len(valid_examples) // VALID_BATCH_SIZE

train_steps, valid_steps

## model

In [None]:
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH, **{
        'label2id': label2id,
        'id2label': id2label,
    })
print(config.to_json_string())

In [None]:
from transformers import TFBertModel, TFBertForSequenceClassification, BertForSequenceClassification
from transformers import TFAutoModel, AutoModel, AutoModelForTokenClassification


In [None]:
def build_model(model_name, config, learning_rate=3e-5):
    model = TFBertForSequenceClassification.from_pretrained(model_name, config=config)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    return model

In [None]:
model = build_model(MODEL_NAME_OR_PATH, config, learning_rate=LEARNING_RATE)

In [None]:
%%time

r = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    verbose=1)

final_accuracy = r.history['val_accuracy']
print('FINAL ACCURACY MEAN: ', np.mean(final_accuracy))

In [None]:
# save the model

model.save_pretrained(os.path.dirname(OUTPUT_PATH))

## Evaluation / Prediction

In [None]:
ev = model.evaluate(test_dataset_base.batch(TEST_BATCH_SIZE))
print()
print(f'Evaluation: {ev}')
print()

predictions = model.predict(xtest)
ypred = predictions[0].argmax(axis=-1).tolist()

print()
print(classification_report(ytest, ypred, target_names=labels))
print()

print(f'F1: {f1_score(ytest, ypred, average="weighted")}')

#Model loading
the BERT model input is a combination of 3 embeddings.
- Token embeddings: WordPiece token vocabulary (WordPiece is another word segmentation algorithm, similar to BPE)
- Segment embeddings: for pair sentences [A-B] marked as $E_A$ or $E_B$ mean that it belongs to the first sentence or the second one.
- Position embeddings: specify the position of words in a sentence

In [21]:
from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification, BertForSequenceClassification
from transformers import glue_convert_examples_to_features

import tensorflow as tf

In [27]:
# general config
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

EPOCHS = 3
EEVERY_EPOCH = 1000
LEARNING_RATE = 5e-3
CLIP = 0.0

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/news_classification.bin'

os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [28]:
label2id = {label: i for i, label in enumerate(labels)}
id2label = {v: k for k, v in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'adabh': 0, 'aeqts': 1, 'akhar': 2, 'ejtem': 3, 'elmfa': 4, 'elmif': 5, 'eqtes': 6, 'gozar': 7, 'gungn': 8, 'havad': 9, 'jvarz': 10, 'kharj': 11, 'maqal': 12, 'nnaft': 13, 'polig': 14, 'shahr': 15, 'shari': 16, 'shrst': 17, 'siasi': 18, 'soxan': 19, 'vrzsh': 20}
id2label: {0: 'adabh', 1: 'aeqts', 2: 'akhar', 3: 'ejtem', 4: 'elmfa', 5: 'elmif', 6: 'eqtes', 7: 'gozar', 8: 'gungn', 9: 'havad', 10: 'jvarz', 11: 'kharj', 12: 'maqal', 13: 'nnaft', 14: 'polig', 15: 'shahr', 16: 'shari', 17: 'shrst', 18: 'siasi', 19: 'soxan', 20: 'vrzsh'}


In [24]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

Downloading:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440 [00:00<?, ?B/s]

#Input Embeddings / Dataset

In [29]:
class InputExample:
    """ A single example for simple sequence classification. """

    def __init__(self, guid, text_a, text_b=None, label=None):
        """ Constructs a InputExample. """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


def make_examples(tokenizer, x, y=None, maxlen=MAX_LEN, output_mode="classification", is_tf_dataset=True):
    examples = []
    y = y if isinstance(y, list) or isinstance(y, np.ndarray) else [None] * len(x)

    for i, (_x, _y) in tqdm(enumerate(zip(x, y)), position=0, total=len(x)):
        guid = "%s" % i
        label = int(_y)
        
        if isinstance(_x, str):
            text_a = _x
            text_b = None
        else:
            assert len(_x) == 2
            text_a = _x[0]
            text_b = _x[1]
        
        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    
    features = glue_convert_examples_to_features(
        examples, 
        tokenizer, 
        maxlen, 
        output_mode=output_mode, 
        label_list=list(np.unique(y)))

    all_input_ids = []
    all_attention_masks = []
    all_token_type_ids = []
    all_labels = []

    for f in tqdm(features, position=0, total=len(examples)):
        if is_tf_dataset:
            all_input_ids.append(tf.constant(f.input_ids))
            all_attention_masks.append(tf.constant(f.attention_mask))
            all_token_type_ids.append(tf.constant(f.token_type_ids))
            all_labels.append(tf.constant(f.label))
        else:
            all_input_ids.append(f.input_ids)
            all_attention_masks.append(f.attention_mask)
            all_token_type_ids.append(f.token_type_ids)
            all_labels.append(f.label)

    if is_tf_dataset:
        dataset = tf.data.Dataset.from_tensor_slices(({
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'token_type_ids': all_token_type_ids
        }, all_labels))

        return dataset, features
    
    xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
    ydata = all_labels

    return [xdata, ydata], features

In [30]:
train_dataset_base, train_examples = make_examples(tokenizer, x_train, y_train, maxlen=MAX_LEN)
valid_dataset_base, valid_examples = make_examples(tokenizer, x_valid, y_valid, maxlen=MAX_LEN)

test_dataset_base, test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN)
[xtest, ytest], test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN, is_tf_dataset=False)

  0%|          | 0/14500 [00:00<?, ?it/s]

In [31]:
for value in train_dataset_base.take(1):
    print(f'     input_ids: {value[0]["input_ids"]}')
    print(f'attention_mask: {value[0]["attention_mask"]}')
    print(f'token_type_ids: {value[0]["token_type_ids"]}')
    print(f'        target: {value[1]}')

     input_ids: [    2     1  3229  2038  4906  7590  4963  2015  3501  2038     1  3912
  5388  2038  2038  4443 27490  2038  4073 32694  3483     1  2897  5662
  4115 27490  2011 10892  2956  4963  2015  3501  2038     1  3912  8568
  3414  4906     1     1  5045  3381  2038  3326     1 14213  3148  2038
  3764     1  3589     1     1  3543 11373 43166  5486  2038  3127 19682
  3298  5388  2038  2038 20399  5301  2822 27490  2809  2038  3381  5301
  2876 19006  4029  5985  4443 27490  2038  9923  2783  2976 27490 20749
  3350  4863     1  5168 27490  5388  2038  2038 14489     1  3229  2038
  4443 27490  2038  3434  4676  2038  2897     1  3229  2038     1  2991
  3421  3733  3764     1  3470     1 14213  4004  3740 27490 54492  3127
 19682  3298  5388  2038  2038  5824  9682     4]
attention_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [74]:
def get_training_dataset(dataset, batch_size, buffer_size):
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(batch_size)

    return dataset

def get_validation_dataset(dataset, batch_size):
    dataset = dataset.batch(batch_size)

    return dataset

In [75]:
train_dataset = get_training_dataset(train_dataset_base, TRAIN_BATCH_SIZE, len(train_examples))
valid_dataset = get_validation_dataset(valid_dataset_base, VALID_BATCH_SIZE)

train_steps = len(train_examples) // TRAIN_BATCH_SIZE
valid_steps = len(valid_examples) // VALID_BATCH_SIZE

train_steps, valid_steps

(3670, 407)

## model

In [None]:
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH, **{
        'label2id': label2id,
        'id2label': id2label,
    })
print(config.to_json_string())

In [35]:
from transformers import TFBertModel, TFBertForSequenceClassification, BertForSequenceClassification
from transformers import TFAutoModel, AutoModel, AutoModelForTokenClassification


In [36]:
def build_model(model_name, config, learning_rate=3e-5):
    model = TFBertForSequenceClassification.from_pretrained(model_name, config=config)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    return model

In [None]:
model = build_model(MODEL_NAME_OR_PATH, config, learning_rate=LEARNING_RATE)

In [None]:
%%time

r = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    verbose=1)

final_accuracy = r.history['val_accuracy']
print('FINAL ACCURACY MEAN: ', np.mean(final_accuracy))

In [None]:
# save the model

model.save_pretrained(os.path.dirname(OUTPUT_PATH))

## Evaluation / Prediction

In [None]:
ev = model.evaluate(test_dataset_base.batch(TEST_BATCH_SIZE))
print()
print(f'Evaluation: {ev}')
print()

predictions = model.predict(xtest)
ypred = predictions[0].argmax(axis=-1).tolist()

print()
print(classification_report(ytest, ypred, target_names=labels))
print()

print(f'F1: {f1_score(ytest, ypred, average="weighted")}')