In [None]:
import pandas as pd
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import  TextClassificationPipeline

import tensorflow as tf
import json
import gc

from sklearn.model_selection import train_test_split
import re
import nltk

from nltk.corpus import stopwords
# Download the NLTK stop-word corpus so that `stopwords.words` can read it.
nltk.download('stopwords')

# English stop-word list.
# NOTE(review): not referenced by any cell shown here — confirm it is used later.
stopwrds = stopwords.words('english')

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from plotly.offline import iplot


In [None]:
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = "../input/bbc-text.csv"

# Later cells read the `text` and `category` columns from this frame.
df = pd.read_csv(DATA_PATH)

In [None]:
# Quick sanity check of the loaded frame: (n_rows, n_columns).
df.shape

### Histogram of word counts per text

In [None]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


# Store the per-row word count of the `text` column in a new `count` column.
df['count'] = df['text'].map(_word_count)
df.head(10)

In [None]:
# `sns.displot` is a figure-level function: it always creates its own figure,
# so a preceding `plt.figure(figsize=...)` only leaves an empty canvas behind
# and its size is ignored. Size the plot through `height`/`aspect` instead
# (height=8, aspect=1 gives the intended 8 x 8 inch figure).
word_count_grid = sns.displot(data=df, x='count', height=8, aspect=1)

# Show only the 0-1000 word range on the x-axis.
word_count_grid.set(xlim=(0, 1000))

# Trailing semicolon suppresses the FacetGrid repr in the cell output.
word_count_grid.set_xlabels("The num of words", fontsize=16);

In [None]:
# Number of rows per category (`value_counts` orders them most frequent first).
category_count = df['category'].value_counts()
# Category names in the same order as the counts; reused to label the bars below.
categories = category_count.index

categories

In [None]:
# One bar per category, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(11, 5))

sns.barplot(x=category_count.index, y=category_count, ax=ax)

# Write "<category>\n<count>" just below the top edge of every bar.
label_box = dict(boxstyle= 'round', facecolor='none', edgecolor='white', alpha=.05)
for idx, bar in enumerate(ax.patches):
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width()/2.0
    ax.annotate(
        f'{categories[idx]}\n{bar_top:.0f}',
        xy=(bar_centre, bar_top),
        xytext=(0, -25),  # shift the label 25 points down, into the bar
        textcoords='offset points',
        size=13,
        color='white',
        ha='center',
        va='center',
        bbox=label_box,
    )

ax.set_xlabel("Categories", size=15)
ax.set_ylabel("No of News", size=15)
plt.xticks(size=12)
ax.set_title("No of News by categories", size=18)

In [None]:
# Integer class id for every row, derived from the `category` column.
# Despite its name, `encoded_text` holds the encoded *label*, not the text.
df['encoded_text'] = pd.Categorical(df['category']).codes

In [None]:
# Show the category -> integer code mapping, one row per category.
# The original referenced `sort_values` without calling it, so the cell only
# displayed a bound-method repr instead of any data.
(
    df[['category', 'encoded_text']]
    .drop_duplicates()
    .sort_values('encoded_text')
    .reset_index(drop=True)
)

In [None]:
# Plain Python lists of the raw texts and their integer labels, in row order.
data_texts = df['text'].tolist()
data_labels = df['encoded_text'].tolist()

### Train / Validation / Test Split

In [None]:
# Same fixed seed for both splits so the partitions are reproducible.
_split_seed = 0

# First split: hold out 20% of the full data as the validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_texts,
    data_labels,
    test_size=0.2,
    random_state=_split_seed,
)

# Second split: carve a further 1% out of the remaining training portion as a
# small test set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.01,
    random_state=_split_seed,
)

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Identical tokenizer options for both splits.
_tokenize_kwargs = dict(truncation=True, padding=True)

train_encodings = tokenizer(train_texts, **_tokenize_kwargs)
val_encodings = tokenizer(val_texts, **_tokenize_kwargs)

In [None]:
def _to_tf_dataset(encodings, labels):
    """Pair tokenizer output (converted to a dict) with its labels as a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


train_datasets = _to_tf_dataset(train_encodings, train_labels)
val_datasets = _to_tf_dataset(val_encodings, val_labels)

### Fine-tuning with the TFTrainer class

In [None]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

# NOTE(review): TFTrainer is reported as deprecated in newer transformers
# releases in favour of Keras compile/fit — confirm the installed version
# still provides it.

# Output locations and hyper-parameters for the fine-tuning run.
training_args = TFTrainingArguments(
    # where results and logs are written
    output_dir='results',
    logging_dir="logs",
    # schedule and batch sizes
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # optimisation
    warmup_steps=500,
    weight_decay=1e-5,
    # NOTE(review): presumably only honoured when step-based evaluation is
    # enabled — confirm against the installed transformers version.
    eval_steps=100,
)

# Build the model inside the distribution strategy scope exposed by the
# training arguments.
with training_args.strategy.scope():
    trainer_model = TFDistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=5,  # assumes five distinct categories in the data — TODO confirm
    )

# Train on the training split and evaluate on the validation split.
trainer = TFTrainer(
    model=trainer_model,
    args=training_args,
    train_dataset=train_datasets,
    eval_dataset=val_datasets,
)

In [None]:
# Run fine-tuning with the datasets and arguments the trainer was built with.
trainer.train()

In [None]:
# Evaluate the model; the trainer was given `val_datasets` as its eval dataset.
trainer.evaluate()