In [None]:
!pip3 install transformers sentencepiece hazm clean-text[gpl]
!pip install pyyaml==5.4.1

In [None]:
# Download a file by its ID using the gdown command-line tool.
# NOTE(review): presumably this fetches hamshahri.rar, which the next cell extracts — confirm.
!gdown 1D3yt99D0GcCRCbdKbUQGxbqjkeh91hTg

In [None]:
!unrar x hamshahri.rar
!cp /content/hamshahriold/Corpus/Hamshahri-Categories.txt /content/
!unzip /content/hamshahriold/Corpus/Hamshahri-Corpus.zip
!unzip /content/hamshahriold/Corpus/PersianStopWords.zip

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

import hazm
from transformers import AutoConfig, AutoTokenizer, TFAutoModel, AutoModel, DataCollatorWithPadding
import plotly.express as px
import plotly.graph_objects as go

# Parse the corpus and save it to a CSV file

In [None]:
# Parse the corpus file into rows of [DID value, Date value, CAT, text].
# Articles whose text reaches this many space-separated tokens are skipped.
MAX_ARTICLE_WORDS = 2500


def _append_article(rows, values, text):
  """Append `values + [text]` to `rows` unless `text` is abnormally long."""
  # some news are abnormally long and they are low in number (about 1000, per the original note)
  if len(text.split(' ')) < MAX_ARTICLE_WORDS:
    values.append(text)
    rows.append(values)


corpus = []
tmp_text = ""
tmp_values = None  # stays None until the first ".DID" line has been seen
c = 0  # not used in this cell
with open('Hamshahri-Corpus.txt', "rb") as file:
  for line in file:
    line = line.decode("UTF-8")
    if ".DID" in line:
      # A new header closes the article collected so far (if there is one).
      if tmp_values is not None:
        _append_article(corpus, tmp_values, tmp_text)
      tmp_text = ""
      tmp_values = [line.replace(".DID\t", "").replace("\r\n","")]
    elif tmp_values is None:
      # Anything before the first ".DID" line belongs to no article.
      continue
    elif ".Date" in line:
      tmp_values.append(line.replace(".Date\t", "").replace("\r\n","").replace("\\", "/"))
    elif ".Cat" in line:
      tmp_values.append(line.replace(".Cat\t", "").replace("\r\n",""))
    else:
      tmp_text += (line.strip() + " ")

# The final article is not followed by another ".DID" line, so store it explicitly;
# previously it was silently dropped.
if tmp_values is not None:
  _append_article(corpus, tmp_values, tmp_text)
len(corpus)

In [None]:
# Turn the parsed rows into a table; each row supplies the four columns in this order.
df = pd.DataFrame(
    data=corpus,
    columns=['DID', 'date', 'cat', 'text'],
)
df

In [None]:
# Write the parsed table (row index included) to dataset.csv.
# NOTE(review): date_format only affects datetime values; the 'date' column looks like
# plain strings here, so it presumably has no effect — confirm.
df.to_csv(
    path_or_buf="dataset.csv",
    date_format='%Y%m%d',
)

# Preprocessing

In [None]:
# Keep only the model input (text) and the label (cat).
# .copy() makes this an independent frame, so the column assignments in later cells
# modify it directly instead of triggering pandas' SettingWithCopyWarning.
df = df[['text', 'cat']].copy()

In [None]:
# Sanity check: look for articles longer than 2500 space-separated tokens.
x = 0  # token count of the last over-length article found
y = 0  # text of that article (stays 0 when none is found)
z = 0  # how many over-length articles there are
for txt in df["text"]:
  n_words = len(txt.split(" "))
  if n_words > 2500:
    z += 1
    x = n_words
    y = txt
# Only the final expression of a cell is displayed, so show both values together.
x, z

In [None]:
# Display the last over-length article captured by the scan in the previous cell
# (y is still 0 if no article exceeded the limit).
y

In [None]:
# stop word removal
# Read one stop word per line; strip() removes the line ending whether the file uses
# "\r\n" or "\n" (a leftover "\n" would stop the word from ever matching a token).
with open('PersianStopWords.txt', "rb") as file:
  stop_words_list = [line.decode("UTF-8").strip() for line in file]
# Set membership is O(1), which matters when testing every token of every article.
stop_words = set(stop_words_list)

cleaned_texts = []
for idx, txt in enumerate(df["text"]):
  kept_words = [word for word in hazm.word_tokenize(txt) if word not in stop_words]
  cleaned_texts.append(" ".join(kept_words))
  if idx % 30000 == 0:
    print(idx, "numbers cleaned")

# Assign the whole column at once. The previous `df.loc[idx].at['text'] = cps` wrote to a
# temporary row copy, so the cleaned text never reached the DataFrame.
df["text"] = cleaned_texts

### Normalization
The texts vary widely in length when measured in words. Identifying the typical range helps us choose the maximum sequence length for the preprocessing step.

In [None]:
# calculate the length of text based on their words
df['text_len_by_words'] = df['text'].apply(lambda t: len(hazm.word_tokenize(t)))
min_max_len = df["text_len_by_words"].min(), df["text_len_by_words"].max()
print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')

In [None]:
def data_gl_than(data, less_than=100.0, greater_than=0.0, col='text_len_by_words'):
    """Print the share of rows whose `col` value lies in (greater_than, less_than].

    Args:
        data: table-like object; `data[col]` must yield a pandas Series of numbers.
        less_than: upper bound of the range (inclusive).
        greater_than: lower bound of the range (exclusive).
        col: name of the column holding the lengths.

    Returns:
        None. The percentage is printed, formatted with two decimals.
    """
    data_length = data[col]
    total = len(data_length)
    if total == 0:
        # An empty table has no rows in any range; avoid dividing by zero.
        data_glt_rate = 0.0
    else:
        # Vectorized count of the rows inside the half-open range.
        in_range = (data_length > greater_than) & (data_length <= less_than)
        data_glt_rate = (in_range.sum() / total) * 100
    print(f'Texts with word length of greater than {greater_than} and less than {less_than} includes {data_glt_rate:.2f}% of the whole!')

In [None]:
# Word-count bounds: report how much of the data falls between them.
minlim = 10
maxlim = 1000
data_gl_than(df, less_than=maxlim, greater_than=minlim)

In [None]:
# Keep only the texts whose word count lies within [minlim, maxlim] (both bounds inclusive).
# Filtering with a boolean mask leaves the count column untouched; the previous approach
# overwrote out-of-range counts with None, which coerced the whole column to float
# whenever at least one row was out of range.
df = df[df['text_len_by_words'].between(minlim, maxlim)].reset_index(drop=True)

In [None]:
# Histogram of the per-text word counts stored in 'text_len_by_words'.
word_count_hist = go.Histogram(x=df['text_len_by_words'])
fig = go.Figure(data=[word_count_hist])

fig.update_layout(
    title_text='Distribution of word counts within text',
    xaxis_title_text='Word Count',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2,
)

fig.show()

In [None]:
# Number of texts per category, indexed by category label.
groupby_cat = df.groupby('cat')['cat'].count()
cat_counts = groupby_cat.tolist()

fig = go.Figure()

fig.add_trace(go.Bar(
    x=list(groupby_cat.index),
    y=cat_counts,
    text=cat_counts,
    textposition='auto'
))

# The chart plots categories, so label it accordingly (the labels used to say "rate").
fig.update_layout(
    title_text='Distribution of texts per category',
    xaxis_title_text='Category',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

### Balance the data: remove categories that have fewer than 1000 instances

In [None]:
# Collect the categories that have fewer than 1000 texts.
group_cats = list(groupby_cat.index)
group_values = list(groupby_cat.values)
remove_cats = [
  cat
  for cat, n_texts in zip(group_cats, group_values)
  if n_texts < 1000
]

In [None]:
# Drop every row whose category is listed in remove_cats (rows without a category
# are dropped as well), then renumber the index from zero.
has_kept_cat = df['cat'].notna() & ~df['cat'].isin(remove_cats)
df = df[has_kept_cat].reset_index(drop=True)

In [None]:
# Sorted list of the category labels that remain in the data.
remaining_cats = df['cat'].unique()
unique_cats = sorted(remaining_cats)
print(f'We have #{len(unique_cats)}: {unique_cats}')

## Train/validation/test split

In [None]:
# Encode each category as its position in unique_cats.
cat_to_id = {cat: position for position, cat in enumerate(unique_cats)}
df['cat_id'] = df['cat'].map(cat_to_id)

# Stratified split: hold out 10% as test, then 10% of the remainder as validation.
train_valid, test = train_test_split(df, test_size=0.1, random_state=1, stratify=df['cat'])
train, valid = train_test_split(
  train_valid, test_size=0.1, random_state=1, stratify=train_valid['cat']
)

# Renumber each split from zero.
train, valid, test = (part.reset_index(drop=True) for part in (train, valid, test))

# Plain Python lists of texts and integer labels for each split.
x_train, y_train = train['text'].tolist(), train['cat_id'].tolist()
x_valid, y_valid = valid['text'].tolist(), valid['cat_id'].tolist()
x_test, y_test = test['text'].tolist(), test['cat_id'].tolist()

for part in (train, valid, test):
  print(part.shape)