### Important Notes for Using This Notebook

- Please set a device id
- Because of data privacy and GitHub file-size limits, the dataset and the pre-trained Word2Vec model are not included in the repository; you have to provide them yourself. <br>
	The pre-trained Turkish Word2Vec model can be downloaded from <a href="https://drive.google.com/drive/folders/1IBMTAGtZ4DakSCyAoA4j7Ch0Ft1aFoww">this address</a>.
- Please set file paths for checkpoint and model (checkpoint_file_path and model_name)
- To run predictions on your own data, independent of the training dataset, you have to load it yourself (see section 4.1 for an example that reads a text file)

## 1. LOAD LIBRARIES

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from gensim.models import KeyedVectors
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import itertools
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import pickle
import os
import sys

# Make the project's `src/` folder importable. It is looked up in the parent of the
# current working directory: "__file__" is a plain string here (notebooks do not define
# __file__), so dirname() yields "" and the path is resolved relative to the cwd.
parent_dir = os.path.abspath(os.path.join(os.path.dirname("__file__"), os.pardir))
source_path = parent_dir + '/src/'
sys.path.append(source_path)

# text preprocessing method (project module found through the path added above)
from text_preprocessing import preprocessing

# Index of the GPU this notebook should use, as listed by TensorFlow.
# 0 selects the first detected GPU; change it to target another device.
DEVICE_ID = 0 #### SET SPECIAL DEVICE ID
gpu_number = DEVICE_ID
gpus = tf.config.list_physical_devices('GPU')
if gpus:
	# fail with a clear message instead of a bare IndexError on a wrong device id
	if not -len(gpus) <= gpu_number < len(gpus):
		raise ValueError("DEVICE_ID={} is out of range: {} GPU(s) detected".format(gpu_number, len(gpus)))
	# expose only the selected GPU to TensorFlow
	tf.config.experimental.set_visible_devices(gpus[gpu_number], 'GPU')
	logical_gpus = tf.config.experimental.list_logical_devices('GPU')

## 2. Load Dataset and Word Vectors and Prepare Necessary objects for Modelling

The pre-trained Turkish Word2Vec model can be downloaded from https://drive.google.com/drive/folders/1IBMTAGtZ4DakSCyAoA4j7Ch0Ft1aFoww

In [2]:
# Project root = parent of the current working directory ("__file__" is a plain string
# because notebooks do not define __file__, so abspath() resolves it against the cwd).
main_dir = os.path.dirname(os.path.dirname(os.path.abspath("__file__")))

# Sub-folders used by the notebook; the trailing "/" is kept because later cells
# build file paths by plain string concatenation (e.g. data_dir + "file").
data_dir = os.path.join(main_dir, "data/")
models_dir = os.path.join(main_dir, "model/")
# Previously a chained assignment (`outputs_dir = models_dir = ...`) silently
# redirected models_dir to the outputs folder; the two are now independent.
outputs_dir = os.path.join(main_dir, "outputs/")

In [None]:
# Input files: both must be placed in data_dir by the user.
data_path = data_dir + "your_dataset.csv"
# Pre-trained Word2Vec model; it is read below in binary word2vec format.
# Placeholder name: replace it with the file you downloaded. It must not be
# "word_index.pkl": that name is used for the pickled word index loaded in the
# testing section, which cannot also be the binary word2vec file.
word_vectors_path = data_dir + "your_word2vec_model.bin"

In [3]:
# load the raw dataset and the pre-trained word vectors (binary word2vec format)
df = pd.read_csv(data_path)
word_vectors = KeyedVectors.load_word2vec_format(word_vectors_path, binary=True)

# we decided to remove two categories from the dataset that have less data than the others and we will solve them with regex
excluded_categories = ["Cumhurbaşkanlığı Kararnamesi", "Kanun Hükmünde Kararname"]
# .copy() makes the filtered frame independent of the original, so the column
# assignment below is unambiguous and does not raise SettingWithCopyWarning
df = df[~df["kategori"].isin(excluded_categories)].copy()

# if no preprocessing has been done, use preprocessing method
df["data_text"] = df["data_text"].apply(preprocessing)

In [None]:
# encode the category names as integer ids
le = LabelEncoder()
ids = le.fit_transform(df.kategori)
# category name -> integer id, in the encoder's class order
label_dict = dict(zip(le.classes_, range(len(le.classes_))))

# persist the mapping so the testing section can turn predicted ids back into category names
# (the file must be opened for binary *writing*, and pickle.dump needs the object as first argument)
with open(data_dir + "label_dict.pkl", "wb") as label_file:
    pickle.dump(label_dict, label_file)

<h4>INFO:</h4>
<b>chunk_size</b> is used for data augmentation: the first N (= chunk_size) words of each text are taken, filtered through the word index, and added as a new sample (row).
	
            text = filter_by_word_index(" ".join(text.split()[:chunk_size]))

<b>max_length/max_len</b> truncates each text to its first N words (0:N)

            text = " ".join(text.split()[:max_len])

Note: if max_len < chunk_size, chunk_size is disabled       

In [4]:
# Example settings, chosen arbitrarily: change them to compare performance at other values.
# In our evaluation over max_len, the best scores were observed at 64, 128 and 256.
MAX_LEN = 64
CHUNK_SIZE = 300

<i>DeepDataset module includes word_index, embedding_matrix, train, validation and test datasets for use in training</i>


In [None]:
from dataset import DeepDataset

# text_column must name the column that was preprocessed above (`data_text`);
# the labels come from the `kategori` column
dataset = DeepDataset(
    df,
    text_column="data_text",
    label_column="kategori",
    chunk_size=CHUNK_SIZE,
    word_vectors=word_vectors,
    max_len=MAX_LEN,
)
dataset.prepare_data()

#### Confusion Matrix Visualization Method

In [5]:
def plot_confusion_matrix(cm, classes, normalize=False, title=None, cmap=plt.cm.Blues):
	"""Draw a confusion matrix as an annotated heat map on a new 12x6 figure.

	Args:
		cm: 2-D confusion matrix; rows are plotted as true labels, columns as predicted labels.
		classes: sequence of class names used as tick labels on both axes.
		normalize: if True, every row is divided by its sum before plotting, so both
			the colours and the cell labels show row-wise fractions.
		title: optional figure title; nothing is drawn when it is None or empty.
		cmap: matplotlib colormap for the heat map.

	Returns:
		None. The figure is drawn on the current matplotlib state.
	"""
	cm = np.asarray(cm)

	# Normalise BEFORE drawing so the image and the printed cell values agree.
	if normalize:
		row_totals = cm.sum(axis=1, keepdims=True)
		# rows without any sample stay at 0 instead of producing NaN / divide warnings
		cm = np.divide(cm.astype('float'), row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
		print("Normalized confusion matrix")
	else:
		print('Confusion matrix, without normalization')

	plt.figure(figsize=(12,6))
	plt.imshow(cm, interpolation='nearest', cmap=cmap)
	if title:
		plt.title(title)
	plt.colorbar()
	tick_marks = np.arange(len(classes))
	plt.xticks(tick_marks, classes, rotation=90)
	plt.yticks(tick_marks, classes)

	# integer counts are printed as-is, fractions with two decimals
	cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
	# cells darker than half the maximum get white text for contrast
	thresh = cm.max() / 2.
	for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
		plt.text(j, i, format(cm[i, j], cell_format),
				horizontalalignment="center",
				color="white" if cm[i, j] > thresh else "black")

	plt.tight_layout()
	plt.ylabel('True label')
	plt.xlabel('Predicted label')

## 3. TRAINING    

In [None]:
# --- training configuration ---
EPOCHS = 10
BATCH_SIZE = 256 # decrease this value if you have insufficient GPU capacity

# switches for the training step
CLASS_WEIGHTS = True
EARLY_STOPPING = True

In [None]:
from modelling import BiLSTM_Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# create callbacks
# set file path to save for checkpoint
# only the best model so far (highest validation accuracy) is written to the checkpoint file
checkpoint = ModelCheckpoint(models_dir + "model_checkpoint.hdf5", monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
# stops training once val_accuracy has not improved by at least 0.001 for 9 consecutive epochs
earlystopping = EarlyStopping(monitor="val_accuracy", min_delta=0.001, patience=9, mode="max")

# honour the EARLY_STOPPING switch from the configuration cell instead of always enabling it
callbacks = [checkpoint]
if EARLY_STOPPING:
    callbacks.append(earlystopping)


In [None]:
# build model and start training
# pass the callbacks created above (checkpointing / early stopping); with callbacks=None
# they were built but never used
model = BiLSTM_Model(epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=callbacks)
model.train(dataset, class_weights=CLASS_WEIGHTS)

In [None]:
# write the trained model (the `model` attribute of the wrapper) to the models folder
saved_model_path = models_dir + "model.hdf5"
model.model.save(saved_model_path)

## 4. TESTING and RESULTS

In [None]:
# The predict method called here is the project wrapper's own method (modelling.py), not the tensorflow/keras one;
# for the classic tensorflow methods use the wrapped model, e.g. model.model.predict

# model.predict stores class probabilities and class values on the wrapper object
model.predict(dataset.test_data)

# values stored by the predict call above (presumably predicted class ids; verify against modelling.py)
predictions = model.predictions
# ground-truth labels of the test split prepared by DeepDataset
test_labels = dataset.test_labels

In [None]:
# show performance scores, labelled with the dataset's class names
class_names = list(dataset.label_dict.keys())
report = metrics.classification_report(test_labels, predictions, target_names=class_names)
print(report)

In [None]:
# and plot confusion matrix (raw counts, no title)
cm = confusion_matrix(test_labels, predictions)
class_names = list(dataset.label_dict.keys())
plot_confusion_matrix(cm, classes=class_names, title="")

#### 4.1. Predict Special Text OR Text List (From dataframe, json, file etc.)

In [17]:
def prediction(model, word_index, max_len, text_list, return_all=False):
	"""Predict the class id of one or more (already preprocessed) texts.

	Each text is split on whitespace and truncated to `max_len` words. Every word is
	replaced by its id from `word_index` at its own position; words missing from
	`word_index` and unused trailing positions stay 0.

	Args:
		model: object exposing `predict(batch)` that returns per-class scores,
			one row per text.
		word_index: mapping word -> integer id.
		max_len: sequence length the model expects.
		text_list: list of texts; a single string is treated as a one-element list.
		return_all: if False (default, original behaviour) only the class id of the
			first text is returned; if True an array with one class id per text.

	Returns:
		The predicted class id of the first text, or an array of class ids when
		`return_all` is True.

	Raises:
		ValueError: if `text_list` is empty.
	"""
	# iterating a bare string would tokenize it character by character
	if isinstance(text_list, str):
		text_list = [text_list]
	texts = list(text_list)
	if not texts:
		raise ValueError("text_list must contain at least one text")

	# fixed-size integer batch, pre-filled with the padding id 0
	tokenized = np.zeros((len(texts), max_len), dtype="int32")
	for row, text in enumerate(texts):
		for position, word in enumerate(text.split()[:max_len]):
			if word in word_index:
				tokenized[row, position] = word_index[word]

	class_probabilities = model.predict(tokenized)
	predicted_classes = np.argmax(class_probabilities, axis=1)
	return predicted_classes if return_all else predicted_classes[0]

In [None]:
# load the saved model from disk
# note: this rebinds `model`, which until here referred to the BiLSTM_Model wrapper
trained_model_path = models_dir + "model.hdf5"
model = load_model(trained_model_path)

In [None]:
# testing requires word index and tag dict loading
# pickle.load expects an open binary file object, not a path string.
# NOTE: unpickling can execute arbitrary code, so only load files you created yourself.
with open(data_dir + "word_index.pkl", "rb") as word_index_file:
    word_index = pickle.load(word_index_file)
with open(data_dir + "label_dict.pkl", "rb") as label_dict_file:
    label_dict = pickle.load(label_dict_file)

# swap keys and values to convert predictions into categories
reverse_label_dict = {v:k for k,v in label_dict.items()}

In [None]:
# upload sample text from a text file and preprocessing
# explicit UTF-8 so Turkish characters are decoded the same way on every platform
# (without it, open() falls back to the locale's default encoding)
with open(data_dir + "sample.txt", "r", encoding="utf-8") as f:
    sample_text = f.read()

# the file is no longer needed once its content is in memory
sample_text = preprocessing(sample_text)

In [None]:
# use [text] pattern for single text
# otherwise use text list
# assign max_len value the model uses
# the result gets its own name: assigning it to `prediction` would overwrite the
# function defined above and make this cell fail when it is run a second time
predicted_class_id = prediction(model, word_index, MAX_LEN, [sample_text])
print(reverse_label_dict[predicted_class_id])