In [87]:
import numpy as np
import tensorflow_datasets as tfds
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [88]:
# data_set_url = '/home/ireshr/PageroLabs/Full_dataset/secure_archive/dataset.csv'
data_set_url = '/home/ireshr/PageroLabs/11-1/dataset_acrhive/processed-dataset.csv'
b_data=pd.read_csv( data_set_url , lineterminator='\n' , header = None)
b_data=b_data.astype(str)

In [89]:
MAX_SAMPLES = 10000

### Fetch Data

In [90]:
subject_array = b_data[2][:].tolist()
description_array = b_data[3][:].tolist()
teams_array = b_data[4][:].tolist()
# teams = b_data[3][:10000]

In [91]:
len(teams_array)

9999

### Preprocess

In [92]:
import re

def preprocess_sentence(text):
    text = re.sub(r"\S{21,}", " ", text)
    text = ' '.join(re.findall(r'\b[a-zA-ZäöåÄÖÅ]+\b', text))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [93]:
def preprocess_sentences(sentences):
    new_sentences = []
    for sentence in sentences:
        new_sentences.append(preprocess_sentence(sentence))
    return new_sentences

In [94]:
subject_array = preprocess_sentences(subject_array)

In [95]:
description_array = preprocess_sentences(description_array)

In [96]:
special_token = ['<PAD>', '<EOS>', '<OUT>', '<SOS>', '<ST>']

In [97]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    special_token + subject_array + description_array, target_vocab_size=2**18)

In [98]:
VOCAB_SIZE = tokenizer.vocab_size

In [99]:
print(VOCAB_SIZE)

46995


In [100]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(teams_array)

In [101]:
from datetime import  datetime
# 
now = datetime.now()
# tokenizer.save_to_file('./dictionary_'+ now.strftime("%m %d %Y, %H:%M:%S"))

In [102]:
subwords = tokenizer.subwords

# Step 3: Sort the subwords
subwords.sort()

# Step 4: Save the sorted subwords to a file
file_path = './dictionary_'+ now.strftime("%m %d %Y, %H:%M:%S")
with open(file_path, "w", encoding="utf-8") as file:
    for subword in subwords:
        file.write(subword + "\n")

In [103]:
START_TOKEN, END_TOKEN , SUBJECT_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1] , [tokenizer.vocab_size + 2]

In [104]:
def concat_subject_and_description(subjects, descriptions):
    inputs = []
    for (subject, description) in zip(subjects, descriptions):
        input = ' <ST> '+ subject + ' <ST> ' + description
        inputs.append(input)
    return inputs

ticket_input = concat_subject_and_description(subject_array, description_array)

In [105]:
def maximum_input_size(inputs):
    max = len(tokenizer.encode(inputs[0]))
    sentence = ''
    for element in inputs:
        tokens = tokenizer.encode(element)
        if max > len(tokens):
            max = max
        else:
            max = len(tokens)
            sentence = element

    return max , sentence

MAX_LENGTH , sentence = maximum_input_size(ticket_input)


In [106]:
print(MAX_LENGTH)
MAX_LENGTH = MAX_LENGTH + 3
# print(sentence)

3737


In [107]:
# unique_teams = sorted(list(set(teams_array)))
# unique_teams

In [108]:
# Tokenize, filter and pad sentences
def tokenize_and_filter(inputs, outputs):
    tokenized_inputs,labeled_team = [], []

    for (sentence1, team) in zip(inputs, outputs):
        # tokenize sentence
        sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
        # sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN
        team = label_encoder.transform([team])
        # check tokenized sentence max length
        if len(sentence1) <= MAX_LENGTH:
            tokenized_inputs.append(sentence1)
            labeled_team.append(team)

    # pad tokenized sentences
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=MAX_LENGTH, padding='post')

    return tokenized_inputs, labeled_team


questions, answers = tokenize_and_filter(ticket_input, teams_array)

In [109]:
# for (a,b) in zip(teams_array, answers):
#     print(a,b)

In [112]:
data_to_save = {
    'questions': questions,
    'answers': answers,
    'VOCAB_SIZE': VOCAB_SIZE
}

In [113]:
import pickle
with open('data.pkl', 'wb') as file:
    pickle.dump(data_to_save, file)

# Load the data from the pickle file
with open('data.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

In [114]:
loaded_array1 = loaded_data['questions']
loaded_array2 = loaded_data['answers']
loaded_variable = loaded_data['VOCAB_SIZE']

# Print the loaded arrays and variable
print("Loaded Array 1:", loaded_array1)
print("Loaded Array 2:", loaded_array2)
print("Loaded Variable:", loaded_variable)

Loaded Array 1: [[46995 46771 46799 ...     0     0     0]
 [46995 46771 46799 ...     0     0     0]
 [46995 46771 46799 ...     0     0     0]
 ...
 [46995 46771 46799 ...     0     0     0]
 [46995 46771 46799 ...     0     0     0]
 [46995 46771 46799 ...     0     0     0]]
Loaded Array 2: [array([55]), array([8]), array([25]), array([2]), array([12]), array([2]), array([46]), array([46]), array([10]), array([12]), array([12]), array([9]), array([12]), array([49]), array([25]), array([12]), array([43]), array([29]), array([12]), array([13]), array([10]), array([10]), array([10]), array([42]), array([2]), array([17]), array([10]), array([12]), array([10]), array([43]), array([46]), array([12]), array([12]), array([12]), array([13]), array([33]), array([13]), array([43]), array([12]), array([37]), array([18]), array([35]), array([38]), array([11]), array([18]), array([51]), array([12]), array([49]), array([18]), array([18]), array([37]), array([46]), array([47]), array([49]), array(