In [37]:
import numpy as np
import tensorflow_datasets as tfds
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [38]:
# data_set_url = '/home/ireshr/PageroLabs/Full_dataset/secure_archive/dataset.csv'
data_set_url = '/home/ireshr/PageroLabs/11-1/dataset_acrhive/processed-dataset.csv'

# Read the CSV with no header row (header=None) and coerce every cell to
# text, all in one chained expression.
b_data = pd.read_csv(data_set_url, lineterminator='\n', header=None).astype(str)

In [39]:
# Sample-count setting; none of the cells shown in this notebook read it.
MAX_SAMPLES = 10_000

### Fetch Data

In [40]:
# Skip the first row of each column (presumably the CSV header, since the
# file was read with header=None) and keep the rest as plain Python lists.
subject_array = b_data[2].iloc[1:].tolist()
description_array = b_data[3].iloc[1:].tolist()
teams_array = b_data[4].iloc[1:].tolist()

In [41]:
# Eyeball the raw team labels.
# NOTE(review): this echoes the entire list; a slice such as teams_array[:10]
# would keep the saved output short.
teams_array

['2nd Line - PO & Pay',
 'Integration Analysts',
 '1st Line DE - HBS',
 '3rd Line R&D - HBS',
 '1st Line DE - HBS',
 'Projects SE - TMS',
 'Projects SE - TMS',
 '3rd Line - Pay',
 '3rd Line R&D - HBS',
 '3rd Line R&D - HBS',
 '2nd Line DE - HBS',
 '3rd Line R&D - HBS',
 'Scrooge',
 'Integration Analysts',
 '3rd Line R&D - HBS',
 'Projects DE - HBS',
 'Integration Consultants – Managed Service',
 '3rd Line R&D - HBS',
 'Administrators',
 '3rd Line - Pay',
 '3rd Line - Pay',
 '3rd Line - Pay',
 'Product Owner Pay',
 '1st Line DE - HBS',
 'Data Capturing DE - HBS',
 '3rd Line - Pay',
 '3rd Line R&D - HBS',
 '3rd Line - Pay',
 'Projects DE - HBS',
 'Projects SE - TMS',
 '3rd Line R&D - HBS',
 '3rd Line R&D - HBS',
 '3rd Line R&D - HBS',
 'Administrators',
 'Interop',
 'Administrators',
 'Projects DE - HBS',
 '3rd Line R&D - HBS',
 'Pagero Eloomi - Zendesk course',
 'Eagle',
 'Light Agents',
 'Pagero Support Web Admin',
 '3rd Line Integration - HBS',
 'Eagle',
 'Team Lead and Management',
 

In [42]:
# Number of team labels read from the CSV (first row excluded).
len(teams_array)

9998

### Preprocess

In [43]:
import re

def preprocess_sentence(text):
    """Reduce raw text to space-separated alphabetic words.

    Runs of 21 or more non-whitespace characters are blanked out first. After
    that, only whole words made solely of ASCII letters or ä/ö/å (either case)
    are kept. Words that also contain digits, underscores or other letters are
    dropped entirely, because a match must start and end on a word boundary.

    Args:
        text: The string to clean.

    Returns:
        The surviving words joined by single spaces; '' when none survive.
    """
    shortened = re.sub(r"\S{21,}", " ", text)
    words = re.findall(r'\b[a-zA-ZäöåÄÖÅ]+\b', shortened)
    # Joining with one space already yields trimmed, single-spaced text, so
    # no further whitespace normalisation is required.
    return ' '.join(words)

In [44]:
def preprocess_sentences(sentences):
    """Apply ``preprocess_sentence`` to every item and collect the results.

    Args:
        sentences: Iterable of strings.

    Returns:
        A new list holding one cleaned string per input item, in input order.
    """
    return [preprocess_sentence(sentence) for sentence in sentences]

In [45]:
# Clean every subject; the name is rebound to the cleaned list.
subject_array = preprocess_sentences(subject_array)

In [46]:
# Clean every description; the name is rebound to the cleaned list.
description_array = preprocess_sentences(description_array)

In [47]:
# Marker strings that are added in front of the tokenizer's training corpus.
special_token = [
    '<PAD>',
    '<EOS>',
    '<OUT>',
    '<SOS>',
    '<ST>',
]

In [48]:
# Learn the subword vocabulary from the marker strings plus every cleaned
# subject and description.
# NOTE(review): the markers are supplied as ordinary corpus lines here;
# build_from_corpus also accepts a reserved_tokens argument — confirm which
# behaviour is intended.
training_corpus = special_token + subject_array + description_array
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    training_corpus, target_vocab_size=2**18)

In [49]:
# Vocabulary size reported by the tokenizer.
# NOTE(review): the start/end ids defined later in this notebook are
# tokenizer.vocab_size and tokenizer.vocab_size + 1, i.e. not below this
# value — confirm that whatever consumes VOCAB_SIZE leaves room for them.
VOCAB_SIZE = tokenizer.vocab_size

In [50]:
# Show the vocabulary size.
print(VOCAB_SIZE)

46994


In [51]:
# Fit the encoder on the team names so each distinct name gets an integer id.
# encoded_labels holds the id for every entry of teams_array.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(teams_array)

In [52]:
# Number of distinct team names the encoder was fitted on.
total_teams = len(label_encoder.classes_)

In [53]:
from datetime import  datetime
# 
# Timestamp; a later cell formats it into the dictionary file name.
now = datetime.now()
# tokenizer.save_to_file('./dictionary_'+ now.strftime("%m %d %Y, %H:%M:%S"))

In [54]:
# Sorted copy of the tokenizer's subwords.
subwords = sorted(tokenizer.subwords)

# Write the sorted subwords, one per line, to a timestamped file.
file_path = './dictionary_'+ now.strftime("%m %d %Y, %H:%M:%S")
with open(file_path, "w", encoding="utf-8") as file:
    file.writelines(subword + "\n" for subword in subwords)

In [55]:
# Extra ids starting at tokenizer.vocab_size, each wrapped in a one-element
# list so it can be concatenated with an encoded sequence.
START_TOKEN = [tokenizer.vocab_size]
END_TOKEN = [tokenizer.vocab_size + 1]
SUBJECT_TOKEN = [tokenizer.vocab_size + 2]

In [56]:
def concat_subject_and_description(subjects, descriptions):
    """Merge each subject/description pair into one marked-up string.

    Args:
        subjects: Iterable of subject strings.
        descriptions: Iterable of description strings, paired by position.

    Returns:
        A list of ``' <ST> ' + subject + ' <ST> ' + description`` strings.
        Pairing stops at the shorter of the two inputs.
    """
    return [
        ' <ST> ' + subject + ' <ST> ' + description
        for subject, description in zip(subjects, descriptions)
    ]

# One combined string per row: cleaned subject first, then cleaned description.
ticket_input = concat_subject_and_description(subject_array, description_array)

In [57]:
def maximum_input_size(inputs, encode=None):
    """Find the input with the largest token count.

    Args:
        inputs: Iterable of strings to measure.
        encode: Optional callable that maps a string to a sequence of tokens.
            When omitted, the notebook-level ``tokenizer.encode`` is used.

    Returns:
        A ``(length, text)`` tuple: the largest token count and the input
        that produced it. When several inputs tie, the last one is returned.
        For empty ``inputs`` the result is ``(0, '')``.
    """
    if encode is None:
        encode = tokenizer.encode

    longest = 0
    longest_text = ''
    for element in inputs:
        length = len(encode(element))
        # ">=" (not ">") keeps the last of several equally long inputs.
        if length >= longest:
            longest = length
            longest_text = element

    return longest, longest_text

# Largest token count over all inputs, plus the input that produced it.
MAX_LENGTH , sentence = maximum_input_size(ticket_input)


In [58]:
print(MAX_LENGTH)
# NOTE(review): re-running this cell adds 3 again, so run it exactly once
# after the cell above. Tokenization later wraps each sequence in a start and
# an end id, so at least 2 extra positions are needed.
MAX_LENGTH = MAX_LENGTH + 3
# print(sentence)

3737


In [59]:
# unique_teams = sorted(list(set(teams_array)))
# unique_teams

In [60]:
# Tokenize, filter and pad sentences
def tokenize_and_filter(inputs, outputs):
    """Encode inputs, map team names to ids, drop over-long rows and pad.

    Each input is encoded with the notebook-level ``tokenizer`` and wrapped in
    ``START_TOKEN`` / ``END_TOKEN``. Each team name is converted with
    ``label_encoder``. Rows whose wrapped sequence is longer than
    ``MAX_LENGTH`` are skipped together with their label.

    Args:
        inputs: Iterable of input strings.
        outputs: Iterable of team names, paired with ``inputs`` by position.

    Returns:
        A ``(padded_sequences, labels)`` tuple: the result of
        ``pad_sequences`` (post-padded to ``MAX_LENGTH``) and a list with the
        encoded team id of every kept row.
    """
    kept_sequences = []
    kept_labels = []

    for text, team_name in zip(inputs, outputs):
        token_ids = START_TOKEN + tokenizer.encode(text) + END_TOKEN
        team_id = label_encoder.transform([team_name])[0]

        # Rows that do not fit into the padded width are left out entirely.
        if len(token_ids) > MAX_LENGTH:
            continue

        kept_sequences.append(token_ids)
        kept_labels.append(team_id)

    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        kept_sequences, maxlen=MAX_LENGTH, padding='post')

    return padded_sequences, kept_labels


# questions: padded token-id sequences; answers: the matching encoded team ids.
questions, answers = tokenize_and_filter(ticket_input, teams_array)

In [61]:
# for (a,b) in zip(teams_array, answers):
#     print(a,b)

In [62]:
# Bundle of everything that gets pickled in the next cell.
data_to_save = dict(
    questions=questions,
    answers=answers,
    VOCAB_SIZE=VOCAB_SIZE,
    total_teams=total_teams,
)

In [63]:
import pickle
# Write the bundle to data1.pkl in the current working directory.
with open('data1.pkl', 'wb') as file:
    pickle.dump(data_to_save, file)

# Read the file straight back as a round-trip check.
# Only unpickle files you produced yourself: pickle.load can execute
# arbitrary code from an untrusted file.
with open('data1.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

In [64]:
# Pull the four entries back out of the reloaded bundle.
# Note that total_teams is rebound here to the value read from the file.
loaded_array1 = loaded_data['questions']
loaded_array2 = loaded_data['answers']
loaded_variable = loaded_data['VOCAB_SIZE']
total_teams = loaded_data['total_teams']

# Print the first question/answer pair and the two scalar values as a sanity check.
print("Loaded Array 1:", loaded_array1[0])
print("Loaded Array 2:", loaded_array2[0])
print("Loaded Variable:", loaded_variable)
print("total teams:", total_teams)

Loaded Array 1: [46994 46770 46798 ...     0     0     0]
Loaded Array 2: 8
Loaded Variable: 46994
total teams: 55
