In [59]:
import numpy as np
import tensorflow_datasets as tfds
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [60]:
# data_set_url = '/home/ireshr/PageroLabs/Full_dataset/secure_archive/dataset.csv'
data_set_url = '/home/ireshr/PageroLabs/11-1/dataset_acrhive/processed-dataset.csv'
b_data=pd.read_csv( data_set_url , lineterminator='\n' , header = None)
b_data=b_data.astype(str)

In [61]:
MAX_SAMPLES = 10000

### Fetch Data

In [62]:
subject_array = b_data[2][:].tolist()
description_array = b_data[3][:].tolist()
teams_array = b_data[4][:].tolist()
# teams = b_data[3][:10000]

In [63]:
len(teams_array)

9999

### Preprocess

In [64]:
import re

def preprocess_sentence(text):
    text = re.sub(r"\S{21,}", " ", text)
    text = ' '.join(re.findall(r'\b[a-zA-ZäöåÄÖÅ]+\b', text))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [65]:
def preprocess_sentences(sentences):
    new_sentences = []
    for sentence in sentences:
        new_sentences.append(preprocess_sentence(sentence))
    return new_sentences

In [66]:
subject_array = preprocess_sentences(subject_array)

In [67]:
description_array = preprocess_sentences(description_array)

In [68]:
special_token = ['<PAD>', '<EOS>', '<OUT>', '<SOS>', '<ST>']

In [69]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    special_token + subject_array + description_array, target_vocab_size=2**18)

In [70]:
VOCAB_SIZE = tokenizer.vocab_size

In [71]:
print(VOCAB_SIZE)

46995


In [72]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(teams_array)

In [73]:
from datetime import  datetime
# 
now = datetime.now()
# tokenizer.save_to_file('./dictionary_'+ now.strftime("%m %d %Y, %H:%M:%S"))

In [74]:
subwords = tokenizer.subwords

# Step 3: Sort the subwords
subwords.sort()

# Step 4: Save the sorted subwords to a file
file_path = './dictionary_'+ now.strftime("%m %d %Y, %H:%M:%S")
with open(file_path, "w", encoding="utf-8") as file:
    for subword in subwords:
        file.write(subword + "\n")

In [75]:
START_TOKEN, END_TOKEN , SUBJECT_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1] , [tokenizer.vocab_size + 2]

In [76]:
def concat_subject_and_description(subjects, descriptions):
    inputs = []
    for (subject, description) in zip(subjects, descriptions):
        input = ' <ST> '+ subject + ' <ST> ' + description
        inputs.append(input)
    return inputs

ticket_input = concat_subject_and_description(subject_array, description_array)

In [77]:
def maximum_input_size(inputs):
    max = len(tokenizer.encode(inputs[0]))
    sentence = ''
    for element in inputs:
        tokens = tokenizer.encode(element)
        if max > len(tokens):
            max = max
        else:
            max = len(tokens)
            sentence = element

    return max , sentence

MAX_LENGTH , sentence = maximum_input_size(ticket_input)


In [78]:
print(MAX_LENGTH)
MAX_LENGTH = MAX_LENGTH + 3
# print(sentence)

3737


In [79]:
# unique_teams = sorted(list(set(teams_array)))
# unique_teams

['1st & 2nd Line SE - TMS',
 '1st Line 24/7 - PO',
 '1st Line DE - HBS',
 '1st Line ES - PO',
 '1st Line FI - PO & HBS',
 '1st Line FR - PO',
 '1st Line PL - PO',
 '1st Line SE - PO & Pay',
 '2nd Line - PO & Pay',
 '2nd Line DE - HBS',
 '3rd Line - Pay',
 '3rd Line Integration - HBS',
 '3rd Line R&D - HBS',
 'Administrators',
 'Albatross',
 'Buzzard',
 'CI - HBS Projects',
 'Data Capturing DE - HBS',
 'Eagle',
 'Falcon',
 'Griffin',
 'Hawk',
 'Hippogriff',
 'Hummingbird',
 'Integration',
 'Integration Analysts',
 'Integration Consultants – Connectivity',
 'Integration Consultants – Freight',
 'Integration Consultants – Implementation',
 'Integration Consultants – Managed Service',
 'Integration Consultants – Solution',
 'Integration Consultants – USA',
 'Integration Core Specialists',
 'Interop',
 'Kingfisher',
 'Light Agents',
 'Owl',
 'Pagero Eloomi - Zendesk course',
 'Pagero Support Web Admin',
 'Peacock',
 'Penguin',
 'Phoenix',
 'Product Owner Pay',
 'Projects DE - HBS',
 'Projec

In [80]:
# Tokenize, filter and pad sentences
def tokenize_and_filter(inputs, outputs):
    tokenized_inputs,labeled_team = [], []

    for (sentence1, team) in zip(inputs, outputs):
        # tokenize sentence
        sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
        # sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN
        team = label_encoder.transform([team])
        # check tokenized sentence max length
        if len(sentence1) <= MAX_LENGTH:
            tokenized_inputs.append(sentence1)
            labeled_team.append(team)

    # pad tokenized sentences
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=MAX_LENGTH, padding='post')

    return tokenized_inputs, labeled_team


questions, answers = tokenize_and_filter(ticket_input, teams_array)

In [86]:
# for (a,b) in zip(teams_array, answers):
#     print(a,b)

group [55]
2nd Line - PO & Pay [8]
Integration Analysts [25]
1st Line DE - HBS [2]
3rd Line R&D - HBS [12]
1st Line DE - HBS [2]
Projects SE - TMS [46]
Projects SE - TMS [46]
3rd Line - Pay [10]
3rd Line R&D - HBS [12]
3rd Line R&D - HBS [12]
2nd Line DE - HBS [9]
3rd Line R&D - HBS [12]
Scrooge [49]
Integration Analysts [25]
3rd Line R&D - HBS [12]
Projects DE - HBS [43]
Integration Consultants – Managed Service [29]
3rd Line R&D - HBS [12]
Administrators [13]
3rd Line - Pay [10]
3rd Line - Pay [10]
3rd Line - Pay [10]
Product Owner Pay [42]
1st Line DE - HBS [2]
Data Capturing DE - HBS [17]
3rd Line - Pay [10]
3rd Line R&D - HBS [12]
3rd Line - Pay [10]
Projects DE - HBS [43]
Projects SE - TMS [46]
3rd Line R&D - HBS [12]
3rd Line R&D - HBS [12]
3rd Line R&D - HBS [12]
Administrators [13]
Interop [33]
Administrators [13]
Projects DE - HBS [43]
3rd Line R&D - HBS [12]
Pagero Eloomi - Zendesk course [37]
Eagle [18]
Light Agents [35]
Pagero Support Web Admin [38]
3rd Line Integration - 

In [None]:
import pickle
with open('data.pkl', 'wb') as file:
    pickle.dump(questions, file)
    pickle.dump(answers, file)
    pickle.dump(VOCAB_SIZE, file)