In [None]:
##

### Importing necessary libraries and input files

In [None]:
# !pip install --upgrade transformers
# !pip install simpletransformers
# # memory footprint support libraries/code
# !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# !pip install gputil
# !pip install psutil
# !pip install humanize


In [None]:
import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# Diagnostics must not abort the notebook on a host without a visible GPU:
# indexing an empty list would raise IndexError, so fall back to None.
gpu = GPUs[0] if GPUs else None


def printm():
    """Print a memory snapshot: free host RAM, this process's RSS, and GPU memory.

    Reads the module-level ``gpu`` (the first device reported by GPUtil).
    When no GPU was detected, a single notice line replaces the GPU figures.
    """
    process = psutil.Process(os.getpid())
    # Two positional arguments on purpose: print() joins them with one space.
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    if gpu is None:
        print("GPU: none detected")
        return
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))


printm()


Gen RAM Free: 26.4 GB  |     Proc size: 110.6 MB
GPU RAM Free: 16280MB | Used: 0MB | Util   0% | Total     16280MB


In [None]:
import numpy as np
import pandas as pd
from google.colab import files
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')
import gc
from scipy.special import softmax
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import sklearn
from sklearn.metrics import log_loss
from sklearn.metrics import *
from sklearn.model_selection import *
import re
import random
import torch
pd.options.display.max_colwidth = 200

def seed_all(seed_value):
    """Seed every random number generator used in this notebook with one value.

    Seeds Python's ``random``, NumPy and PyTorch (CPU). When CUDA is available
    it additionally seeds the CUDA generators and switches cuDNN to
    deterministic mode with benchmarking disabled.

    Args:
        seed_value: Integer seed handed to each generator.

    Returns:
        None.
    """
    # CPU-side generators, seeded in a fixed order: Python, NumPy, PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    if not torch.cuda.is_available():
        return

    # GPU-side generators: the current device first, then all devices.
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Fixed algorithm selection instead of per-run autotuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


In [None]:
def _subject_plus_body(frame):
    """Join ``MailSubject`` and ``MailTextBody`` with one space, treating missing values as ''.

    Without the fillna, a missing subject or body makes the whole concatenation
    NaN, so the field that *is* present would be thrown away as well.
    """
    return frame['MailSubject'].fillna('') + ' ' + frame['MailTextBody'].fillna('')


# Both files are semicolon-separated UTF-8 CSVs.
train_df = pd.read_csv(r'/content/train.csv', sep=";", encoding='utf-8')
test_df = pd.read_csv(r'/content/test_reduced.csv', sep=";", encoding='utf-8')

# Single free-text column used as model input.
train_df['Text'] = _subject_plus_body(train_df)
test_df['Text'] = _subject_plus_body(test_df)

In [None]:
def clean_text(text):
    """Normalise a Series of raw e-mail text before tokenisation.

    Steps, in order: lower-case, turn line breaks and ``&nbsp;`` entities into
    a space, collapse every whitespace run to one space, strip both ends.

    Separators are replaced by a space rather than removed, so the words on
    either side of a line break or a double space are not glued together.
    ``regex=`` is always passed explicitly because the default of
    ``Series.str.replace`` differs between pandas versions, which changed what
    the patterns '\\r' and '\\n' matched.

    Args:
        text: pandas Series of strings (missing values stay missing).

    Returns:
        A new Series with the cleaned text; the input is not modified.
    """
    return (
        text
        .str.lower()
        # Line breaks as real control characters or as literal "\r" / "\n"
        # escape text (the original patterns meant one or the other depending
        # on the pandas version).
        .str.replace(r'\\r|\\n|[\r\n]', ' ', regex=True)
        .str.replace('&nbsp;', ' ', regex=False)
        # Collapse whitespace runs instead of deleting them.
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )


# Same cleaning for both splits so the model sees identically prepared text.
train_df['Text'] = clean_text(train_df['Text'])
test_df['Text'] = clean_text(test_df['Text'])

In [None]:
# Preview the first three training rows, including the derived `Text` column.
train_df.head(3)

Unnamed: 0.1,Unnamed: 0,Id,Impact,Urgency,IncidentType,ServiceProcessed,MailSubject,MailTextBody,ManualGroups,Text
0,3554,INC000010587669,4-Minor/Localized,2-High,Failure,EDA_S_BA_2FA,smart card blockiert MD5:2225e4a30a5b7e592ba34883a823ed4d,"lieber helpdesk sie haben mir den computer freigeschaltet, weil meine smartcard blockiert ist. ich bin nun in ausserholligen und möchte die smardcard gern deblockieren. was muss ich tun? bin jet...",,"smart card blockiert md5:2225e4a30a5b7e592ba34883a823ed4d lieber helpdesksie haben mir den computer freigeschaltet, weil meine smartcard blockiert ist. ich bin nun in ausserholligen und möchte die..."
1,3553,INC000010585556,4-Minor/Localized,4-Low,Service Request,EDA_S_Order Management,FW: Webcam MD5:f757a42a9bdb1dfd9c8427e588586ca8,"liebe kolleginnen und kollegen ich möchte für susanne caseri eine webcam für videokoferenzen bestellen, die man am bildschirm befestigen kann. mit bestem dank und vielen grüsse",,"fw: webcam md5:f757a42a9bdb1dfd9c8427e588586ca8 liebe kolleginnen und kollegenich möchte für susanne caseri eine webcam für videokoferenzen bestellen, die man am bildschirm befestigen kann.mit bes..."
2,3552,INC000010585519,4-Minor/Localized,4-Low,Failure,EDA_S_Peripheriegeräte,FW: IT Support heute Nachmittag MD5:bc594d6bfdea74a2e40ba31d41444025,"liebe kolleginnen und kollegen es handelt sich um eine sitzung der deza-direktion heute um 16.30 uhr im sitzungszimmer 0 1 2 3 . darf ich sie gem. untenstehendem mail bitten, dass jemand vor ort...",,fw: it support heute nachmittag md5:bc594d6bfdea74a2e40ba31d41444025 liebe kolleginnen und kollegenes handelt sich um eine sitzung der deza-direktion heute um 16.30 uhr im sitzungszimmer 0 1 2 3 ....


In [None]:
# Lookup table listing each service label (`Label_given`) with its integer class id (`labels`).
labels_df = pd.read_csv(r'/content/labels_to_classify.csv', encoding='utf-8')

# Forward mapping: service label name -> integer class id.
class_dict = {
    label_name: class_id
    for label_name, class_id in zip(
        labels_df['Label_given'].tolist(),
        labels_df['labels'].tolist(),
    )
}
labels_df

Unnamed: 0,Label_given,Count,labels
0,EDA_S_Order Management,224,0
1,EDA_S_BA_Mailbox,216,1
2,EDA_S_APS_OS_BasisSW,208,2
3,EDA_S_Mobile Kommunikation,201,3
4,EDA_ANW_SysP eDoc,173,4
5,EDA_S_APS_PC,149,5
6,EDA_S_BA_UCC_Benutzertelefonie,147,6
7,EDA_S_Netzdrucker,114,7
8,EDA_S_BA_2FA,100,8
9,EDA_S_APS_Monitor,99,9


In [None]:
# Reverse mapping (class id -> service label name), built from the rows whose id is below 34.
subdflab = labels_df.loc[labels_df['labels'] < 34]
class_dict_rev = {
    class_id: label_name
    for class_id, label_name in zip(
        subdflab['labels'].tolist(),
        subdflab['Label_given'].tolist(),
    )
}

# Id 34 is named by hand instead of being read from the table.
# NOTE(review): presumably several labels share id 34 and this picks one
# representative name — confirm against labels_to_classify.csv.
class_dict_rev[34] = 'EDA_ANW_at Honorarvertretung'

In [None]:
# Encode the service label of every training row as its integer class id.
train_df['target'] = train_df['ServiceProcessed'].map(class_dict)

# A service that is absent from class_dict maps to NaN, which would silently
# turn the column into floats and only fail later during training. Fail here
# with the offending values instead.
unmapped_services = train_df.loc[train_df['target'].isna(), 'ServiceProcessed'].unique()
if len(unmapped_services) > 0:
    raise ValueError(
        "ServiceProcessed values missing from class_dict: " + repr(list(unmapped_services))
    )

# The test set has no labels; a constant placeholder keeps the column layout identical.
test_df['target'] = 0

In [None]:
# Keep only the two columns the model consumes. `.copy()` makes these
# independent frames, so the assignments below do not write into a slice of
# the source frames (SettingWithCopyWarning).
train_data = train_df[['Text', 'target']].copy()
test_data = test_df[['Text', 'target']].copy()

train_data['Text'] = train_data['Text'].astype(str)
test_data['Text'] = test_data['Text'].astype(str)


from sklearn.model_selection import train_test_split

# Hold out 5% of the labelled data for validation. `random_state` is fixed
# (same value as the model's `manual_seed`) so the split, and therefore the
# reported metrics, are reproducible across runs.
# Note: `test_df_` is this validation split, not the unlabelled test set.
train_df_, test_df_ = train_test_split(train_data, test_size=0.05, random_state=2)

In [None]:
# Preview the first three rows of the two-column (Text, target) training frame.
train_data.head(3)

Unnamed: 0,Text,target
0,"smart card blockiert md5:2225e4a30a5b7e592ba34883a823ed4d lieber helpdesksie haben mir den computer freigeschaltet, weil meine smartcard blockiert ist. ich bin nun in ausserholligen und möchte die...",8
1,"fw: webcam md5:f757a42a9bdb1dfd9c8427e588586ca8 liebe kolleginnen und kollegenich möchte für susanne caseri eine webcam für videokoferenzen bestellen, die man am bildschirm befestigen kann.mit bes...",0
2,fw: it support heute nachmittag md5:bc594d6bfdea74a2e40ba31d41444025 liebe kolleginnen und kollegenes handelt sich um eine sitzung der deza-direktion heute um 16.30 uhr im sitzungszimmer 0 1 2 3 ....,19


In [None]:
# Number of training rows per class id, most frequent first.
train_data.target.value_counts()

0     224
1     216
2     208
3     201
4     173
5     149
6     147
7     114
8     100
9      99
10     90
11     83
12     80
34     80
13     76
14     76
15     75
16     75
17     73
18     69
19     64
20     57
21     54
22     44
23     43
24     34
25     29
26     22
28     15
27     15
29     14
30     13
31     12
32     11
33     10
Name: target, dtype: int64

In [None]:
# Fine-tuning configuration passed to simpletransformers.
# NOTE(review): the Text column is lower-cased during cleaning, while this
# checkpoint is cased and `do_lower_case` is False — confirm that is intended.
bert_train_args = {
    'train_batch_size': 8,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'fp16': False,
    'do_lower_case': False,
    'num_train_epochs': 4,
    'max_seq_length': 256,
    'regression': False,
    'manual_seed': 2,
    'learning_rate': 3e-5,
    'weight_decay': 0,
    'save_eval_checkpoints': False,
    'save_model_every_epoch': False,
    'silent': False,
}

# 35 output classes: the target ids run from 0 to 34.
model = ClassificationModel(
    "bert",
    "bert-base-german-cased",
    num_labels=35,
    use_cuda=True,
    args=bert_train_args,
)

# Train on the 95% split; the returned value is shown as the cell output.
model.train_model(train_df_)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2702.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 4'), FloatProgress(value=0.0, max=338.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 1 of 4'), FloatProgress(value=0.0, max=338.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 2 of 4'), FloatProgress(value=0.0, max=338.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 3 of 4'), FloatProgress(value=0.0, max=338.0), HTML(value='')))





(1352, 1.5827406841556173)

In [None]:
from sklearn.metrics import f1_score, accuracy_score


def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of ``preds`` against ``labels``.

    Args:
        labels: True class ids.
        preds: Predicted class ids, in the same order as ``labels``.

    Returns:
        The value of ``f1_score(labels, preds, average='micro')``.
    """
    micro_f1 = f1_score(labels, preds, average='micro')
    return micro_f1
    
# Evaluate on the held-out 5% split (`test_df_`). The keyword arguments add
# two extra metrics, reported under the keys 'f1' and 'acc'.
result, model_outputs, wrong_predictions = model.eval_model(test_df_, f1=f1_multiclass, acc=accuracy_score)

# Display the metrics dictionary as the cell output.
result

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=143.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=18.0), HTML(value='')))




{'acc': 0.6503496503496503,
 'eval_loss': 1.490555849340227,
 'f1': 0.6503496503496503,
 'mcc': 0.629936270219213}

In [None]:
# predict() is documented to take a plain Python list of strings. A pandas
# Series only works while its index happens to contain the label 0, so
# convert explicitly. Element 0 of the returned tuple holds the predicted
# class ids.
preds = model.predict(test_data['Text'].tolist())[0]
len(preds)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=712.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=89.0), HTML(value='')))




712

In [None]:
# Submission frame: the ticket Id next to the predicted class id (same row order as test_df).
output_df = test_df[['Id']].assign(Predicted_labels=preds)
output_df.head()

Unnamed: 0,Id,Predicted_labels
0,INC000009681999,5
1,INC000009004056,2
2,INC000009244944,5
3,INC000008891561,6
4,INC000009702828,5


In [None]:
# Translate predicted class ids back to service label names.
predicted_names = output_df['Predicted_labels'].map(class_dict_rev)
output_df['Predicted'] = predicted_names

# Share of test rows assigned to each predicted label.
output_df['Predicted'].value_counts() / len(output_df)

EDA_S_BA_Mailbox                       0.099719
EDA_S_Order Management                 0.092697
EDA_ANW_SysP eDoc                      0.082865
EDA_S_APS_OS_BasisSW                   0.081461
EDA_S_APS_PC                           0.066011
EDA_S_BA_UCC_Benutzertelefonie         0.060393
EDA_S_Mobile Kommunikation             0.057584
EDA_S_Netzdrucker                      0.049157
EDA_S_APS_Monitor                      0.044944
EDA_ANW_Intranet/Collaboration EDA     0.040730
EDA_S_Netzwerk Ausland                 0.033708
EDA_ANW_SAP Services                   0.032303
EDA_S_BA_2FA                           0.030899
EDA_S_APS_Peripherie                   0.029494
EDA_S_Betrieb Übermitttlungssysteme    0.028090
EDA_S_Zusätzliche Software             0.028090
EDA_S_BA_Account                       0.026685
EDA_S_BA_UCC_IVR                       0.026685
EDA_S_Raumbewirtschaftung              0.014045
EDA_S_Peripheriegeräte                 0.014045
EDA_ANW_CH@World (MOSS)                0

In [None]:
# Write the Id / predicted-label pairs to a CSV in the working directory, without the index column.
output_df[['Id','Predicted']].to_csv('sample_trial_german_bert.csv',index=False)

In [None]:
# Display the prediction frame (Id, predicted class id, predicted label name).
output_df

Unnamed: 0,Id,Predicted_labels,Predicted
0,INC000009681999,5,EDA_S_APS_PC
1,INC000009004056,2,EDA_S_APS_OS_BasisSW
2,INC000009244944,5,EDA_S_APS_PC
3,INC000008891561,6,EDA_S_BA_UCC_Benutzertelefonie
4,INC000009702828,5,EDA_S_APS_PC
...,...,...,...
707,INC000009347556,1,EDA_S_BA_Mailbox
708,INC000009476926,4,EDA_ANW_SysP eDoc
709,INC000009846600,6,EDA_S_BA_UCC_Benutzertelefonie
710,INC000009836885,0,EDA_S_Order Management
