## Install Necessary packages 

In [1]:
# !pip install --upgrade transformers
# !pip install simpletransformers
# # memory footprint support libraries/code
# !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# !pip install gputil
# !pip install psutil
# !pip install humanize
# !pip install sentence_transformers



In [1]:
import psutil
import humanize
import os
import GPUtil as GPU

# Module-level handles kept for any later cell that reads them.
# `gpu` is None on a runtime without a visible GPU instead of raising IndexError.
GPUs = GPU.getGPUs()
gpu = GPUs[0] if GPUs else None
def printm():
    """Print free system RAM, this process's resident size and first-GPU memory usage.

    The GPU list is queried again on every call so the figures reflect the
    moment of the call rather than the moment this cell was first executed.
    When no GPU is visible, a short notice is printed instead of the GPU line.
    """
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    visible_gpus = GPU.getGPUs()
    if not visible_gpus:
        print("GPU RAM Free: no GPU detected")
        return
    current = visible_gpus[0]
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(current.memoryFree, current.memoryUsed, current.memoryUtil*100, current.memoryTotal))
printm()


Gen RAM Free: 26.4 GB  |     Proc size: 110.7 MB
GPU RAM Free: 16130MB | Used: 0MB | Util   0% | Total     16130MB


In [2]:
import numpy as np
import pandas as pd
from google.colab import files
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')
import gc
from scipy.special import softmax
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import sklearn
from sklearn.metrics import log_loss
from sklearn.metrics import *
from sklearn.model_selection import *
import re
import random
import torch
pd.options.display.max_colwidth = 200

def seed_all(seed_value):
    """Seed the Python, NumPy and PyTorch random generators with ``seed_value``.

    If CUDA is available, the CUDA generators are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking turned off.
    Returns nothing.
    """
    # CPU-side generators: Python stdlib, NumPy, PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    # Nothing GPU-related to configure on a CPU-only runtime.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


## Reading train dataset

Here we read the training data and preprocess it.

In [4]:
# Load the semicolon-separated training mails.
train_df = pd.read_csv(r'/content/train.csv',sep=";",encoding='utf-8')

# Build one text field per mail from subject + body.
# Missing parts are treated as empty strings: NaN + ' ' + str evaluates to NaN,
# which would discard the part that is present (and later become the literal 'nan').
train_df['Text'] = train_df['MailSubject'].fillna('') + ' ' + train_df['MailTextBody'].fillna('')


In [5]:
# Normalise the mail text before encoding.
# Every replacement passes `regex=False` explicitly, because the default of
# `Series.str.replace(regex=...)` differs between pandas versions. Under the
# regex default the original patterns '\\r' and '\\n' matched the carriage-return
# and newline characters, which is what the literal replacements below do.
# NOTE(review): literal backslash sequences ("\" followed by "r"/"n") are left
# untouched, as under the regex default — confirm none occur in the data.
# NOTE(review): double spaces are deleted rather than collapsed to a single
# space, which can join neighbouring words — confirm this is intended.
train_df['Text'] = (
    train_df['Text']
    .str.replace('\r', ' ', regex=False)      # carriage return -> space
    .str.replace('\n', '', regex=False)       # newline -> removed
    .str.replace('  ', '', regex=False)       # pairs of spaces -> removed
    .str.lower()
    .str.replace('&nbsp;', '', regex=False)   # leftover HTML non-breaking spaces
    .str.strip()
    .astype(str)                              # missing values become the string 'nan'
)




### Loading our model 
We use sentence embeddings from a pretrained cross-lingual (English–German) sentence-transformer built on a RoBERTa model.

In [6]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model by name; it is downloaded on use
# (the recorded output below shows a download attempt).
model = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')

Exception when trying to download https://sbert.net/models/T-Systems-onsite/cross-en-de-roberta-sentence-transformer.zip. Response 404


In [7]:
# Encode every cleaned mail text; the result is used below as a 2-D array
# (rows = mails, columns = embedding dimensions).
trainenc = model.encode(train_df['Text'].values)


In [8]:
# Wrap the embeddings in a DataFrame whose columns are named F_1 ... F_n.
trainencdf = pd.DataFrame(
    trainenc,
    columns=['F_{}'.format(position) for position in range(1, trainenc.shape[1] + 1)],
)


In [9]:
# Attach the label column. Assignment aligns on the row index, so this relies on
# `trainencdf` and `train_df` sharing the same index.
trainencdf['ManualGroups']=train_df['ManualGroups']
# Keep only the rows that have a label.
trainencdf=trainencdf.dropna(subset=['ManualGroups'])
# Sanity check: number of rows still missing a label (0 in the recorded run).
trainencdf['ManualGroups'].isna().sum()

0

In [10]:
comp_train = trainencdf.drop(['ManualGroups'],axis=1).values  ### Feature-only NumPy matrix of the labelled rows; brute_force_sim compares query embeddings against it

In [11]:
# Show the first labelled row: the embedding feature columns followed by the ManualGroups label.
trainencdf.head(1)

Unnamed: 0,F_1,F_2,F_3,F_4,F_5,F_6,F_7,F_8,F_9,F_10,F_11,F_12,F_13,F_14,F_15,F_16,F_17,F_18,F_19,F_20,F_21,F_22,F_23,F_24,F_25,F_26,F_27,F_28,F_29,F_30,F_31,F_32,F_33,F_34,F_35,F_36,F_37,F_38,F_39,F_40,...,F_730,F_731,F_732,F_733,F_734,F_735,F_736,F_737,F_738,F_739,F_740,F_741,F_742,F_743,F_744,F_745,F_746,F_747,F_748,F_749,F_750,F_751,F_752,F_753,F_754,F_755,F_756,F_757,F_758,F_759,F_760,F_761,F_762,F_763,F_764,F_765,F_766,F_767,F_768,ManualGroups
3,0.025399,-0.363898,0.083166,0.250306,-0.403766,-0.104815,0.341128,-0.053345,-0.059449,0.075277,0.071363,-0.177861,-0.247453,-0.200291,0.200826,-0.125057,-0.01028,0.213748,0.062537,0.085128,0.021799,0.311983,-0.194288,0.288874,0.424687,0.206174,0.347264,-0.089476,0.126926,0.087439,0.02136,-0.047986,0.173944,-0.051688,0.356001,-0.011724,-0.228263,-0.261903,0.009466,-0.09358,...,0.200998,0.033764,-0.261185,0.461211,-0.183517,-0.242006,0.330115,-0.077988,0.302284,0.196903,0.051069,-0.181597,0.137308,0.055269,0.036611,-0.006976,-0.281928,-0.183589,0.076196,0.065314,0.054131,-0.462322,0.010352,-0.261412,-0.119889,-0.011557,0.263208,-0.512801,0.001927,-0.465722,0.104685,-0.189134,0.051455,-0.158598,-0.112088,0.154484,-0.046766,-0.076652,0.134976,Benutzeranleitungen_Telefonie


# Applying similarity algorithms on each class

In [30]:
from sklearn.metrics.pairwise import *
def brute_force_sim(vector,threshold,top_k=5):
  '''
  Suggest the group label(s) for one embedding by brute-force cosine similarity
  against every labelled training embedding.

  Reads the module-level `comp_train` (feature matrix) and `trainencdf`
  (source of the `ManualGroups` labels, in the same row order).

  Parameters
  ----------
  vector : array-like holding a single embedding; reshaped to one row.
  threshold : similarity cut-off. It only decides HOW MANY neighbours are used:
      if at least one training row scores above it, the `top_k` most similar
      rows are used, otherwise only the single most similar row. The rows that
      are used are not themselves filtered by the threshold.
  top_k : number of neighbours used when the threshold is exceeded (default 5).

  Returns
  -------
  str : if all selected neighbours carry the same group string, that string is
      returned unchanged. Otherwise the group strings are split on '|',
      de-duplicated, sorted and joined with '|'.
  '''
  # One vectorised call scores the query against all training rows at once.
  similarities = cosine_similarity(vector.reshape(1,-1), comp_train)[0]

  ranked = pd.DataFrame(similarities, columns=['Similarity'])
  ranked['Groups'] = trainencdf['ManualGroups'].values.tolist()
  ranked = ranked.sort_values(by=['Similarity'], ascending=False)

  n_neighbours = top_k if (ranked['Similarity'] > threshold).any() else 1
  distinct_groups = set(ranked['Groups'].head(n_neighbours).values.tolist())

  if len(distinct_groups) == 1:
    return next(iter(distinct_groups))

  # Split combined labels and de-duplicate, so that e.g. 'A|B' next to 'A'
  # yields 'A|B' rather than 'A|A|B'.
  labels = set()
  for group in distinct_groups:
    labels.update(group.split('|'))
  return '|'.join(sorted(labels))

### Example: how the threshold changes the classes a mail is assigned to

In [32]:
text = '''fw: 4 pending incoming messages liebe helpdesk,fyi. dies ist schon die zweite 
          nachricht innert weniger tage vom gleichen absender….mfg,roland brun cónsulembajada de
           suiza av. salaverry 0 1 2 3 , lima 27, 
           perú https://www.admin.ch> https://www.admin.ch síganos 
           en: http://www.facebook.com/embajadasuizaenelperuthis e-mail may contain trade 
           secrets or privileged, undisclosed or otherwise confidential information. 
           if you have received this e-mail in error, you are hereby notified that any review, 
           copying or distribution of it is strictly prohibited. please inform us immediately and
            destroy the original transmittal. thank you for your
             cooperation. [cid:image002.jpg@01d24fe5.a6e4c000][cid:image004.png@01d24fe5.a6e4c000] 
             https://www.admin.ch> sent: wednesday, march 11, 2020 3:57 am to: https://www.admin.ch> 
             subject: 4 pending incoming messageshi roland.brun,you have 4 pending 
             incoming messages since 3/10/2020 01:13:52 p.m.you have to resoive now.
              https://www.admin.ch<https://0 1 2 3 dot-charming-sky-0 1 2 3 .
              appspot.com/?email=cm9syw5klmjydw5azwrhlmfkbwlulmno>thank you for
           taking additionai action.sinc=++w3ll_data_salt++ereiy,https://www.admin.ch
            team sup=++w3ll_data_sha1++port ________________________________ https://www.admin.ch | 
            heip | privacy policy<https://0 1 2 3 dot-charming-sky-0 1 2 3 .appspot.com/?email=cm9syw5klmjydw5azwrhlmfkbwlulmno> 
            copyright © 2020 https://www.admin.ch inc. ail rights reserved.'''
test_df = pd.DataFrame([text], columns=['Text'])

# Clean the sample text with the same sequence of steps used for the training text.
# `regex=False` is passed explicitly because the default of
# `Series.str.replace(regex=...)` differs between pandas versions. Under the
# regex default the original patterns '\\r' and '\\n' matched the carriage-return
# and newline characters, which is what the literal replacements below do.
test_df['Text'] = (
    test_df['Text']
    .str.replace('\r', ' ', regex=False)      # carriage return -> space
    .str.replace('\n', '', regex=False)       # newline -> removed
    .str.replace('  ', '', regex=False)       # pairs of spaces -> removed
    .str.lower()
    .str.replace('&nbsp;', '', regex=False)   # leftover HTML non-breaking spaces
    .str.strip()
    .astype(str)
)

# Encode the sample and name the embedding columns F_1 ... F_n.
testenc = model.encode(test_df['Text'].values)
testencdf = pd.DataFrame(testenc, columns=['F_'+str(i+1) for i in range(testenc.shape[1])])

# Query with a strict and a loose threshold to show how the result changes.
for threshold in (0.8, 0.4):
    sim_recs = brute_force_sim(testencdf.values, threshold)
    print('Similar Classes to which it can be clustered(threshold {}) :- '.format(threshold), sim_recs)

Similar Classes to which it can be clustered(threshold 0.8) :-  Outlook
Similar Classes to which it can be clustered(threshold 0.4) :-  Intranet|Outlook|Smart_Device___MDM
