# PreProcessing of Drug Dataset From Dreammarket

In [1]:
import difflib
import pickle
import pandas as pd
import re
import langdetect as ld
import stanza
from stanza.pipeline.processor import register_processor, Processor

import time
from faker import Faker
import numpy as np
from pathlib import Path
import phonenumbers
import random
import csv




# Load GRAMS data for the Domain Adaptive Preprocessing Corpus

Load Abraxas, Agora, Alpha, ME and Oxygen.  
Do NOT load NK and Silkkitie, since their CSV data is quite noisy.

In [2]:

# Marketplace exports that are combined into the corpus.
path_grams_dataset = [
    "grams/Abraxas.csv",
    "grams/Agora.csv",
    "grams/Alpha.csv",
    "grams/ME.csv",
    "grams/Oxygen.csv",
]

# The only columns the rest of the notebook works with.
DrugColsToKeep = ['hash', 'vendor', 'name', 'description']

# Read every export and stack the frames under one fresh 0..n-1 index.
gram_list = [pd.read_csv(csv_path) for csv_path in path_grams_dataset]
grams = pd.concat(gram_list, axis=0, ignore_index=True)

# Rename 'vendor_name' to 'vendor', then project onto the kept columns.
grams = grams.rename(columns={'vendor_name': 'vendor'})[DrugColsToKeep]
print("Columns: ", grams.columns, " shape: ", grams.shape)


Columns:  Index(['hash', 'vendor', 'name', 'description'], dtype='object')  shape:  (61420, 4)


In [3]:
# Drop rows whose description was already seen (the first occurrence is kept).
print("lines before duplicate removal:", len(grams))
grams = grams.drop_duplicates('description')
print("lines After duplicate removal:", len(grams))

# Rebuild a contiguous index now that rows are gone.
grams = grams.reset_index(drop=True)

lines before duplicate removal: 61420
lines After duplicate removal: 37078


# Remove other languages, empty and non-string descriptions

In [4]:
%%time
def checkLang(text):
    """Return the language code langdetect reports for ``text``.

    Parameters
    ----------
    text : object
        The value to inspect; only ``str`` values are passed to the detector.

    Returns
    -------
    str
        The result of ``ld.detect(text)`` for string input, ``'No String'``
        for any non-string value, and ``'Error in detect'`` when detection
        raises an exception.

    NOTE(review): langdetect documents its results as non-deterministic unless
    ``DetectorFactory.seed`` is set -- confirm whether reproducible language
    filtering is wanted here.
    """
    if not isinstance(text, str):
        return 'No String'
    try:
        return ld.detect(text)
    except Exception:
        # Best-effort on purpose: one undetectable text must not abort the run.
        # `Exception` (instead of a bare `except:`) lets KeyboardInterrupt and
        # SystemExit through, so this multi-minute cell can still be stopped.
        return 'Error in detect'

# Label every description with its detected language.
grams['lang'] = grams['description'].map(checkLang)  # TODO shall we work with other languages?

CPU times: user 6min 29s, sys: 30.2 s, total: 6min 59s
Wall time: 6min 59s


In [5]:
# Keep only the rows labelled 'en'; this also drops the 'No String' and
# 'Error in detect' rows produced by checkLang.
print('Shape Before Language Removal')
print(grams.shape)
grams = grams.loc[grams['lang'].eq('en')].copy()
print('Shape AFTER Language Removal')
print(grams.shape)

Shape Before Language Removal
(37078, 5)
Shape AFTER Language Removal
(34353, 5)


# Remove very short/long texts or texts with similar start

Computing pairwise edit distances is very expensive: roughly a day of runtime, and that is with a simple position-by-position comparison that does not account for inserted characters (e.g. "apple" vs. "1apple" yields a distance of 5).
Insertion-aware measures such as the Levenshtein distance would take even longer.
Therefore, a small heuristic is used instead: texts whose beginnings are equal are removed.

In [6]:
# Very short descriptions contain no sentences to find named entities in, and
# very long ones would first have to be cut into blocks -- drop both extremes.
print('Shape Before Short Descr. Removal')
print(grams.shape)
grams = grams.loc[lambda frame: frame['description'].map(len) > 30].copy()
grams = grams.loc[lambda frame: frame['description'].map(len) < 3000].copy()
print('Shape After Short Descr. Removal')
print(grams.shape)

Shape Before Short Descr. Removal
(34353, 5)
Shape After Short Descr. Removal
(32893, 5)


#### We take the first 100 chars of each description and remove everything non-alphabetical from them. If the result equals that of another line, the line is removed. This way we can be quite sure that we don't keep very similar drug ads in which only the amount of the drug (or something like this) changes.

In [7]:
# Helper key: the first 100 characters of each description, letters only.
grams['descr_start'] = grams['description'].str.slice(stop=100).map(
    lambda snippet: re.sub(r'[^A-Za-z]+', '', snippet)
)
print('Shape Before Start/End Duplicate Removal')
print(grams.shape)
# Rows sharing a helper key count as near-duplicates; the first one is kept.
grams = grams.drop_duplicates('descr_start')
print('Shape After Start/End Duplicate Removal')
print(grams.shape)

# The helper column is no longer needed.
grams = grams.drop(columns='descr_start')

Shape Before Start/End Duplicate Removal
(32893, 6)
Shape After Start/End Duplicate Removal
(24460, 6)


# Pseudonymize 
Replace telephone numbers, e-mail addresses and vendor names with randomly generated stand-ins

In [8]:
def rand_phone():
    """Return a random phone-number-shaped string.

    The result has the form ``+CC AAA BBBB DDD``: a two-digit group (20-99),
    a three-digit group, a four-digit group and another three-digit group,
    each drawn with ``random.randint``.
    """
    country = random.randint(20, 99)
    first_group = random.randint(100, 999)
    second_group = random.randint(1000, 9999)
    third_group = random.randint(100, 999)
    return "+{} {} {} {}".format(country, first_group, second_group, third_group)


# Shared Faker generator, created lazily on the first call to get_fake_name().
_FAKER = None


def get_fake_name():
    """Return a random, vendor-like pseudonym without spaces.

    A full name is drawn from Faker and its spaces are removed. Two further
    ``random.randint(1, 10)`` draws vary the spelling:

    * an even first draw lower-cases the name;
    * a second draw divisible by 3 lower-cases the name and appends a random
      two-digit number (10-99).

    The Faker generator is built once and reused. The original constructed a
    new ``Faker()`` on every call, and this function is called once per vendor
    spelling and once per e-mail replacement.

    NOTE(review): Faker generators are understood to share one random source
    by default, so reuse should not change which names are drawn -- confirm
    for the installed Faker version.
    """
    global _FAKER
    if _FAKER is None:
        _FAKER = Faker()

    fake_name = _FAKER.name().replace(" ", "")
    if random.randint(1, 10) % 2 == 0:
        fake_name = fake_name.lower()
    if random.randint(1, 10) % 3 == 0:
        fake_name = "%s%d" % (fake_name.lower(), random.randint(10, 99))
    return fake_name


# 
def pseudonymize(df, name_column):
    """Replace vendor names by generated pseudonyms, modifying ``df`` in place.

    Every distinct non-missing value of ``df[name_column]`` is used in four
    spellings (as is, ``capitalize()``, ``lower()``, ``upper()``) and each
    spelling is mapped to its own ``get_fake_name()`` result. Cells of
    ``name_column`` equal to a spelling are replaced, and every occurrence of
    a spelling inside the ``"description"`` column is replaced as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to pseudonymise. Empty strings anywhere in the frame are turned
        into NaN as a side effect.
    name_column : str
        Column holding the names to hide.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. It is returned unchanged when ``name_column``
        is not one of its columns.

    Fixes relative to the previous version:

    * ``name_column`` is honoured throughout; 'vendor' was hard-coded and the
      column check ran only after the column had already been accessed.
    * The whole-cell replacement uses the raw names. It used the
      ``re.escape``d ones, so names with special characters never matched.
    * Missing vendors are skipped. They used to become the string ``'nan'``,
      which was then replaced inside ordinary words such as "banana".

    NOTE(review): description matches are plain substring matches without
    word boundaries, so short vendor names can still hit inside longer words.
    """
    if name_column not in df.columns:
        return df

    # remove empty strings
    df.replace('', np.nan, inplace=True)

    # Real names only -- NaN must not turn into the literal text 'nan'.
    real_names = [str(name) for name in df[name_column].dropna().unique()]

    # Same ordering as before: all raw names, then all capitalised, lower and
    # upper spellings. dict.fromkeys drops repeats but keeps first positions.
    spellings = (
        real_names
        + [name.capitalize() for name in real_names]
        + [name.lower() for name in real_names]
        + [name.upper() for name in real_names]
    )
    literal_map = {spelling: get_fake_name() for spelling in dict.fromkeys(spellings)}
    if not literal_map:
        return df

    # Whole-cell replacement in the name column needs the literal names ...
    df.replace({name_column: literal_map}, inplace=True)

    # ... whereas the in-text replacement is regex based and needs them escaped.
    regex_map = {re.escape(spelling): fake for spelling, fake in literal_map.items()}
    df.replace({"description": regex_map}, inplace=True, regex=True)

    return df



In [9]:
# Collect every distinct phone-number span found in any description.
# NOTE(review): "INTERNATIONAL" does not look like a region code; presumably
# only numbers written with a leading '+' are recognised -- confirm.
all_phonenumbers = {
    match.raw_string
    for match in phonenumbers.PhoneNumberMatcher(" ".join(grams.description.unique()), "INTERNATIONAL")
}

# Replace the longest numbers first, with alphabetical tie-breaking. Iterating
# the raw set gives a hash-dependent order, in which a number that is a
# substring of another could be replaced first and leave the longer one
# partially intact.
all_phonenumbers = sorted(all_phonenumbers, key=lambda number: (-len(number), number))

# Replace all phone numbers. A fake number is only generated for rows that
# actually contain the number (previously rand_phone() ran for every row).
for number in all_phonenumbers:
    grams.description = grams.description.apply(
        lambda x: x.replace(number, rand_phone()) if number in x else x
    )

In [10]:
def replace_mails(text):
    """Replace every e-mail-like span in ``text`` by one random fake address.

    A span is any run of non-whitespace characters around an ``@``
    (``\\S+@\\S+``). All spans in one text get the same fake address, built
    from ``get_fake_name()`` plus a random provider and top-level domain.

    Parameters
    ----------
    text : str
        The text to scrub.

    Returns
    -------
    str
        ``text`` with each address replaced. Whitespace after an address is
        preserved; the previous pattern also consumed one trailing whitespace
        character, which glued the next word onto the fake address.
    """
    # Without an '@' the pattern cannot match, so skip generating a fake name.
    if '@' not in text:
        return text

    domain = ['.com', '.de', '.ru', '.org']
    vendor = ['gmail', 'air', 'wing', 'microsoft', 'hotmail', 'outlook']

    mail = get_fake_name() + '@' + random.choice(vendor) + random.choice(domain)
    # A callable replacement inserts the address verbatim (no backslash handling).
    return re.sub(r'\S+@\S+', lambda _match: mail, text)

# Swap every e-mail address in the descriptions for a generated one.
grams['description'] = grams['description'].map(replace_mails)

# Pseudonymisation is currently not needed, since we don't publish the DAPT data, right?

In [11]:
%%time
# Pseudonymise a copy so that `grams` itself stays untouched: every vendor
# name (e.g. "bcdirect") is replaced by a random name, both in the 'vendor'
# column and wherever it occurs inside a description.
grams_pseudo = pseudonymize(grams.copy(),"vendor")

CPU times: user 12min 44s, sys: 1min 2s, total: 13min 46s
Wall time: 13min 46s


# Stanza Processing 

### Create special char remover

In [12]:
def remove_unwanted_elements(text):
    """Strip links and unwanted special characters from ``text``.

    Applied in this order:

    1. http/https URLs, with any line breaks directly after them, are removed.
    2. Whitespace-delimited chunks containing ``.onion``, with trailing line
       breaks, are removed.
    3. The characters ``+ ! ~ @ # $ % ^ & * ( ) = { } [ ] : ; < . > ? ' "``
       are removed.
    4. Runs of ``-`` and runs of ``_`` are collapsed to a single character.

    Parameters
    ----------
    text : str
        Raw text; in this notebook a single token or word.

    Returns
    -------
    str
        The cleaned text, possibly empty.

    The dot in the ``.onion`` pattern is now escaped. The previous pattern
    used a bare ``.``, which matches any character, so ordinary text such as
    " onion" was deleted as well.

    NOTE(review): step 3 also removes ``.``, so decimals such as "0.5" become
    "05" -- confirm this is acceptable for dosage amounts.
    """
    final_text = text
    # 1. web links
    final_text = re.sub(r'https?:\/\/\S*[\r\n]*', '', final_text)
    # 2. hidden-service addresses (literal ".onion" only)
    final_text = re.sub(r'\S*\.onion\S*[\r\n]*', '', final_text)
    # 3. unwanted punctuation and symbols
    final_text = re.sub(r'[\+!~@#$%^&*()={}\[\]:;<.>?\'"]', '', final_text)
    # 4. collapse repeated dashes / underscores
    final_text = re.sub(r'[-]+', '-', final_text)
    final_text = re.sub(r'[_]+', '_', final_text)
    return final_text



@register_processor("customcleaner")
class cleanerProcessor(Processor):
    """Pipeline processor that strips links and unwanted special characters
    from every token and word, then drops whatever ended up empty."""

    # Declared to depend on tokenization (open question from the author:
    # should the cleaning happen before tokenization instead?).
    _requires = {'tokenize'}
    _provides = {'specialCharRemover'}

    def __init__(self, config, pipeline, use_gpu):
        # Nothing to configure; no model or state is kept.
        pass

    def _set_up_model(self, *args):
        # No model to load.
        pass

    def process(self, doc):
        """Clean ``doc`` in place and return it.

        Token and word texts are passed through ``remove_unwanted_elements``;
        tokens, words and sentences that are empty afterwards are removed.
        """
        # Self-assignment kept from the original; the document text is not
        # cleaned (open question to Gabor: shall the original text stay the same?).
        doc.text = doc.text

        for sentence in doc.sentences:
            for token in sentence.tokens:
                token.text = remove_unwanted_elements(token.text)

            for word in sentence.words:
                word.text = remove_unwanted_elements(word.text)

            # Discard tokens/words whose text was cleaned away completely.
            sentence.tokens = [token for token in sentence.tokens if token.text]
            sentence.words = [word for word in sentence.words if word.text]

        # Discard sentences that have no tokens left.
        doc.sentences = [sentence for sentence in doc.sentences if sentence.tokens]

        return doc

### Stanza Segmentation

TODO: Talk with Gabor about whether we should really split the text into sentences with Stanza. IMHO this will result in a lot of short sentences with very little context. Does this matter? For annotating something like Drug_mention it would.

In [13]:
# English pipeline: tokenization followed by the processor registered above
# under the name "customcleaner".
tokenizer = stanza.Pipeline(lang='en', processors='tokenize,customcleaner')

2021-02-26 09:22:51 INFO: Loading these models for language: en (English):
| Processor     | Package |
---------------------------
| tokenize      | ewt     |
| customcleaner | default |

2021-02-26 09:22:51 INFO: Use device: gpu
2021-02-26 09:22:51 INFO: Loading: tokenize
2021-02-26 09:22:57 INFO: Loading: customcleaner
2021-02-26 09:22:57 INFO: Done loading processors!


In [14]:
%%time
# Run every pseudonymised description through the pipeline and keep the
# resulting document object per row.
grams_pseudo['doc'] = grams_pseudo['description'].map(
    lambda description: tokenizer(description)
)

CPU times: user 15min 54s, sys: 4.61 s, total: 15min 58s
Wall time: 15min 58s


### Restore Stanza to text

In [15]:
def restore_tokenized_text(doc):
    """Turn a processed document back into one plain-text string.

    The words of each sentence are joined by single spaces and every sentence
    is closed with ``". "``. Surrounding whitespace of the accumulated text is
    stripped after each sentence and once more at the end.

    Parameters
    ----------
    doc : object
        Anything exposing ``sentences``, each with ``words`` whose items have
        a ``text`` attribute.

    Returns
    -------
    str
        The rebuilt text; ``""`` for a document without sentences.
    """
    rebuilt = ""
    for sentence in doc.sentences:
        rebuilt += "".join(word.text + " " for word in sentence.words)
        rebuilt = rebuilt.strip() + ". "
    return rebuilt.strip()

# Rebuild a plain-text description from every processed document.
grams_pseudo["clean_description"] = grams_pseudo["doc"].map(restore_tokenized_text)

In [16]:
# Randomise the row order so that no meaning is left in it; the fixed seed
# yields the same permutation on every run. The index is rebuilt afterwards.
df_shuffled = (
    grams_pseudo
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Checkpoint the result so later cells can restart from here.
df_shuffled.to_pickle('intermediate_store/preprocessed_DAPT.pickle')
df_shuffled.shape

(24460, 7)

In [19]:
# Reload the checkpoint written by the previous cell so that the export below
# can run without repeating the preprocessing. Only load pickle files you
# created yourself: unpickling untrusted data can execute arbitrary code.
df_shuffled = pd.read_pickle('intermediate_store/preprocessed_DAPT.pickle')


# Write clean descriptions to file 
TODO --> Model architecture could be improved to [CLS]name[SEP]Clean_description, since we probably have to cut down bigger parts for the language model training.

In [20]:
# Listing title and cleaned description joined by a literal '[SEP]' marker.
# The column is kept in the frame; it is not part of the text file below.
df_shuffled['description_combi'] = df_shuffled['name'] + '[SEP]' + df_shuffled['clean_description']

#df_shuffled['dream_id', 'name', 'clean_description', 'description_combi']].to_csv("final_data/DAPT1.0.tsv", sep = '\t')

# Write one cleaned description per line, verbatim.
# The previous `to_csv(..., sep=' ')` call used the csv writer's minimal
# quoting, which wraps every field containing the separator (a space) in
# double quotes, so practically every corpus line came out as "...".
output_file = Path("final_data/DAPT_Grams_1.0.txt")
# Make sure the target directory exists; a fresh checkout may not have it.
output_file.parent.mkdir(parents=True, exist_ok=True)
with output_file.open("w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(str(line) + "\n" for line in df_shuffled["clean_description"])
