# PreProcessing of Drug Dataset From Dreammarket

In [1]:
import difflib
import pickle
import pandas as pd
import re
import langdetect as ld
import stanza
from stanza.pipeline.processor import register_processor, Processor

import time
from faker import Faker
import numpy as np
from pathlib import Path
import phonenumbers
import random
import csv




# Load the dataset and remove

In [2]:
path_dream_json = "Drug_export_50000_final.json"
dnm = pd.read_json(path_dream_json)

dnm.rename(columns = {'idproduct':'dream_id', 'seller_name':'vendor', 'product_name':'name', 'category':'subcategory' }, inplace=True)

DrugColsToKeep = [ 'dream_id', 'vendor', 'name', 'description',]
dnm = dnm[DrugColsToKeep]
print("Columns: ", dnm.columns, " shape: ", dnm.shape)


Columns:  Index(['dream_id', 'vendor', 'name', 'description'], dtype='object')  shape:  (45446, 4)


# TODO SHALL WE Load the grams data as well?? Currently not loaded since it contains other things than drugs as well

Load Abraxas, Agora, Alpha, Me and Oxygen  
DO NOT LOAD NK and Silkkitie since their data csv is quite noisy

In [3]:
'''
path_grams_dataset = ["grams/Abraxas.csv", "grams/Agora.csv", "grams/Alpha.csv", "grams/ME.csv", "grams/Oxygen.csv"]


gram_list = []
for filename in path_grams_dataset:
    df = pd.read_csv(filename)
    gram_list.append(df)

grams = pd.concat(gram_list, axis=0, ignore_index=True)
grams.rename(columns = {'vendor_name':'vendor'}, inplace=True)
grams = grams[DrugColsToKeep]
print("Columns: ", grams.columns, " shape: ", grams.shape)
#dnm = pd.concat([dnm, grams], axis=0, ignore_index=True) # TODO Shall I add Grams Data as well? This would add noise!

print("Columns: ", grams.columns, " shape: ", grams.shape)
'''

'\npath_grams_dataset = ["grams/Abraxas.csv", "grams/Agora.csv", "grams/Alpha.csv", "grams/ME.csv", "grams/Oxygen.csv"]\n\n\ngram_list = []\nfor filename in path_grams_dataset:\n    df = pd.read_csv(filename)\n    gram_list.append(df)\n\ngrams = pd.concat(gram_list, axis=0, ignore_index=True)\ngrams.rename(columns = {\'vendor_name\':\'vendor\'}, inplace=True)\ngrams = grams[DrugColsToKeep]\nprint("Columns: ", grams.columns, " shape: ", grams.shape)\n#dnm = pd.concat([dnm, grams], axis=0, ignore_index=True) # TODO Shall I add Grams Data as well? This would add noise!\n\nprint("Columns: ", grams.columns, " shape: ", grams.shape)\n'

In [4]:
#Duplicate removal:
print("lines before duplicate removal:", dnm.shape[0])
dnm.drop_duplicates('description', inplace=True)
print("lines After duplicate removal:", dnm.shape[0])

#Create unique index again
dnm.reset_index(inplace=True, drop=True)

lines before duplicate removal: 45446
lines After duplicate removal: 20434


# Remove other languages, empty and non-string descriptions

In [5]:
%%time
def checkLang(text):
    try:
        if isinstance(text, str):
            return ld.detect(text)
        else:
            return 'No String'
    except:  
        return 'Error in detect'

dnm['lang'] = dnm.apply(lambda row: checkLang(row['description']), axis=1) # TODO shall we work with other languages?

CPU times: user 4min 21s, sys: 16.4 s, total: 4min 38s
Wall time: 4min 38s


In [6]:
print('Shape Before Language Removal')
print(str(dnm.shape))
dnm_en = dnm[dnm['lang'] == 'en'].copy()
print('Shape AFTER Language Removal')
print(str(dnm_en.shape))

Shape Before Language Removal
(20434, 5)
Shape AFTER Language Removal
(18332, 5)


# Remove very short/long texts or texts with similiar start

Calculating the edit distance is super cumbersume --> approx. a day runtime with only edit distance (and edit distance doesn't) Take into account that a char can be inserted.. ("apple" and "1apple" has an edit distance of 5)
levenstein or hamming etc. (which compensate for inserted chars) would take even longer.. 
Therefore, this small heuristc was found to remove texts where the beginning is equal

In [7]:
#Very small descriptions are not interesting for us since we try to find named entities in sentences..
print('Shape Before Short Descr. Removal')
print(str(dnm_en.shape))
dnm_en = dnm_en[dnm_en['description'].map(len) > 30].copy()
dnm_en = dnm_en[dnm_en['description'].map(len) < 3000].copy()#Suuper long texts usually contain noisy crap
print('Shape After Short Descr. Removal')
print(str(dnm_en.shape))

Shape Before Short Descr. Removal
(18332, 5)
Shape After Short Descr. Removal
(16844, 5)


#### We take the first 200 chars and remove everything non-alphabetical from them. if they are equal to some other line we remove it. Therefore we can be quite sure that we don't have very similiar drug ads where only the amount of the drug or something like this changes.

In [8]:
dnm_en['descr_start'] = dnm_en['description'].str[:100].apply(lambda text: re.sub(r'[^A-Za-z]+', '', text))
print('Shape Before Start/End Duplicate Removal')
print(str(dnm_en.shape))
dnm_en.drop_duplicates('descr_start', inplace=True)
print('Shape After Start/End Duplicate Removal')
print(str(dnm_en.shape))

#Remove helper Columns
del dnm_en['descr_start'] 

Shape Before Start/End Duplicate Removal
(16844, 6)
Shape After Start/End Duplicate Removal
(11672, 6)


# Pseudonymize 
Remove telephone numbers and vendor names

In [9]:
# returns a random phone number
def rand_phone():
    return "+%d %d %d %d" % (random.randint(20,99), random.randint(100,999), random.randint(1000,9999), random.randint(100,999))


# create random name
def get_fake_name():
    # can create fake names
    fake = Faker()
    fake_name = fake.name().replace(" ", "")
    if random.randint(1, 10) % 2 == 0:
        fake_name = fake_name.lower()
    if random.randint(1, 10) % 3 == 0:
        fake_name = "%s%d" % (fake_name.lower(), random.randint(10, 99))
    return fake_name


# 
def pseudonymize(df, name_column): 
    # create a list of all vendor names with different Capitalisation
    names_list = df['vendor'].unique()
    names_list = [re.escape(str(x)) for x in names_list] # OLD [str(x) for x in names_list] 
    capitalize = [x.capitalize() for x in names_list]
    lower = [x.lower() for x in names_list]
    upper = [x.upper() for x in names_list]
    names_list += capitalize + lower + upper
    
    
    if 'vendor' in df.columns:
        # remove empty strings
        df.replace('', np.nan, inplace=True)
        
        # replace all vendor names by dummie
        name_replacements = {name: get_fake_name() for name in names_list  if name is not np.nan}
        df.replace({name_column: name_replacements}, inplace=True)
        df.replace({"description": name_replacements}, inplace=True, regex=True)
       
    return df



In [10]:
# find all phonenumbers 
all_phonenumbers = [match.raw_string for match in phonenumbers.PhoneNumberMatcher(" ".join(dnm_en.description.unique()), "INTERNATIONAL")]
all_phonenumbers = (list(set(all_phonenumbers)))
# Replace all phonenumbers
for number in all_phonenumbers:
    dnm_en.description = dnm_en.description.apply(lambda x:x.replace(number,rand_phone()))

In [11]:
def replace_mails(text):
    domain = ['.com','.de','.ru','.org']
    vendor = ['gmail', 'air','wing','microsoft','hotmail','outlook']

    mail = get_fake_name() + '@' + vendor[random.randint(0,len(vendor)-1)] + domain[random.randint(0,len(domain)-1)] 
    return re.sub(r'\S+@\S+\s?', mail, text)

dnm_en.description = dnm_en.description.apply(lambda text: replace_mails(text))

In [12]:
%%time
# vendor name  bcdirect is replaced by random name 

dnm_pseudo = pseudonymize(dnm_en.copy(),"vendor")

CPU times: user 6min 12s, sys: 35.3 s, total: 6min 47s
Wall time: 6min 47s


# Stanza Processing 

### Create special char remover

In [13]:
def remove_unwanted_elements(text):
    final_text=text
    final_text = re.sub(r'https?:\/\/\S*[\r\n]*', '', final_text)
    final_text = re.sub(r'\S*.onion\S*[\r\n]*', '', final_text)
    final_text = re.sub(r'[\+!~@#$%^&*()={}\[\]:;<.>?\'"]', '', final_text)
    final_text = re.sub(r'[-]+', '-', final_text)
    final_text = re.sub(r'[_]+', '_', final_text)
    return final_text



@register_processor("customcleaner")
class cleanerProcessor(Processor):
    ''' Processor removes all special chars we do not appreciate and links '''
    _requires = set(['tokenize']) # Shouldn#t we do that before
    _provides = set(['specialCharRemover'])

    def __init__(self, config, pipeline, use_gpu):
        pass

    def _set_up_model(self, *args):
        pass

    def process(self, doc):
        
        doc.text = doc.text # Question to Gabor SHALL the original text stay the same?
        for sent in doc.sentences:
            for tok in sent.tokens:
                tok.text = remove_unwanted_elements(tok.text)
                
            for word in sent.words:
                word.text = remove_unwanted_elements(word.text)

                
            #remove empty tokens/words    
            sent.tokens = [tok for tok in sent.tokens if len(tok.text) > 0]
            sent.words = [word for word in sent.words if len(word.text) > 0]
        #remove empty sentences
        doc.sentences = [sent for sent in doc.sentences if len(sent.tokens) > 0]

        return doc

### Stanza Segmentation

TODO Talk with Gabor if we should really separate sentences according to Stanza.. Imho this will result in a lot of small sentences with very little context. Does this matter? In case of  Annotating something like Drug_mention it would..

In [14]:
tokenizer = stanza.Pipeline('en', processors='tokenize,customcleaner')#,specialchars

2021-02-03 10:59:19 INFO: Loading these models for language: en (English):
| Processor     | Package |
---------------------------
| tokenize      | ewt     |
| customcleaner | default |

2021-02-03 10:59:19 INFO: Use device: gpu
2021-02-03 10:59:19 INFO: Loading: tokenize
2021-02-03 10:59:25 INFO: Loading: customcleaner
2021-02-03 10:59:25 INFO: Done loading processors!


In [15]:
%%time
dnm_pseudo['doc'] =  dnm_pseudo.description.apply(lambda text:  tokenizer(text))

CPU times: user 8min 15s, sys: 2.62 s, total: 8min 18s
Wall time: 8min 17s


### Restore Stanza to text

In [16]:
def restore_tokenized_text(doc):
    cleaned_text= ""
    for sentence in doc.sentences:
        for token_dict in sentence.words:
            cleaned_text += token_dict.text + " "
        cleaned_text = cleaned_text.strip() + ". "
    return cleaned_text.strip()

dnm_pseudo["clean_description"] = dnm_pseudo["doc"].apply(lambda doc:  restore_tokenized_text(doc))

In [17]:
# Shuffle the dataset to prevent any order with meaning
df_shuffled=dnm_pseudo.sample(frac=1, random_state=42).reset_index(drop=True)


df_shuffled.to_pickle('intermediate_store/preprocessed_ads_V1.1.pickle')
dnm_pseudo.shape

(11672, 7)

In [18]:
df_shuffled = pd.read_pickle('intermediate_store/preprocessed_ads_V1.1.pickle')


# Separate item listings according to length into batches

In [19]:
def categorize_textLength(text):
    if len(text)<100:
        return 1
    elif len(text)<250:
        return 2
    elif len(text)<500:
        return 3
    elif len(text)<750:
        return 4
    elif len(text)<1000:
        return 5
    elif len(text)<1500:
        return 6
    elif len(text)<2000:
        return 7
    elif len(text)<2500:
        return 8
    elif len(text)<3000:
        return 9
    
df_shuffled["textLengthCat"] = df_shuffled["clean_description"].apply(lambda text: categorize_textLength(text))
    
    

In [20]:
df_shuffled["textLengthCat"].value_counts()

2.0    2784
3.0    2475
1.0    1972
4.0    1347
6.0    1041
5.0     948
7.0     585
8.0     347
9.0     172
Name: textLengthCat, dtype: int64

In [21]:
df_shuffled[df_shuffled['textLengthCat'] <= 5][['dream_id', 'name', 'clean_description']].to_csv("final_data/shortTextDrugsV1.0.tsv", sep = '\t')
df_shuffled[df_shuffled['textLengthCat'] > 5][['dream_id', 'name', 'clean_description']].to_csv("final_data/longTextDrugsV1.0.tsv", sep = '\t') 