# PreProcessing of Drug Dataset From Dreammarket

In [2]:
import difflib
import pickle
import pandas as pd
import re
import langdetect as ld
import stanza
from stanza.pipeline.processor import register_processor, Processor

import time
from faker import Faker
import numpy as np
from pathlib import Path
import phonenumbers
import random
import csv

import json


# Load the dataset and remove unused columns

In [2]:
# Location of the raw Dream Market export
path_dream_json = "Drug_export_50000_final.json"

# Columns (after renaming) that the rest of the notebook works with
DrugColsToKeep = [ 'dream_id', 'vendor', 'name', 'description',]

# Read the export, harmonise the column names and keep only the needed columns
dnm = (
    pd.read_json(path_dream_json)
    .rename(columns={'idproduct':'dream_id', 'seller_name':'vendor', 'product_name':'name', 'category':'subcategory'})
    [DrugColsToKeep]
)
print("Columns: ", dnm.columns, " shape: ", dnm.shape)


Columns:  Index(['dream_id', 'vendor', 'name', 'description'], dtype='object')  shape:  (45446, 4)


In [3]:
# Duplicate removal: keep only the first row for every distinct description
print("lines before duplicate removal:", dnm.shape[0])
dnm = dnm.drop_duplicates(subset='description')
print("lines After duplicate removal:", dnm.shape[0])

# Rebuild a gap-free 0..n-1 index after rows were dropped
dnm = dnm.reset_index(drop=True)

lines before duplicate removal: 45446
lines After duplicate removal: 20434


# Remove other languages, empty and non-string descriptions

In [4]:
%%time
def checkLang(text):
    """Detect the language of ``text``.

    Returns the language code reported by ``ld.detect`` for string input,
    ``'No String'`` when ``text`` is not a string, and ``'Error in detect'``
    when detection raises an error.
    """
    if not isinstance(text, str):
        return 'No String'
    try:
        return ld.detect(text)
    except Exception:
        # Detection is best effort, so ordinary errors are mapped to a marker value.
        # Catching Exception instead of using a bare except keeps KeyboardInterrupt
        # and SystemExit working, so this long-running cell can still be aborted.
        return 'Error in detect'

# One language label per description  # TODO shall we work with other languages?
dnm['lang'] = dnm['description'].map(checkLang)

CPU times: user 4min 22s, sys: 16.7 s, total: 4min 38s
Wall time: 4min 38s


In [5]:
# Keep only the listings whose description was detected as English
print('Shape Before Language Removal')
print(dnm.shape)
dnm_en = dnm.loc[dnm['lang'] == 'en'].copy()
print('Shape AFTER Language Removal')
print(dnm_en.shape)

Shape Before Language Removal
(20434, 5)
Shape AFTER Language Removal
(18326, 5)


# Remove very short/long texts or texts with similar start

Calculating a string distance between all pairs of descriptions is very expensive: approx. a day of runtime with a plain position-wise comparison alone, and such a comparison does not take into account that a character can be inserted ("apple" and "1apple" differ in 5 positions).
Metrics that compensate for inserted characters, such as the Levenshtein distance, would take even longer.
Therefore, this small heuristic was chosen: remove texts whose beginning is equal.

In [6]:
#Very small descriptions are not interesting for us since we try to find named entities in sentences..
# Descriptions of 30 characters or fewer carry too little sentence context for entity annotation,
# and descriptions of 3000 characters or more are usually noisy, so only lengths 31..2999 are kept.
print('Shape Before Short Descr. Removal')
print(dnm_en.shape)
dnm_en = dnm_en[dnm_en['description'].map(len).between(31, 2999)].copy()
print('Shape After Short Descr. Removal')
print(dnm_en.shape)

Shape Before Short Descr. Removal
(18326, 5)
Shape After Short Descr. Removal
(16844, 5)


#### We take the first 100 chars and remove everything non-alphabetical from them. If the result is equal to that of another row, we remove the row. This way we can be quite sure that we don't keep very similar drug ads where only the amount of the drug or something like this changes.

In [7]:
# Helper column: letters only, taken from the first 100 characters of each description
dnm_en['descr_start'] = dnm_en['description'].str.slice(stop=100).map(
    lambda prefix: re.sub(r'[^A-Za-z]+', '', prefix)
)
print('Shape Before Start/End Duplicate Removal')
print(dnm_en.shape)
# Rows sharing the same letters-only prefix count as duplicates; the first one is kept
dnm_en = dnm_en.drop_duplicates(subset='descr_start')
print('Shape After Start/End Duplicate Removal')
print(dnm_en.shape)

# The helper column is no longer needed
dnm_en = dnm_en.drop(columns='descr_start')

Shape Before Start/End Duplicate Removal
(16844, 6)
Shape After Start/End Duplicate Removal
(11674, 6)


# Pseudonymize 
Replace telephone numbers, e-mail addresses and vendor names with randomly generated substitutes

In [8]:
def rand_phone():
    """Return a random phone-number-like string of the form ``+CC AAA BBBB DDD``."""
    # The four parts are drawn in the order in which they appear in the result
    country = random.randint(20, 99)
    first_group = random.randint(100, 999)
    second_group = random.randint(1000, 9999)
    third_group = random.randint(100, 999)
    return f"+{country} {first_group} {second_group} {third_group}"


def get_fake_name():
    """Create a random pseudonym that looks like a vendor user name.

    The pseudonym is a Faker-generated person name with the spaces removed.
    In roughly half of the calls it is lower-cased, and in roughly three out
    of ten calls it is lower-cased and gets a two-digit number appended.

    Returns:
        str: the generated pseudonym.
    """
    # This function is called once per vendor-name variant and once per text containing
    # an e-mail address, so the Faker instance is created lazily once and then reused
    # instead of being rebuilt on every call.
    fake = getattr(get_fake_name, "_faker", None)
    if fake is None:
        fake = Faker()
        get_fake_name._faker = fake

    fake_name = fake.name().replace(" ", "")
    if random.randint(1, 10) % 2 == 0:
        fake_name = fake_name.lower()
    if random.randint(1, 10) % 3 == 0:
        fake_name = "%s%d" % (fake_name.lower(), random.randint(10, 99))
    return fake_name


# 
def pseudonymize(df, name_column):
    """Replace vendor names in ``df`` with randomly generated fake names.

    Every distinct value of ``df[name_column]`` is collected in four spellings
    (as written, capitalised, lower case, upper case). Each spelling is mapped
    to its own fake name, and the mapping is applied to ``name_column``
    (exact match) and to the ``description`` column (substring match).

    Args:
        df: DataFrame to pseudonymize. It is modified in place; empty strings
            anywhere in the frame are converted to NaN as a side effect.
        name_column: name of the column holding the vendor names.

    Returns:
        The same DataFrame object. It is returned unchanged when
        ``name_column`` is not a column of ``df``.
    """
    # Check for the column before touching it (previously a missing column raised a KeyError)
    if name_column not in df.columns:
        return df

    # remove empty strings
    df.replace('', np.nan, inplace=True)

    # Missing vendors are skipped: turning NaN into the text 'nan' (or keeping '') would
    # create patterns that match inside ordinary words of the descriptions.
    vendor_names = [str(x) for x in df[name_column].dropna().unique()]

    # Same variant order as before: original spellings first, then capitalised, lower, upper
    name_variants = (
        vendor_names
        + [x.capitalize() for x in vendor_names]
        + [x.lower() for x in vendor_names]
        + [x.upper() for x in vendor_names]
    )
    name_replacements = {name: get_fake_name() for name in name_variants}

    # The vendor column is matched literally, so the keys must be the raw names
    df.replace({name_column: name_replacements}, inplace=True)

    # The descriptions are searched with regular expressions, so the names are escaped here.
    # NOTE(review): names are matched anywhere, also inside longer words - consider word boundaries.
    pattern_replacements = {re.escape(name): fake for name, fake in name_replacements.items()}
    df.replace({"description": pattern_replacements}, inplace=True, regex=True)

    return df



In [9]:
# Search all distinct descriptions in one pass and collect every matched number exactly once
joined_descriptions = " ".join(dnm_en.description.unique())
all_phonenumbers = list({
    match.raw_string
    for match in phonenumbers.PhoneNumberMatcher(joined_descriptions, "INTERNATIONAL")
})
# Substitute each found number in every description; a new random number is drawn per row
for number in all_phonenumbers:
    dnm_en['description'] = dnm_en['description'].map(lambda x: x.replace(number, rand_phone()))

In [10]:
def replace_mails(text):
    """Replace every e-mail-like token (``something@something``) in ``text`` by a fake address.

    All addresses found in one text are replaced by the same fake address.
    Whitespace following an address is kept, so the next word is not glued
    onto the inserted address.

    Args:
        text: the description to clean.

    Returns:
        str: ``text`` with the addresses substituted; texts without an ``@``
        are returned unchanged.
    """
    # Without an '@' the pattern below cannot match, so skip generating a fake name
    if '@' not in text:
        return text

    domain = ['.com','.de','.ru','.org']
    vendor = ['gmail', 'air','wing','microsoft','hotmail','outlook']

    mail = get_fake_name() + '@' + random.choice(vendor) + random.choice(domain)
    # A callable replacement inserts the address verbatim (no backslash/group processing)
    return re.sub(r'\S+@\S+', lambda _match: mail, text)

# Swap the e-mail addresses in every description for fake ones
dnm_en['description'] = dnm_en['description'].apply(replace_mails)

In [11]:
%%time
# Each vendor name (e.g. "bcdirect") is replaced by a random name.
# A copy is passed in, so dnm_en itself keeps the original names.
dnm_pseudo = pseudonymize(df=dnm_en.copy(), name_column="vendor")

CPU times: user 6min 14s, sys: 35.8 s, total: 6min 50s
Wall time: 6min 50s


# Stanza Processing 

### Create special char remover

In [13]:
def remove_unwanted_elements(text):
    """Strip links and unwanted special characters from ``text``.

    Steps, in order:
      1. remove http(s) links,
      2. remove tokens containing ``.onion``,
      3. remove the special characters ``+!~@#$%^&*()={}[]:;<.>?'"``,
      4. write commas as ``&#44`` (AMT restriction),
      5. collapse runs of ``-`` and of ``_`` into a single character.

    Args:
        text: the string to clean.

    Returns:
        str: the cleaned string (may be empty).
    """
    final_text = text
    # Links, together with line breaks that directly follow them
    final_text = re.sub(r'https?:\/\/\S*[\r\n]*', '', final_text)
    # Onion addresses. The dot is escaped: an unescaped dot matches any character,
    # including a space, which deleted ordinary text such as "red onions".
    final_text = re.sub(r'\S*\.onion\S*[\r\n]*', '', final_text)
    final_text = re.sub(r'[\+!~@#$%^&*()={}\[\]:;<.>?\'"]', '', final_text)

    # Due to AMT restrictions, commas are replaced by their numeric character reference.
    # This has to happen after the removal above, which would otherwise strip '&' and '#'.
    final_text = re.sub(',', '&#44', final_text)
    # NOTE(review): single and double quotes are already removed by the character class
    # above, so the former substitutions to &#39 / &#34 could never match and were dropped.
    # If quotes should be encoded rather than removed, take them out of the character class.

    final_text = re.sub(r'[-]+', '-', final_text)
    final_text = re.sub(r'[_]+', '_', final_text)
    return final_text



@register_processor("customcleaner")
class cleanerProcessor(Processor):
    ''' Stanza processor, registered as "customcleaner", that cleans a tokenized document.

    Every token and word text is passed through remove_unwanted_elements (links and
    unwanted special characters are stripped). Tokens, words and sentences that are
    empty afterwards are dropped from the document.
    '''
    # Declares 'tokenize' as prerequisite, so the cleaning works on already tokenized text.
    # Open question from the author: should the cleaning happen before tokenization instead?
    _requires = set(['tokenize']) # Shouldn't we do that before
    _provides = set(['specialCharRemover'])

    def __init__(self, config, pipeline, use_gpu):
        ''' Accept the constructor arguments without using them; no state is set up here. '''
        pass

    def _set_up_model(self, *args):
        ''' Nothing to set up: the cleaner works with regular expressions only. '''
        pass

    def process(self, doc):
        ''' Clean all token/word texts of ``doc`` in place and return ``doc``.

        Empty tokens and words are removed from each sentence, and sentences
        without any remaining token are removed from the document.
        '''
        
        # Self-assignment: the document text is left as it is.
        # Open question to Gabor: shall the original text stay the same?
        doc.text = doc.text # Question to Gabor SHALL the original text stay the same?
        for sent in doc.sentences:
            for tok in sent.tokens:
                tok.text = remove_unwanted_elements(tok.text)
                
            for word in sent.words:
                word.text = remove_unwanted_elements(word.text)

                
            # remove tokens/words whose text became empty through the cleaning
            sent.tokens = [tok for tok in sent.tokens if len(tok.text) > 0]
            sent.words = [word for word in sent.words if len(word.text) > 0]
        # remove sentences that have no tokens left
        doc.sentences = [sent for sent in doc.sentences if len(sent.tokens) > 0]

        return doc

### Stanza Segmentation

TODO Talk with Gabor if we should really separate sentences according to Stanza.. Imho this will result in a lot of small sentences with very little context. Does this matter? In case of  Annotating something like Drug_mention it would..

In [14]:
# English pipeline: Stanza tokenizer followed by the custom cleaner registered as "customcleaner"
tokenizer = stanza.Pipeline(lang='en', processors='tokenize,customcleaner')

2021-03-04 10:27:10 INFO: Loading these models for language: en (English):
| Processor     | Package |
---------------------------
| tokenize      | ewt     |
| customcleaner | default |

2021-03-04 10:27:10 INFO: Use device: gpu
2021-03-04 10:27:10 INFO: Loading: tokenize
2021-03-04 10:27:16 INFO: Loading: customcleaner
2021-03-04 10:27:16 INFO: Done loading processors!


In [15]:
%%time
# Run every pseudonymized description through the pipeline and store the resulting document
dnm_pseudo['doc'] = dnm_pseudo['description'].map(lambda description_text: tokenizer(description_text))

CPU times: user 7min 43s, sys: 1.91 s, total: 7min 45s
Wall time: 7min 45s


### Restore Stanza to text

In [16]:
def restore_tokenized_text(doc):
    """Rebuild a plain text from a tokenized document.

    The words of each sentence are joined with single spaces and every
    sentence is closed with a full stop followed by a space. Surrounding
    whitespace of the result is stripped.

    Args:
        doc: object with a ``sentences`` iterable whose items have a ``words``
            iterable of objects carrying a ``text`` attribute.

    Returns:
        str: the reassembled text.
    """
    restored = ""
    for sent in doc.sentences:
        restored += "".join(f"{word.text} " for word in sent.words)
        # Strip before closing the sentence so no blank precedes the full stop
        restored = f"{restored.strip()}. "
    return restored.strip()

# Turn every processed document back into a single cleaned string
dnm_pseudo["clean_description"] = dnm_pseudo["doc"].apply(restore_tokenized_text)

In [17]:
# Shuffle the rows with a fixed seed so that the row order carries no meaning,
# then renumber the index from 0
df_shuffled = dnm_pseudo.sample(frac=1, random_state=42)
df_shuffled = df_shuffled.reset_index(drop=True)


#df_shuffled.to_pickle('intermediate_store/preprocessed_AMT_ads_V1.2.pickle')
dnm_pseudo.shape

(11674, 7)

In [3]:
import pandas as pd
import stanza
from stanza.pipeline.processor import register_processor, Processor

ModuleNotFoundError: No module named 'stanza'

In [2]:
# Reload the intermediate result that was stored with to_pickle after shuffling.
# NOTE: the recorded run of this cell failed with "No module named 'stanza'" - presumably
# because the pickled frame contains Stanza documents (column 'doc'), so stanza has to be
# importable in the kernel that loads the file.
# Security: unpickling can execute arbitrary code; only load pickle files you created yourself.
df_shuffled = pd.read_pickle('intermediate_store/preprocessed_AMT_ads_V1.2.pickle')


ModuleNotFoundError: No module named 'stanza'

# Remove ITEM already annotated by APPEN

In [4]:
# Load the reviewed annotation data.
# JSON is read as UTF-8 explicitly; without an encoding, open() falls back to the
# platform's locale encoding and may fail on (or mis-decode) non-ASCII characters.
with open('reviewed_data/initial_1300.json', encoding='utf-8') as json_file:
    rewiewed_json = json.load(json_file)

In [5]:
# Collect the dream_id of every listing that has already been reviewed
annotated_dream_ids = [entry['data']['dream_id'] for entry in rewiewed_json]

In [6]:
df_shuffled.shape

(11674, 7)

In [7]:
# Drop every listing whose dream_id is among the already annotated ones
already_annotated = df_shuffled['dream_id'].isin(annotated_dream_ids)
df_shuffled = df_shuffled.loc[~already_annotated]
df_shuffled.shape

(10661, 7)

In [8]:
df_shuffled.clean_description

1        ONLY DOMESTIC BIG BUD XXL Very GOOD Quality Ca...
2        Durgamata is a precious strain &#44 it s has a...
3        Goodfellers is back. 4500 sales on AB always p...
4        This listing is for. Pregabalin 300 mg x 56 Ta...
5        Direct from US pharmacy. Real Adderall - not P...
                               ...                        
11669    This is a custom listing for previously approv...
11670    New batch of ketamine shards this time. Pure c...
11671    14g TOTAL. 7g HOMEGROWN BIG BUDDHA BLUE CHEESE...
11672    This is for 20 OXYCONTIN 40 mg pills just like...
11673    VitaminClub proudly offers you the highest and...
Name: clean_description, Length: 10661, dtype: object

# Separate item listings according to length into batches

In [9]:
def categorize_textLength(text):
    """Map a text to a length category based on its number of characters.

    Category k (1..9) means the length is below the k-th bound of
    100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000 and not below the
    previous one. Texts of 3000 or more characters get category 10;
    previously the function fell through and returned None for them.

    Args:
        text: any object supporting ``len``.

    Returns:
        int: the category, 1 (shortest) to 10 (longest).
    """
    upper_bounds = (100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000)
    text_length = len(text)
    for category, bound in enumerate(upper_bounds, start=1):
        if text_length < bound:
            return category
    # Longer than every bound
    return len(upper_bounds) + 1
    
# Length category of the cleaned description, used below to build the batches
df_shuffled["textLengthCat"] = df_shuffled["clean_description"].apply(categorize_textLength)
    
    

In [11]:
# Listing name with comma, single quote and double quote written as &#44, &#39 and &#34
df_shuffled['clean_name'] = (
    df_shuffled['name']
    .str.replace(',', '&#44')
    .str.replace('\'', '&#39')
    .str.replace('"', '&#34')
)

In [17]:
# Label combining id and cleaned name, in the form "<dream_id> - <clean_name>"
df_shuffled["dream_id_name"] = df_shuffled["dream_id"].astype(str).str.cat(df_shuffled["clean_name"], sep=" - ")

In [18]:
def get_doc_length(doc):
    """Return the summed length of all items in ``doc.to_dict()``.

    Presumably each item is the list of tokens of one sentence, which would
    make the result the token count of the document - verify against Stanza.
    """
    return sum(len(sent) for sent in doc.to_dict())
# Allow pandas to display as many rows as the frame has
pd.set_option('display.max_rows', df_shuffled.shape[0])
# Document length of every listing, as computed by get_doc_length
df_shuffled['doc_len'] = df_shuffled['doc'].apply(get_doc_length)
#TODO remove super long ones in the future.
# Phew, that was close

In [13]:
# Short listings only (length categories 1-5); the current batch is then cut out by row position
df_shuffled = df_shuffled.loc[df_shuffled['textLengthCat'] <= 5].copy()
df_shuffled = df_shuffled.iloc[1600:1900].copy()
# Row ranges of the short batches:
#   first batch  :100
#   second batch 200:500
#   third batch  500:1100
#   fourth batch 1100:1600
#   fifth batch  1600:1900
# NOTE(review): rows 100:200 are not covered by any range listed above - confirm.


# Long batches use this selection instead:
#df_shuffled=df_shuffled[(df_shuffled['textLengthCat'] > 5 ) & (df_shuffled['doc_len'] <= 509)].copy()
#df_shuffled=df_shuffled[500:700].copy()
# Row ranges of the long batches:
#   first batch  :100
#   second batch 100:300
#   third batch  300:500
#   fourth batch 500:700

df_shuffled.shape

(300, 11)

In [14]:
# Write label and cleaned description of the short listings to the AMT batch file.
# All non-numeric fields are quoted with single quotes.
df_shuffled.loc[df_shuffled['textLengthCat'] <= 5, ['dream_id_name', 'clean_description']].to_csv(
    "final_data/Batch_1600_1900_AMT.csv",
    sep = ',',
    quotechar='\'',
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)
# Export used for the long batches:
#df_shuffled[['dream_id_name',  'clean_description']].to_csv("final_data/LongBatch_500_700_AMT.csv",
#                                                            sep = ',', quotechar='\'', index=False, quoting=csv.QUOTE_NONNUMERIC)

In [15]:
df_shuffled.shape

(300, 11)