# Preprocessing of the Drug Dataset from Dream Market

In [1]:
import difflib
import pickle
import pandas as pd
import re
import langdetect as ld
import stanza
from stanza.pipeline.processor import register_processor, Processor

import time
from faker import Faker
import numpy as np
from pathlib import Path
import phonenumbers
import random
import csv

import json


In [2]:
# Load the pickled frame that the rest of the notebook works on.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df_shuffled = pd.read_pickle('intermediate_store/preprocessed_AMT_ads_V1.2.pickle')


# Remove items already annotated by APPEN

In [3]:
# Load the reviewed annotation data.
# The encoding is named explicitly so that reading the JSON file does not
# depend on the platform's locale default.
with open('reviewed_data/initial_1300.json', encoding='utf-8') as json_file:
    rewiewed_json = json.load(json_file)

In [4]:
# Collect the dream_id stored under 'data' in every reviewed record.
annotated_dream_ids = [entry['data']['dream_id'] for entry in rewiewed_json]

In [5]:
df_shuffled.shape

(11674, 7)

In [6]:
# Keep only the rows whose dream_id is not among the already annotated ids.
# .copy() turns the filtered result into an independent frame, so the column
# assignments in the following cells do not write to a slice of the original
# frame (which pandas reports as SettingWithCopyWarning).
df_shuffled = df_shuffled[~df_shuffled['dream_id'].isin(annotated_dream_ids)].copy()
df_shuffled.shape

(10661, 7)

# Remove HTML encoding

In [7]:
# Replace the numeric HTML character references found in the descriptions
# with the characters they encode (comma, apostrophe, double quote).
# NOTE(review): the references are matched without a trailing ';' - confirm
# that this is how they appear in the data.
for html_ref, plain_char in (('&#44', ','), ('&#39', '\''), ('&#34', '"')):
    # regex=False forces literal matching regardless of the pandas version's
    # default for the `regex` parameter.
    df_shuffled['clean_description'] = df_shuffled['clean_description'].str.replace(
        html_ref, plain_char, regex=False
    )

In [8]:
df_shuffled.clean_description

1        ONLY DOMESTIC BIG BUD XXL Very GOOD Quality Ca...
2        Durgamata is a precious strain , it s has a un...
3        Goodfellers is back. 4500 sales on AB always p...
4        This listing is for. Pregabalin 300 mg x 56 Ta...
5        Direct from US pharmacy. Real Adderall - not P...
                               ...                        
11669    This is a custom listing for previously approv...
11670    New batch of ketamine shards this time. Pure c...
11671    14g TOTAL. 7g HOMEGROWN BIG BUDDHA BLUE CHEESE...
11672    This is for 20 OXYCONTIN 40 mg pills just like...
11673    VitaminClub proudly offers you the highest and...
Name: clean_description, Length: 10661, dtype: object

# Separate item listings according to length into batches

In [9]:
def categorize_textLength(text):
    """Return the length category (1-9) for ``text``.

    The category is the 1-based position of the first upper bound that
    ``len(text)`` stays strictly below:
    1: <100, 2: <250, 3: <500, 4: <750, 5: <1000,
    6: <1500, 7: <2000, 8: <2500, 9: <3000.

    Returns ``None`` when ``len(text)`` is 3000 or more.
    """
    upper_bounds = (100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000)
    n_chars = len(text)
    for category, bound in enumerate(upper_bounds, start=1):
        if n_chars < bound:
            return category
    # No bound matched: texts of 3000+ characters get no category.
    return None
    
# Store the length category of every description in its own column.
df_shuffled["textLengthCat"] = df_shuffled["clean_description"].apply(categorize_textLength)
    
    

In [10]:
def get_doc_length(doc):
    """Return the summed ``len()`` of every entry in ``doc.to_dict()``.

    ``doc`` must provide a ``to_dict()`` method that returns an iterable of
    sized items; an empty iterable gives 0.
    NOTE(review): presumably ``doc`` is a stanza Document whose ``to_dict()``
    yields one token list per sentence, which would make this a token count -
    confirm.
    """
    return sum(len(sentence) for sentence in doc.to_dict())
# Ad-hoc inspection helpers (disabled):
#testdoc = df_shuffled.iloc[1]
#get_doc_length(testdoc.doc)
#df_shuffled.doc.apply(lambda doc: get_doc_length(doc)).sort_values(ascending=False)

# Let pandas display as many rows as the frame has, so frames up to that size
# are shown in full.
pd.set_option('display.max_rows', len(df_shuffled))

# Store the length of every entry of the `doc` column next to the listing.
df_shuffled['doc_len'] = df_shuffled['doc'].apply(get_doc_length)
# TODO: remove the super long ones in the future.
# Phew, that was close.

In [11]:
# Short listings: length category 5 or lower, skipping the first 1900 rows
# (by position).
# Positional ranges noted for the earlier short batches:
#   :100       first batch
#   200:500    second batch
#   500:1100   third batch
#   1100:1600  fourth batch
#   1600:1900  fifth batch
#   --> 1900:end as DAPT
# NOTE(review): 100:200 is not listed above - confirm whether that is a typo.
df_short = (
    df_shuffled.loc[df_shuffled['textLengthCat'].le(5)]
    .iloc[1900:]
    .copy()
)

# Long listings: length category above 5 and a doc_len of at most 509,
# skipping the first 700 rows (by position).
# Positional ranges noted for the earlier long batches:
#   :100     first batch
#   100:300  second batch
#   300:500  third batch
#   500:700  fourth batch
#   --> 700:end as DAPT
df_long = (
    df_shuffled.loc[df_shuffled['textLengthCat'].gt(5) & df_shuffled['doc_len'].le(509)]
    .iloc[700:]
    .copy()
)

# Stack the remaining short rows on top of the remaining long rows.
dapt = pd.concat([df_short, df_long])
dapt.shape

(8016, 9)

In [12]:
# Write the descriptions to a tab-separated text file without index, header
# or quoting.
dapt['clean_description'].to_csv(
    "final_data/DAPT_DreamMarket1.0.txt",
    sep='\t',
    quotechar='\'',
    quoting=csv.QUOTE_NONE,
    index=False,
    header=False,
)

In [13]:
df_shuffled.shape

(10661, 9)