In [4]:
import os
import time

import numpy as np
import pandas as pd

In [5]:
# Root directory of the project data.  The location is machine-specific, so it
# can be overridden with the SILKROAD_MAIN_DIR environment variable; when the
# variable is not set the original hard-coded path is used.
MAIN_DIR = os.environ.get("SILKROAD_MAIN_DIR", "/Volumes/Extreme SSD")
# Directory that holds the data files, directly below MAIN_DIR.
DATA_DIR = os.path.join(MAIN_DIR, "data")

In [6]:
def get_day(day):
    """
    Convert a column of freshness strings into a numeric number of days.

    The first space-separated token of every entry is kept, the literal
    'today' is mapped to 0, and the tokens are then parsed as numbers.

    Parameters
    ----------
    day : pd.Series
        String column; the `.str` accessor is used on it.

    Returns
    -------
    pd.Series
        Numeric series with the same index as `day`.

    Raises
    ------
    ValueError
        If a leading token is neither 'today' nor parseable as a number.
    """
    leading = day.str.split(' ').str[0]
    leading = leading.replace('today', '0')
    # The parsed result used to be computed and thrown away, so callers got
    # strings back; return the numeric series instead.
    return pd.to_numeric(leading)

def to_dt(col):
    """
    Interpret `col` as timestamps counted in seconds and return datetimes.

    Values that cannot be converted are coerced to NaT instead of raising.
    """
    as_datetime = pd.to_datetime(col, errors = 'coerce', unit = 's')
    return as_datetime

In [7]:
def date_to_num(col):
    """
    Convert a relative-age string into a number of days.

    Parameters
    ----------
    col : object
        A single cell value.  Strings are expected to look like
        '<number> <unit>' where unit is year, month, day or hour (a trailing
        plural 's' is accepted); anything after the unit is ignored.  Any
        string containing 'today' counts as zero days.

    Returns
    -------
    float, int or None
        The age in days (0 for 'today'), or None when `col` is not a string
        or cannot be parsed as '<number> <unit>' with a known unit.
    """
    # days per unit; a month is one twelfth of a 365.25-day year
    trans_dict = {
        'year'  : 365.25,
        'month' : 365.25/12,
        'day'   : 1,
        'hour'  : 1/24
    }

    if not isinstance(col, str):
        return None
    if 'today' in col:
        return 0

    parts = col.split()
    # a lone token has no unit; this used to raise IndexError
    if len(parts) < 2:
        return None

    try:
        value = float(parts[0])
    except ValueError:
        return None

    unit = parts[1]
    if unit not in trans_dict and unit.endswith('s'):
        unit = unit[:-1]

    # unknown units used to raise ValueError; the documented contract is a
    # null value for anything that is not a valid date string
    factor = trans_dict.get(unit)
    if factor is None:
        return None

    return value * factor

## Feedback information
#### Dropping duplicate rows

In [3]:
data_file = os.path.join(MAIN_DIR, 'data', 'parsed', 'silkroad2', 'feedbacks.pickle')
out_file  = os.path.join(MAIN_DIR, 'data', 'final', 'silkroad2', 'feedbacks.pickle')

# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
df = pd.read_pickle(data_file)
# memory footprint of the raw frame, printed again after cleaning for comparison
mem = df.memory_usage(deep = True).sum()

# Drop rows that are exact duplicates.  De-duplicate once and reuse the result
# instead of repeating the full scan for every number that is printed.
n_raw = len(df)
df = df.drop_duplicates()
print(n_raw, len(df), n_raw - len(df))

2275492 2152745 122747


In [5]:
# numeric freshness parsed from its text form
freshness_days = pd.to_numeric(get_day(df.freshness))

df = (
    df
    .assign(
        freshness = freshness_days,
        # first character of the rating text, as a number
        rating    = pd.to_numeric(df.rating.str.get(0)),
        # stime minus freshness scaled by 86400 (seconds per day)
        rtime     = df.stime - freshness_days * 86400,
    )
    .assign(rtime_dt = lambda df_copy: to_dt(df_copy.rtime))
)

In [6]:
print('memory usage before cleaning                    ', mem)
print('memory usage before memory usage optimalization ', df.memory_usage(deep = True).sum())

# Target dtype per column.
# NOTE(review): float32 carries about 7 significant digits; if `stime`/`rtime`
# hold epoch seconds (~1.4e9) they are rounded to roughly the nearest two
# minutes by this cast.  `rtime_dt` is computed before the cast and keeps the
# precise value -- confirm the loss in the float columns is acceptable.
column_types = {
    'name'     : 'category',
    'stime'    : 'float32',
    'rating'   : pd.Int16Dtype(),
    'feedback' : 'object',
    'freshness': 'float32',
    'rtime'    : 'float32',
    # an explicit unit is required: pandas >= 2.0 refuses the unit-less
    # 'datetime64' in astype, and older versions treat it as nanoseconds anyway
    'rtime_dt' : 'datetime64[ns]'
}

df = df.astype(column_types)

print('memory usage after memory usage optimalization  ', df.memory_usage(deep = True).sum())

memory usage before cleaning                     890426282
memory usage before memory usage optimalization  639579799
memory usage after memory usage optimalization   460772248


#### Dropping duplicate feedbacks

In scraped data, the same record is often captured multiple times. To ensure that feedbacks are not counted twice, duplicates are dropped. A feedback is uniquely identified by the combination of:
- `name`
- `rating`
- `feedback`
- `rtime_dt`

In [7]:
# drop duplicate feedbacks from table
# two rows are the same feedback when all of these columns match
subset = ['name', 'rating', 'feedback', 'rtime_dt']

# de-duplicate once and reuse the result for both the report and the assignment
n_before = len(df)
df = df.drop_duplicates(subset = subset)
print("before dropping duplicates: {0}\nafter dropping duplicates:  {1}\n\n{2} duplicate cases were dropped".format(n_before, len(df), n_before - len(df)))

# NOTE(review): reset_index() without drop=True keeps the old index as an extra
# column in the saved frame -- confirm downstream code relies on that.
df = df.reset_index()

before dropping duplicates: 2152745
after dropping duplicates:  2145909

6836 duplicate cases were dropped


In [8]:
# to_pickle does not create missing folders, so make sure the target
# directory exists before writing the cleaned frame
os.makedirs(os.path.dirname(out_file), exist_ok = True)
df.to_pickle(out_file)

## Vendor information


In [None]:
# input comes from the `parsed` folder, output goes to the `final` folder
data_file, out_file = (
    os.path.join(MAIN_DIR, 'data', stage, 'silkroad2', 'vendors.pickle')
    for stage in ('parsed', 'final')
)

df = pd.read_pickle(data_file)
# memory footprint of the raw frame, kept for the before/after comparison
mem = df.memory_usage(deep = True).sum()

In [None]:
# Turn the relative-age text in `ctime`/`otime` into numbers and timestamps,
# and clean the vendor score.
df = (
    df
    .assign(
        # remove the "about " prefix, every letter 's' (which makes plural
        # units singular) and the " ago" suffix; regex=False pins literal
        # matching, whose default differs between pandas versions
        ctime     = df.ctime.str.replace("about ", "", regex = False)
                            .str.replace('s', '', regex = False)
                            .str.replace(' ago', '', regex = False),
        otime     = df.otime.str.replace("about ", "", regex = False)
                            .str.replace('s', '', regex = False)
                            .str.replace(' ago', '', regex = False))
    .assign(
        # numeric value produced by date_to_num for each cleaned string
        ctime_num = lambda df_copy: df_copy['ctime'].apply(date_to_num),
        otime_num = lambda df_copy: df_copy['otime'].apply(date_to_num))
    .assign(
        # stime minus the numeric age scaled by 86400 (seconds per day)
        otime_dt  = lambda df_copy: to_dt(df_copy.stime - df_copy.otime_num * 86400),
        ctime_dt  = lambda df_copy: to_dt(df_copy.stime - df_copy.ctime_num * 86400),
        stime_dt  = to_dt(df.stime),
        # 'NEW VENDOR' becomes a score of 0; the column is parsed in one
        # vectorised call rather than once per row
        score     = pd.to_numeric(df.score.replace('NEW VENDOR', 0)))
    .assign(
        # scores above 100 are replaced by a missing value
        score     = lambda df_copy: df_copy.score.mask(df_copy.score > 100))
)

In [12]:
# drop duplicate vendor records from table
# flag: ctime_dt formatted as ddmmYYYY, so rows are compared on the calendar day
df = df.assign(flag = df.ctime_dt.dt.strftime('%d%m%Y').astype('category'))
subset = ['name', 'flag', 'location', 'area']

# de-duplicate once and reuse the result for both the report and the assignment
n_before = len(df)
df = df.drop_duplicates(subset = subset)
print("before dropping duplicates: {0}\nafter dropping duplicates:   {1}\n\n{2} duplicate cases were dropped".format(n_before, len(df), n_before - len(df)))

before dropping duplicates: 12242
after dropping duplicates:  9377

2865 duplicate cases were dropped


In [13]:
# final column selection and order; columns not listed here are dropped
columns = [
    'name', 'stime', 'stime_dt', 'score',
    'ctime', 'ctime_num', 'ctime_dt',
    'otime', 'otime_num', 'otime_dt',
    'location', 'area',
]

# reset_index() without drop=True also moves the old index into a column
df = df.reindex(columns = columns).reset_index()

In [14]:
print('memory usage before cleaning                    ', mem)
print('memory usage before memory usage optimalization ', df.memory_usage(deep = True).sum())

# Target dtype per column.  The datetime columns name their unit explicitly:
# pandas >= 2.0 refuses the unit-less 'datetime64' in astype, and older
# versions treat it as nanoseconds anyway.
# NOTE(review): float32 carries about 7 significant digits; if `stime` holds
# epoch seconds (~1.4e9) this cast rounds it to roughly the nearest two
# minutes -- confirm that is acceptable.
column_types = {
    'name'     : 'category',
    'stime'    : 'float32',
    'stime_dt' : 'datetime64[ns]',
    'score'    : pd.Int16Dtype(),
    'ctime'    : 'category', 
    'ctime_num': 'float32', 
    'ctime_dt' : 'datetime64[ns]', 
    'otime'    : 'category', 
    'otime_num': 'float32', 
    'otime_dt' : 'datetime64[ns]', 
    'location' : 'category', 
    'area'     : 'category'
}

df = df.astype(column_types)

print('memory usage after memory usage optimalization   ', df.memory_usage(deep = True).sum())

memory usage before cleaning                     5708163
memory usage before memory usage optimalization  3695920
memory usage after memory usage optimalization    640278


In [15]:
# to_pickle does not create missing folders, so make sure the target
# directory exists before writing the cleaned frame
os.makedirs(os.path.dirname(out_file), exist_ok = True)
df.to_pickle(out_file)

## Item Information

In [9]:
# input comes from the `parsed` folder, output goes to the `final` folder
data_file, out_file = (
    os.path.join(MAIN_DIR, 'data', stage, 'silkroad2', 'items.pickle')
    for stage in ('parsed', 'final')
)

df = pd.read_pickle(data_file)

In [10]:
def clean_freshness(col):
    #assert that freshness is not missing
    new_col = []
    for i in col:
        if not isinstance(i, (float, int)) and 'day' in i and len(i) < 10:
            new_col.append(i)
        else:
            new_col.append(None)
    
    return pd.Series(new_col)

# mem = df.memory_usage(deep = True).sum()

In [11]:
df = (
    df
    .assign(
        # '\\n' is the regular expression for a newline character; regex=True
        # is spelled out because the default of `regex` differs between
        # pandas versions and a literal match would look for backslash + 'n'
        vendor = df.vendor.str.replace('\\n', '', regex = True),
        # keep short day-based strings, drop every letter 's' (plural -> singular)
        # and convert the text with date_to_num
        freshness = clean_freshness(df.freshness)
            .str.replace('s', '', regex = False)
            .apply(date_to_num),
        # first character of the rating text
        rating = df.rating.str.get(0),
        # everything after the first character of the price text
        price = df.price.str[1:],
        # keep the incoming stime_dt values; stime_dt itself is recomputed below
        stime_str = df.stime_dt,
        category = df.category)
    .assign(
        # stime minus freshness scaled by 86400 (seconds per day)
        rtime = lambda df_copy: df_copy.stime - df_copy.freshness * 86400)
    .assign(
        # datetimes truncated to midnight
        rtime_dt = lambda df_copy: to_dt(df_copy.rtime).dt.normalize(),
        stime_dt = lambda df_copy: to_dt(df_copy.stime).dt.normalize())
)

In [14]:
# Target dtype per column.  The datetime columns name their unit explicitly:
# pandas >= 2.0 refuses the unit-less 'datetime64' in astype, and older
# versions treat it as nanoseconds anyway.
# NOTE(review): float32 carries about 7 significant digits; if `stime`/`rtime`
# hold epoch seconds (~1.4e9) this cast rounds them to roughly the nearest two
# minutes -- confirm that is acceptable.
column_types = {
    'vendor'   : 'category',
    'stime'    : 'float32',
    'stime_dt' : 'datetime64[ns]',
    'stime_str': 'category',
    'rating'   : 'float16',
    'feedback' : 'object',
    'item'     : 'object',
    'category' : 'category',
    'price'    : 'float32',
    'freshness': 'float32',
    'rtime'    : 'float32',
    'rtime_dt' : 'datetime64[ns]',
    'loc'      : 'category',
    'area'     : 'category'
}

df = df.astype(column_types)

# downcast integer variables
# (nullable Int16 so that missing values can stay missing)
df = df\
    .assign(
        freshness = pd.Series(df.freshness, dtype=pd.Int16Dtype()),
        rating = pd.Series(df.rating, dtype=pd.Int16Dtype()))

print('memory usage after memory usage optimalization', df.memory_usage(deep = True).sum())

memory usage after memory usage optimalization 4617430229


In [15]:
# drop duplicate feedbacks from table 
# two rows are the same feedback when all of these columns match
subset = ['vendor', 'rtime_dt', 'item', 'loc', 'feedback', 'rating']

# de-duplicate once and reuse the result; repeating the scan for every printed
# number is costly on a frame of this size
n_before = len(df)
df = df.drop_duplicates(subset = subset)
print("before dropping duplicates: {0}\nafter dropping duplicates:  {1}\n\n{2} duplicate cases were dropped".format(n_before, len(df), n_before - len(df)))

# expose `loc` under the name `location`
df['location'] = df['loc']

before dropping duplicates: 17592264
after dropping duplicates:  4181228

13411036 duplicate cases were dropped


In [16]:
# final column selection and order; columns not listed here are dropped
columns = [
    'vendor', 'stime', 'stime_dt', 'stime_str',
    'rating', 'feedback', 'item', 'category', 'price',
    'freshness', 'rtime', 'rtime_dt',
    'location', 'area',
]

# renumber the rows from zero and discard the old index
df = df.reindex(columns = columns).reset_index(drop = True)

In [21]:
# discard every row that has at least one missing value
df = df.dropna(axis = 0, how = 'any')
# preview five randomly chosen rows
df.sample(n = 5)

Unnamed: 0,vendor,stime,stime_dt,stime_str,rating,feedback,item,category,price,freshness,rtime,rtime_dt,location,area
2922952,DrWhite93,1411991000.0,2014-09-29,2014-09-30,5,"FAst shipping, great produckt fot it's price",10x 10mg Vardenafil Generic (Levitra),drugs-other-intoxicants-alcohol,0.154181,33,1409140000.0,2014-08-27,United States,United States
1042436,real pharmaceuticals,1402431000.0,2014-06-10,2014-05-24,5,very good thanks,"Sertraline (trade names Zoloft, Lustral) 50 mg",drugs-prescription,0.001004,2,1402258000.0,2014-06-08,United Kingdom,Worldwide
2058001,nzt48givesyouwings96,1408889000.0,2014-08-24,2014-08-27,5,AWESUMMMMMM++++++,▀▄▀ Pure Dextro 10mg x5 Inst Release,drugs-prescription-stimulants-amphetamine,0.758752,60,1403705000.0,2014-06-25,United States,United States
102267,Breeze The Gnome,1393159000.0,2014-02-23,2014-02-24,5,"Amazing stealth, good communication and prompt...",1 g DMT freebase from MHRB,drugs-psychedelics-dmt,0.206897,18,1391604000.0,2014-02-05,United States,Worldwide
505641,ValueMart,1402430000.0,2014-06-10,2014-04-12,1,Never even sent package. He told me this himse...,5 Grams of pure MDMA 84% Brown.,drugs-Ecstasy-mdma,0.649592,12,1401394000.0,2014-05-29,United States,United States


In [20]:
# to_pickle does not create missing folders, so make sure the target
# directory exists before writing the cleaned frame
os.makedirs(os.path.dirname(out_file), exist_ok = True)
df.to_pickle(out_file)

# Categories

In [55]:
data_file = os.path.join(MAIN_DIR, 'data', 'parsed', 'silkroad2', 'categories.pickle')
out_file  = os.path.join(MAIN_DIR, 'data', 'final', 'silkroad2', 'categories.pickle')

# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
df = pd.read_pickle(data_file)
# memory footprint of the raw frame
mem = df.memory_usage(deep = True).sum()

# Drop rows that are exact duplicates.  De-duplicate once and reuse the result
# instead of repeating the full scan for every number that is printed.
n_raw = len(df)
df = df.drop_duplicates()
print(n_raw, len(df), n_raw - len(df))

347877 325040 22837


In [57]:
# clean variables: trim surrounding whitespace and parse the price
df = df.assign(
    title    = df.title.str.strip(),
    vendor   = df.vendor.str.strip(),
    location = df.location.str.strip(),
    # `area` used to be filled with a copy of `location`, which overwrote the
    # real values.  Strip the frame's own `area` column when it has one; the
    # old behaviour is kept only as a fallback for frames without that column.
    area     = (df['area'] if 'area' in df.columns else df['location']).str.strip(),
    # drop the first character of the trimmed price text, then parse;
    # values that are not numbers become NaN
    price    = pd.to_numeric(df.price.str.strip().str[1:], errors = 'coerce'),
    category = df.category.str.strip(),
)

In [59]:
# memory optimization
# the repetitive text columns become categoricals ...
column_types = {
    name: 'category'
    for name in ('vendor', 'location', 'area', 'category')
}
# ... while the title stays free text and the price a 64-bit float
column_types.update(title = 'object', price = 'float64')

df = df.astype(column_types)

In [78]:
# preview five randomly chosen rows of the frame
# (no random_state is passed, so the rows shown change from run to run)
df.sample(5)

Unnamed: 0,title,vendor,location,area,price,category
106315,Dexedrine SR (Dextroamphetamine) 10mg Spansule...,Quixote,United States,United States,0.172319,drugs-prescription-stimulants-amphetamine-dexe...
97107,28g AK48 Indoor Organic Top Shelf Weed Dank!,KushDepot,United States,United States,0.60791,drugs-cannabis
305695,100g pure FU-144 with FREE worldwide shipping,drzheng,China,China,2.462049,drugs-cannabis-synthetic
36184,1 Milion Youtube High Quality Real Views!,profesorhouse,Belgium,Belgium,1.105088,money
162306,MDMA 7 GRAMS - PURE & UNCUT ROCKS,jerseycow,United Kingdom,United Kingdom,0.70087,drugs-ecstasy


In [77]:
# encode infinite values to missing and restructure. 
df['price'] = df['price'].replace([np.inf, -np.inf], np.nan)
# renumber the rows from zero and discard the old index
df = df.reset_index(drop = True)

# save dataframe; to_pickle does not create missing folders, so make sure the
# target directory exists first
os.makedirs(os.path.dirname(out_file), exist_ok = True)
df.to_pickle(out_file)