# Cleaner

In [None]:
import pandas as pd
import os
import time

In [None]:
def get_day(day):
    """
    This function extracts the leading day count from freshness strings
    such as '3 days' and returns it as a numeric Series.
    The value 'today' is mapped to zero.
    """
    # keep only the first whitespace-separated token, e.g. '3 days' -> '3'
    day = day.str.split(' ').str[0]
    day = day.replace('today', '0')
    # the conversion result has to be returned; it is not applied in place
    return pd.to_numeric(day)

def to_dt(col):
    """
    This function converts values given in seconds since the epoch to
    pandas datetimes. Values that cannot be converted are coerced to NaT.
    """
    options = {'unit': 's', 'errors': 'coerce'}
    return pd.to_datetime(col, **options)

In [None]:
def date_to_num(col):
    """
    This function returns a numeric value (a number of days) for valid
    date strings of the form '<number> <unit>', e.g. '3 day' or '2 months'
    Supported units are year, month, day and hour (singular or plural)
    If the date is today, a zero-value is returned
    Otherwise (non-string input, unknown unit, malformed string) a Null
    value is returned
    """
    if not isinstance(col, str):
        return None

    if 'today' in col:
        return 0

    # length of every supported unit expressed in days
    trans_dict = {
        'year'  : 365.25,
        'month' : 365.25/12,
        'day'   : 1,
        'hour'  : 1/24
    }

    parts = col.split()
    if len(parts) < 2:
        return None

    try:
        value = float(parts[0])
    except ValueError:
        # first token is not a number, e.g. 'le than'
        return None

    unit = parts[1]
    factor = trans_dict.get(unit)
    if factor is None and unit.endswith('s'):
        # accept plural forms such as 'days'
        factor = trans_dict.get(unit[:-1])
    if factor is None:
        return None

    return value * factor

## Feedback information
### Dropping duplicate rows

In [None]:
# Project root: the parent of the current working directory.
# os.path.dirname is used instead of splitting on '/' so the path separator
# of the running platform is respected.
MAINDIR = os.path.dirname(os.getcwd())

In [None]:
# Input (parsed) and output (final) locations of the feedback table.
data_file = os.path.join(MAINDIR, 'data/parsed/silkroad2', 'feedbacks.pickle')
out_file  = os.path.join(MAINDIR, 'data/final/silkroad2', 'feedbacks.pickle')

df = pd.read_pickle(data_file)
# Memory footprint of the raw table, reported again after cleaning.
mem = df.memory_usage(deep = True).sum()

# Deduplicate once and reuse the result for the report
# (rows before, rows after, rows removed).
n_before = len(df)
df = df.drop_duplicates()
print(n_before, len(df), n_before - len(df))

In [None]:
# freshness: leading day count of the freshness string, as a number
df['freshness'] = pd.to_numeric(get_day(df.freshness))

# rating: first character of the rating string, as a number
# rtime : stime minus the freshness expressed in seconds (86400 s per day)
df = df.assign(
    rating = pd.to_numeric(df.rating.str.get(0)),
    rtime  = df.stime - df.freshness * 86400,
)

# rtime_dt: rtime converted to a datetime
df = df.assign(rtime_dt = to_dt(df.rtime))

In [None]:
# Report the memory footprint before and after casting to compact dtypes.
print('memory usage before cleaning                    ', 
      mem)
print('memory usage before memory usage optimalization ', 
      df.memory_usage(deep = True).sum())

# NOTE(review): float32 has a 24-bit significand; if stime/rtime hold epoch
# seconds of recent dates the cast rounds them coarsely — confirm this loss
# of precision is acceptable (rtime_dt is computed before the cast).
column_types = {
    'name'     : 'category',
    'stime'    : 'float32',
    'rating'   : pd.Int16Dtype(),
    'feedback' : 'object',
    'freshness': 'float32',
    'rtime'    : 'float32',
    # an explicit unit is required: pandas >= 2.0 rejects unit-less 'datetime64'
    'rtime_dt' : 'datetime64[ns]'
}

df = df.astype(column_types)

print('memory usage after memory usage optimalization  ', 
      df.memory_usage(deep = True).sum())

### Dropping Duplicates

In scraped data, the same data is often scraped multiple times. To ensure that feedbacks are not included twice, duplicates will be dropped. Feedbacks can be uniquely identified by: 
- `name`
- `rating`
- `feedback`
- `rtime_dt`

In [None]:
# drop duplicate feedbacks from table 
subset = ['name', 'rating', 'feedback', 'rtime_dt']

# deduplicate once and reuse the result for both the report and the table
n_before = len(df)
deduplicated = df.drop_duplicates(subset = subset)

# the concatenated template is parenthesised so that .format() fills all
# three placeholders; a method call binds tighter than '+'
print(("before dropping duplicates: {0}\nafter dropping duplicates:"
       + "  {1}\n\n{2} duplicate cases were dropped")
      .format(n_before,
              len(deduplicated),
              n_before - len(deduplicated)))

df = deduplicated
# reset_index() without drop=True keeps the previous index as an 'index' column
df = df.reset_index()

In [None]:
# write the cleaned feedback table to out_file
df.to_pickle(out_file)

## Vendor information

In [None]:
# Input (parsed) and output (final) locations of the vendor table.
data_file = os.path.join(MAINDIR, 'data/parsed/silkroad2', 'vendors.pickle')
out_file = os.path.join(MAINDIR, 'data/final/silkroad2', 'vendors.pickle')

# Load the parsed vendors and record the raw memory footprint for later reporting.
df = pd.read_pickle(data_file)
mem = df.memory_usage(deep=True).sum()

In [None]:
# Step 1: strip "about ", every 's' and " ago" from the relative time strings.
df = df.assign(**{
    name: df[name].str.replace("about ", "").str.replace('s', '').str.replace(' ago', '')
    for name in ('ctime', 'otime')
})

# Step 2: turn the cleaned strings into a number of days.
df = df.assign(
    ctime_num = df['ctime'].apply(date_to_num),
    otime_num = df['otime'].apply(date_to_num),
)

# Step 3: derive datetimes (stime minus the offset in seconds) and a numeric
# score in which 'NEW VENDOR' counts as 0.
df = df.assign(
    otime_dt = to_dt(df.stime - df.otime_num * 86400),
    ctime_dt = to_dt(df.stime - df.ctime_num * 86400),
    stime_dt = to_dt(df.stime),
    score    = df.score.replace('NEW VENDOR', 0).apply(pd.to_numeric),
)

# Step 4: scores above 100 are masked (set to missing).
df = df.assign(score = df.score.mask(df.score > 100))

In [None]:
# drop duplicate vendors from table 
# flag: ctime_dt formatted as day-month-year, used as part of the duplicate key
df['flag'] = df.ctime_dt.dt.strftime('%d%m%Y').astype('category')
subset = ['name', 'flag', 'location', 'area']

# deduplicate once and reuse the result for both the report and the table
n_before = len(df)
deduplicated = df.drop_duplicates(subset = subset)

# the concatenated template is parenthesised so that .format() fills all
# three placeholders; a method call binds tighter than '+'
print(("before dropping duplicates: {0}\nafter dropping duplicates:"
       + "   {1}\n\n{2} duplicate cases were dropped")
      .format(n_before,
              len(deduplicated),
              n_before - len(deduplicated)))

df = deduplicated

In [None]:
# Final column selection and order for the vendor table.
columns = [
    'name', 'stime', 'stime_dt', 'score',
    'ctime', 'ctime_num', 'ctime_dt',
    'otime', 'otime_num', 'otime_dt',
    'location', 'area',
]

# Keep only these columns, then move the current index into a regular column.
df = df.reindex(columns=columns).reset_index()

In [None]:
# Report the memory footprint before and after casting to compact dtypes.
print('memory usage before cleaning                    ', mem)
print('memory usage before memory usage optimalization ', df.memory_usage(deep = True).sum())

# datetime columns carry an explicit unit: pandas >= 2.0 rejects the
# unit-less 'datetime64' in astype
column_types = {
    'name'     : 'category',
    'stime'    : 'float32',
    'stime_dt' : 'datetime64[ns]',
    'score'    : pd.Int16Dtype(),
    'ctime'    : 'category', 
    'ctime_num': 'float32', 
    'ctime_dt' : 'datetime64[ns]', 
    'otime'    : 'category', 
    'otime_num': 'float32', 
    'otime_dt' : 'datetime64[ns]', 
    'location' : 'category', 
    'area'     : 'category'
}

df = df.astype(column_types)

print('memory usage after memory usage optimalization   ', df.memory_usage(deep = True).sum())

In [None]:
# write the cleaned vendor table to out_file
df.to_pickle(out_file)

## Item Information

In [None]:
# Input (parsed) and output (final) locations of the item table.
data_file = os.path.join(MAINDIR, 'data/parsed/silkroad2', 'items.pickle')
out_file = os.path.join(MAINDIR, 'data/final/silkroad2', 'items.pickle')

# Load the parsed items.
df = pd.read_pickle(data_file)

In [None]:
def clean_freshness(col):
    """
    This function keeps freshness values that are strings shorter than
    10 characters containing 'day'; every other value becomes missing.
    When `col` is a pandas Series its index is preserved, so the result
    aligns row-by-row when it is assigned back to the source DataFrame.
    """
    new_col = [
        i if isinstance(i, str) and 'day' in i and len(i) < 10 else None
        for i in col
    ]

    # plain iterables (e.g. lists) have no pandas index to carry over
    index = col.index if isinstance(col, pd.Series) else None
    return pd.Series(new_col, index=index)

In [None]:
# Step 1: clean the raw string columns.
# NOTE(review): whether '\\n' is interpreted as a regular expression depends
# on the pandas default for `regex` — confirm which characters are meant to
# be removed from vendor.
df = df.assign(
    vendor = df.vendor.str.replace('\\n', ''),
    freshness = (
        clean_freshness(df.freshness)
        .str.replace('s', '')
        .apply(date_to_num)
    ),
    rating = df.rating.str.get(0),
    price = df.price.str[1:],
    stime_str = df.stime_dt,
    category = df.category,
)

# Step 2: rtime is stime minus the freshness expressed in seconds.
df = df.assign(rtime = df.stime - df.freshness *86400)

# Step 3: datetimes normalised to midnight.
df = df.assign(
    rtime_dt = to_dt(df.rtime).dt.normalize(),
    stime_dt = to_dt(df.stime).dt.normalize(),
)

In [None]:
# Compact dtypes for the item table.
# datetime columns carry an explicit unit: pandas >= 2.0 rejects the
# unit-less 'datetime64' in astype
column_types = {
    'vendor'   : 'category',
    'stime'    : 'float32',
    'stime_dt' : 'datetime64[ns]',
    'stime_str': 'category',
    'rating'   : 'float16',
    'feedback' : 'object',
    'item'     : 'object',
    'category' : 'category',
    'price'    : 'float32',
    'freshness': 'float32',
    'rtime'    : 'float32',
    'rtime_dt' : 'datetime64[ns]',
    'loc'      : 'category',
    'area'     : 'category'
}

df = df.astype(column_types)

# downcast integer variables
# (done in a second step, after the float cast above, using the nullable
# Int16 dtype so missing values are kept)
df = df\
    .assign(
        freshness = pd.Series(df.freshness, dtype=pd.Int16Dtype()),
        rating = pd.Series(df.rating, dtype=pd.Int16Dtype()))

print('memory usage after memory usage optimalization', df.memory_usage(deep = True).sum())

In [None]:
# drop duplicate feedbacks from table 
subset = ['vendor', 'rtime_dt', 'item', 'loc', 'feedback', 'rating']

# deduplicate once and reuse the result for both the report and the table
n_before = len(df)
deduplicated = df.drop_duplicates(subset = subset)

# the concatenated template is parenthesised so that .format() fills all
# three placeholders; a method call binds tighter than '+'
print(("before dropping duplicates: {0}\nafter dropping duplicates:"
       + "  {1}\n\n{2} duplicate cases were dropped")
      .format(n_before,
              len(deduplicated),
              n_before - len(deduplicated)))

df = deduplicated
# expose 'loc' under the name 'location'
df['location'] = df['loc']

In [None]:
# Final column selection and order for the item table.
columns = ['vendor', 'stime', 'stime_dt', 'stime_str', 
           'rating', 'feedback', 'item', 'category', 
           'price', 'freshness', 'rtime', 'rtime_dt', 
           'location', 'area']

df = df.reindex(columns, axis=1)
# drop incomplete rows first, then renumber, so the saved index has no gaps
df = df.dropna()
df = df.reset_index(drop = True)

In [None]:
# write the cleaned item table to out_file
df.to_pickle(out_file)

## Categories

In [None]:
# Input (parsed) and output (final) locations of the category table.
data_file = os.path.join(MAINDIR, 'data/parsed/silkroad2', 'categories.pickle')
out_file  = os.path.join(MAINDIR, 'data/final/silkroad2', 'categories.pickle')

df = pd.read_pickle(data_file)
# Memory footprint of the raw table.
mem = df.memory_usage(deep = True).sum()

# Deduplicate once and reuse the result for the report
# (rows before, rows after, rows removed).
n_before = len(df)
df = df.drop_duplicates()
print(n_before, len(df), n_before - len(df))

In [None]:
# clean variables
# NOTE(review): area used to be filled from the location column, presumably a
# copy-paste slip. The table's own 'area' column is used when it exists;
# otherwise the previous behaviour (copying location) is kept. Confirm
# against the parsed data.
area_source = df['area'] if 'area' in df.columns else df['location']

df = df\
    .assign(title = df.title.str.strip(),
            vendor = df.vendor.str.strip(),
            location = df.location.str.strip(),
            area = area_source.str.strip(),
            # drop the first character of the price string before converting;
            # values that cannot be parsed become missing
            price = pd.to_numeric(
                    df.price.str.strip()\
                    .str[1:],
                errors = 'coerce'),
            category = df.category.str.strip())

In [None]:
# memory optimization: free text stays object, repeated labels become
# categoricals, price stays a 64-bit float
column_types = dict(
    title='object',
    vendor='category',
    location='category',
    area='category',
    price='float64',
    category='category',
)

df = df.astype(column_types)

In [None]:
# encode infinite values to missing and restructure. 
# builtin floats are used because numpy is not imported in this notebook
df['price']= df['price'].replace([float('inf'), float('-inf')], float('nan'))
df = df.reset_index(drop = True)

# save dataframe
df.to_pickle(out_file)