## Parser

This script parses files that are unpacked using `unpacker.ipynb`. 

**Reading in folder structure**
1. it defines a function(`get_directory_structure()`) to read in the folder structure as nested dictionaries
2. it reads the folder structure and creates a nested dictionary. 
3. it writes the nested dictionary to as a JSON-dump


**Parsing silkroad2**

*vendor information*
- defines a parser to extract information about the vendor
- interates over the folder structure and applies the ` parse_vendor_info()` to all vendor files
- concatinates data to a datafile

*feedback information*
- defines a parser to extract feedbacks from seller pages. 
- iterates over the folder structure and applies the `parse_vendor_feedbacks()` to all vendor files
- concatinates feedback-data to a datafile

*item information*
- defines a parser to extract feedbacks from item pages
- iterates over the folder structure and applies the parser to all item files
- iterates over the folder stucutre and apllies the `parse_item_information()` to all item files

In [18]:
import os
import numpy as np
import re
import pandas as pd
import time
import csv
from functools import reduce
import json
from bs4 import BeautifulSoup
import multiprocessing

In [19]:
MAIN_DIR = "/Volumes/Extreme SSD"
DATA_DIR = os.path.join(MAIN_DIR, "data", "unpacked")

In [20]:
DATA_DIR

'/Volumes/Extreme SSD/data/unpacked'

In [21]:
files = [folder for folder in os.listdir(DATA_DIR) if ".DS_" not in folder]

In [22]:
files

['abraxas',
 'agape',
 'agora',
 'alphabay',
 'cloudnine',
 'cryptomarket',
 'diabolus',
 'hydra',
 'nucleus',
 'outlawmarket',
 'silkroad2',
 'themarketplace']

In [23]:
def get_directory_structure(rootdir):
    """
    Creates a nested dictionary that represents the folder structure of rootdir
    """
    
    dir = {}
    rootdir = rootdir.rstrip(os.sep)
    start = rootdir.rfind(os.sep) + 1
    for path, dirs, files in os.walk(rootdir):
            folders = path[start:].split(os.sep)
            subdir = dict.fromkeys(files)
            parent = reduce(dict.get, folders[:-1], dir)
            parent[folders[-1]] = subdir
    return dir

def clean(dirs):
    return [d for d in dirs if ".DS_" not in d]

In [24]:
folder_structure = get_directory_structure(os.path.join(DATA_DIR))

In [25]:
file = os.path.join(MAIN_DIR, 'data', 'logs', 'folder_structure.json')
with open(file, "w") as f:
    json.dump(folder_structure, f)

## Silkroad2 Market
### vendor information

In [9]:
def parse_vendor_info(vendor):
    """
    creates a line that contains: 
        name     = name of vendor
        stime    = time of scrape
        stime_dt = date of scrape
        score    = vendor rating average (out of 100)
        ctime    = time from creation
        otime    = time last online
        loc      = location
        area     = area of operation
    
    Args:
        string of file path
    
    Returns: 
        list of data
    """
    
    # get information from file
    data = [vendor.split(os.sep)[-1].split('.')[0], 
            os.stat(vendor).st_birthtime,
            file.split(os.sep)[-3]]
    
    # opens and parses the html file
    with open(vendor, 'r') as f:
        soup = BeautifulSoup(f.read(), "lxml")
        containers = soup.find_all('span', attrs = {'class': 'container'})

        # create list with name of vendor and scape date
        # note that not in all instances vendor scores were present
        # therefore the data is parsed conditionally
        if "vendor score:" in containers[0].text:
            try:
                [data.append(item) for item in re.findall(".*: (.*).*\n.*\n.*for (.*)\n.*: (.*)\n.*: (.*)\n.*: (.*)", containers[0].text)[0]]
            except: 
                pass
        else:
            # stores an empty string when vendor scores are missing
            data.append('')
            try:
                [data.append(item) for item in re.findall("for.(.*)\n.*:.(.*)\n.*:.(.*)\n.*:.(.*)", containers[0].text)[0]]
            except: 
                pass
    
    return data

In [21]:
def parse_vendors():
    """
    This function operates sequentially:
    1. It iterates over the dictionary containing the folder structure
    2. It asserts whethter the node in the directory is a file
    3. It parses files using ('parse_item_information()')
    4. It appends data to a container list
    5. It creates a dataframe from the container (list of lists) 
       after which duplicates are dropped and columns are restructured
    """
    
    # empty container for dataframes
    container = []
    
    # Iterate over folder structure
    for market in [i for i in clean(folder_structure['unpacked']) if "silkroad2" in i]:
        # for each date
        for date in clean(folder_structure['unpacked'][market].keys()):
            print(date)

            #for each folder
            for f in clean(folder_structure['unpacked'][market][date].keys()):

                # for each category
                if f in ["users"]:
                    for sub in clean(folder_structure['unpacked'][market][date][f].keys()):
                        file = os.path.join(DATA_DIR, market, date, f, sub)
                        if os.path.isfile(file):
                            if "?" not in sub:
                                try:
                                    container.append(parse_vendor_info(file))
                                except: 
                                    pass
    
    # construct data file from container
    df = pd.DataFrame.from_records(container).drop_duplicates()
    df.columns = ['name', 'stime', 'stime_dt', 'score', 'ctime', 'otime', 'location', 'area']
    df = df.reset_index(drop=True)
    
    return df

# storing data
df = parse_vendor()
df.to_pickle(os.path.join(MAIN_DIR, 'data', 'parsed', 'silkroad2', 'vendors.pickle'))

2013-12-20
2014-01-16
2014-02-11
2014-02-13
2014-02-21
2014-02-24
2014-03-03
2014-03-10
2014-04-12
2014-04-20
2014-04-28
2014-05-03
2014-05-05
2014-05-08
2014-05-10
2014-05-19
2014-05-24
2014-05-29
2014-06-02
2014-06-03
2014-06-11
2014-06-15
2014-06-23
2014-06-24
2014-07-08
2014-07-17
2014-07-23
2014-07-26
2014-07-30
2014-08-04
2014-08-09
2014-08-11
2014-08-17
2014-08-23
2014-08-27
problem with chuck10
2014-08-30
problem with chuck10
2014-09-02
problem with chuck10
2014-09-10
problem with chuck10
2014-09-15
problem with chuck10
2014-09-20
problem with chuck10
2014-09-23
2014-09-26
2014-09-28
2014-09-30
2014-10-04
2014-10-11
2014-10-12
2014-10-13
2014-10-15
2014-10-17
2014-10-20
2014-10-24
problem with chuckie11.1
problem with chuckie11.2
2014-10-27
2014-10-28
2014-11-01
2014-11-05
2014-11-06


### Feedback information

In [12]:
def parse_vendor_feedbacks(page):
    """
    creates a dataframe that contains: 
        rating    = value containing the rating of the seller
        feedback  = textual feedback 
        item      = item classifier
        freshness = the freshness of the review
        name      = the name of the vendor
        stime     = the time at which the item-page was scraped
        
    Args:
        string of file path
    
    Returns: 
        dataframe
    """
    
    # parses html table to dataframe
    df = pd.DataFrame(pd.read_html(page, flavor = "lxml")[0])
    
    # parses feedbacks into a {#} of 5 string format. 
    # Note that some ratings were notated as stars, these
    # stars are counted and parsed as a string number
    if '5' not in df.rating[0]:
        df['rating'] = df.rating.apply(lambda x : "{0} of 5".format(x.count("★")))
    
    # adds name and stime columns 
    df.assign(
        name  = str(page.split(os.sep)[-1].split('.')[0].split('?')[0]),
        stime = os.stat(page).st_birthtime)
    
    #reorders columns in data frame
    df = df[['name', 'stime', 'rating', 'feedback', 'item', 'freshness']]
    
    return df

In [13]:
def parse_feedbacks()
    """
    This function operates sequentially:
    1.  It iterates over the dictionary containing the folder structure
    2.  It asserts whethter the node in the directory is a file
    3.  It parses files using ('parse_vendor_feedbacks()')
    4.  It appends data to a container list
    5.  For each date it concatinates the dataframes in the container
        after which the container is replenished to safe working memory
    
    Args:
        string of file path
    
    Returns: 
        dataframe
    """
    # empty list for data frames
    container = []
    
    # storing data.
    for market in [i for i in clean(folder_structure['unpacked']) if "silkroad2" in i]:
        # for each date
        for date in clean(folder_structure['unpacked'][market].keys()):
            print(date)

            #for each folder
            for f in clean(folder_structure['unpacked'][market][date].keys()):

                # for each user
                if f in ["users"]:
                    for sub in clean(folder_structure['unpacked'][market][date][f].keys()):
                        if isinstance(sub, str):
                            try:
                                file = os.path.join(DATA_DIR, market, date, f, sub)
                            except: 
                                pass

                            if os.path.isfile(file):
                                try:
                                    container.append(parse_vendor_feedbacks(file))
                                except:
                                    pass
    
    # concatenate all dataframes in container
    df = pd.concat(container)
    
    return df

df = parse_feedbacks()
df.to_pickle(os.path.join(MAIN_DIR, 'data', 'parsed', 'silkroad2', 'feedbacks.pickle'))

2013-12-20
2014-01-16
2014-02-11
2014-02-13
2014-02-21
2014-02-24
2014-03-03
2014-03-10
2014-04-12
2014-04-20
2014-04-28
2014-05-03
2014-05-05
2014-05-08
2014-05-10
2014-05-19
2014-05-24
2014-05-29
2014-06-02
2014-06-03
2014-06-11
2014-06-15
2014-06-23
2014-06-24
2014-07-08
2014-07-17
2014-07-23
2014-07-26
2014-07-30
2014-08-04
2014-08-09
2014-08-11
2014-08-17
2014-08-23
2014-08-27
2014-08-30
2014-09-02
2014-09-10
2014-09-15
2014-09-20
2014-09-23
2014-09-26
2014-09-28
2014-09-30
2014-10-04
2014-10-11
2014-10-12
2014-10-13
2014-10-15
2014-10-17
2014-10-20
2014-10-24
2014-10-27
2014-10-28
2014-11-01
2014-11-05
2014-11-06


## Item information

In [26]:
file = '/Volumes/Extreme SSD/data/unpacked/silkroad2/2014-05-24/items/0-1g-mdpv-90-pure'
def parse_item_information(file):
    """
    creates a dataframe that contains: 
        rating    = value containing the rating of the seller
        feedback  = textual feedback 
        item      = item classifier
        freshness = the freshness of the review
        price     = the price of the item that has been bought
        vendor    = the name of the vendor
        stime     = the time at which the item-page was scraped
        stime_dt  = the date at which the item page was scraped
        loc       = the country of operation of the vendor
        area      = the area to which the vendor ships
        
    Args:
        string of file path
    
    Returns: 
        dataframe
    """
    
    # get item meta data
    with open(file, 'r') as f:
        soup     = BeautifulSoup(f.read(), "lxml")
        price    = soup.find('div', attrs = {'class': 'price_big'}).text.strip()
        vendor   = soup.find('h3').text.split(": ")[1]
        stime    = os.stat(file).st_birthtime
        stime_dt = file.split(os.sep)[-3]
        p = soup.find_all('p', limit = 2)[1]
        area     = re.findall(".*to: (.*)$",     p.text.strip())[0]
        loc      = re.findall("from: (.*)\\n.*", p.text.strip())[0]
        item     = soup.find('h2').text

    # parses feedback information for item
    df = pd.read_html(file)[-1].drop_duplicates()

    # concats meta data to feedback information
    df = df.assign(item    = item,
                  price    = price,
                  vendor   = vendor, 
                  stime    = stime,
                  stime_dt = stime_dt,
                  loc      = loc,
                  area     = area)
    
    if '5' not in df.rating[0]:
        df['rating'] = df.rating.apply(lambda x : "{0} of 5".format(x.count("★")))
    
    return df


In [28]:
# set out-file path
data_folder = "/Volumes/Extreme SSD/data/parsed/silkroad2/items"

def file_name(data_folder, date):
    return os.path.join(data_folder, ''.join(['items_', date.replace('-', ''), '.pickle']))

# iterates over complex folder structure
def parse_items():
    """
    This function operates sequentially:
    1. It iterates over the dictionary containing the folder structure
    2. It asserts whethter the node in the directory is a file
    3. It parses files using ('parse_item_information()')
    4. It appends data to a container list
    5. For each date it concatinates the dataframes in the container
       after which the container is replenished to safe working memory
    
    Args:
      None
    
    Returns:
      None
      
    Raises:
      Contains a simply try clause that ensure that the parses will 
      continue in all cases. This circumvcirents issues caused by corrupted
      files and encoding. 
    """
    
    for market in [i for i in clean(folder_structure['unpacked']) if "silkroad2" in i]:
        # for each date
        for date in clean(folder_structure['unpacked'][market].keys()):
            # empty list for data frames
            container = []
            print(date)

            #for each folder
            for f in clean(folder_structure['unpacked'][market][date].keys()):

                # for each item
                if f in ["items"]:
                    for sub in clean(folder_structure['unpacked'][market][date][f].keys()):
                        file = os.path.join(DATA_DIR, market, date, f, sub)
                        if os.path.isfile(file):
                            try:
                                container.append(parse_item_information(file))
                            except:
                                pass
                        else: 
                            continue
            
            # append unique data to dfs and replenish the container
            df = pd.concat(container, sort = True)
            df = df.drop_duplicates()
            df.to_pickle(file_name(data_folder, date))
            
            del container
            del df

parse_items()

2013-12-20
2014-01-16
2014-02-11
2014-02-13
2014-02-21
2014-02-24
2014-03-03
2014-03-10
2014-04-12
2014-04-20
2014-04-28
2014-05-03
2014-05-05
2014-05-08
2014-05-10
2014-05-19
2014-05-24
2014-05-29
2014-06-02
2014-06-03
2014-06-11
2014-06-15
2014-06-23
2014-06-24
2014-07-08
2014-07-17
2014-07-23
2014-07-26
2014-07-30
2014-08-04
2014-08-09
2014-08-11
2014-08-17
2014-08-23
2014-08-27
2014-08-30
2014-09-02
2014-09-10
2014-09-15
2014-09-20
2014-09-23
2014-09-26
2014-09-28
2014-09-30
2014-10-04
2014-10-11
2014-10-12
2014-10-13
2014-10-15
2014-10-17
2014-10-20
2014-10-24
2014-10-27
2014-10-28
2014-11-01
2014-11-05
2014-11-06


In [39]:
container = []

for file in os.listdir(data_folder):
    if '.pickle' in file:
        df = pd.read_pickle(os.path.join(data_folder, file))
        container.append(df)
        
df = pd.concat(container)
df = df\
    .drop_duplicates()\
    .reset_index()


In [40]:
df

ImportError: cannot import name 'is_url' from 'pandas.io.common' (/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/pandas/io/common.py)

          index       area                                           feedback  \
0             0  Worldwide                               didnt receive parcel   
1             1  Worldwide  Good communication, next day delivery. Buds lo...   
2             2  Worldwide  Never arrived. DO NOT FE WITH THIS SELLER or b...   
3             3  Worldwide             great package and product will be back   
4             4  Worldwide                                  Nothing arrived..   
...         ...        ...                                                ...   
22003613      5  Worldwide                                          thankyou!   
22003614      6  Worldwide            quick delivery and product as described   
22003615      7  Worldwide                                             Legend   
22003616      8  Worldwide             Got it right away, go check it out....   
22003617      9  Worldwide  I thought I would take a chance with these guy...   

         freshness         

In [41]:
data_file = os.path.join(MAIN_DIR, 'data', 'parsed', 'silkroad2', 'items.pickle')
%time
df.to_pickle(data_file)

CPU times: user 8 µs, sys: 2 µs, total: 10 µs
Wall time: 538 µs
