# Parser

This script parses files that are unpacked using `unpacker.ipynb`. 

## Reading in folder structure
1. it defines a function (`get_directory_structure()`) to read in the folder structure as nested dictionaries
2. it reads the folder structure and creates a nested dictionary. 
3. it writes the nested dictionary to disk as a JSON dump


## Parsing silkroad2
*vendor information*
- defines a parser to extract information about the vendor
- iterates over the folder structure and applies `parse_vendor_info()` to all vendor files
- concatenates data to a datafile

*feedback information*
- defines a parser to extract feedbacks from seller pages. 
- iterates over the folder structure and applies `parse_vendor_feedbacks()` to all vendor files
- concatenates feedback-data to a datafile

*item information*
- defines a parser to extract feedbacks from item pages
- iterates over the folder structure and applies the parser to all item files
- iterates over the folder structure and applies `parse_item_information()` to all item files

In [None]:
import csv
import json
import logging
import os
import re
import time
from functools import reduce

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Project root: the parent of the directory the notebook is started from.
# os.path.dirname splits on the platform separator instead of a hard-coded '/'.
MAINDIR = os.path.dirname(os.getcwd())

# Root directory of the unpacked scrape archives.
DATA_DIR = os.path.join(MAINDIR, "data", "unpacked")

# Top-level entries of the unpacked data, ignoring macOS ".DS_Store" files.
files = [folder for folder in os.listdir(DATA_DIR) if ".DS_" not in folder]

In [None]:
def get_directory_structure(rootdir):
    """
    Walk ``rootdir`` and mirror its layout as nested dictionaries.

    The result has a single top-level key, the base name of ``rootdir``.
    Every directory maps to a dictionary of its entries and every file
    maps to ``None``.
    """
    tree = {}
    rootdir = rootdir.rstrip(os.sep)
    # number of leading characters to cut so that paths start at rootdir's own name
    prefix_len = rootdir.rfind(os.sep) + 1

    for current, _subdirs, filenames in os.walk(rootdir):
        *ancestors, leaf = current[prefix_len:].split(os.sep)

        # os.walk visits parents before children, so every ancestor level
        # is already present in the tree when we descend through it
        parent = tree
        for name in ancestors:
            parent = parent.get(name)

        parent[leaf] = dict.fromkeys(filenames)

    return tree


def clean(dirs):
    """Return the entries of ``dirs`` as a list, leaving out any that contain ".DS_"."""
    kept = []
    for entry in dirs:
        if ".DS_" in entry:
            continue
        kept.append(entry)
    return kept

In [None]:
# Uncomment to rebuild the cached folder structure from the unpacked data:
# folder_structure = get_directory_structure(os.path.join(DATA_DIR))

# file = os.path.join(MAINDIR, 'data', 'logs', 'folder_structure.json')
# with open(file, "w") as f:
#     json.dump(folder_structure, f)

In [None]:
# Make sure the log and output directories exist before they are used.
# With exist_ok=True, os.makedirs does nothing when the directory is already there.
os.makedirs(os.path.join(MAINDIR, 'data', 'logs'), exist_ok=True)
os.makedirs(os.path.join(MAINDIR, 'data', 'parsed'), exist_ok=True)

# time efficient loading of folder structure: read the cached JSON dump
# rather than walking the unpacked data again
file = os.path.join(MAINDIR, 'data', 'logs', 'folder_structure.json')
with open(file, encoding='utf-8') as json_file:
    folder_structure = json.load(json_file)

### Vendor Information

In [None]:
def parse_vendor_info(vendor):
    """
    Parse one vendor page into a flat row.

    The row contains, in order:
        name     = name of vendor (file name up to the first '.')
        stime    = time of scrape (file timestamp, see note in the code)
        stime_dt = date of scrape (third-from-last component of the path)
        score    = vendor rating average (out of 100); '' when the page has none
        ctime    = time from creation
        otime    = time last online
        loc      = location
        area     = area of operation

    Args:
        vendor: string of file path

    Returns:
        list of data. Parsing is best-effort: when the page text does not
        match the expected layout, the fields after ``stime_dt`` (or after
        the empty ``score``) are missing from the list.

    Raises:
        IndexError: if the path has fewer than three components or the page
            has no ``<span class="container">`` element.
    """

    # st_birthtime is not available on every platform.
    # NOTE(review): the fallback is the modification time, not the creation
    # time -- confirm that is acceptable as a scrape timestamp.
    stat = os.stat(vendor)
    stime = getattr(stat, 'st_birthtime', stat.st_mtime)

    # get information from file; the scrape date comes from the vendor path
    # itself (previously it was read from an unrelated module-level `file`)
    data = [vendor.split(os.sep)[-1].split('.')[0],
            stime,
            vendor.split(os.sep)[-3]]

    # opens and parses the html file
    with open(vendor, 'r') as f:
        soup = BeautifulSoup(f.read(), "lxml")
    containers = soup.find_all('span', attrs={'class': 'container'})
    text = containers[0].text

    # note that not in all instances vendor scores were present
    # therefore the data is parsed conditionally
    if "vendor score:" in text:
        pattern = r".*: (.*).*\n.*\n.*for (.*)\n.*: (.*)\n.*: (.*)\n.*: (.*)"
    else:
        # stores an empty string when vendor scores are missing
        data.append('')
        pattern = r"for.(.*)\n.*:.(.*)\n.*:.(.*)\n.*:.(.*)"

    # the first match supplies the remaining fields; a page that does not
    # match leaves the row short instead of raising
    match = re.search(pattern, text)
    if match is not None:
        data.extend(match.groups())

    return data

### Feedback Information

In [None]:
def parse_vendor_feedbacks(page):
    """
    Parse the feedback table of one vendor page.

    Creates a dataframe that contains:
        name      = the name of the vendor (file name up to the first '.' or '?')
        stime     = the time at which the page was scraped (file timestamp)
        rating    = value containing the rating of the seller, as "{#} of 5"
        feedback  = textual feedback
        item      = item classifier
        freshness = the freshness of the review

    Args:
        page: string of file path

    Returns:
        dataframe with the columns listed above, in that order

    Raises:
        KeyError / AttributeError: if the first HTML table of the page lacks
            one of the expected columns.
    """

    # parses the first html table on the page to a dataframe
    df = pd.read_html(page, flavor="lxml")[0]

    # parses feedbacks into a {#} of 5 string format.
    # Note that some ratings were notated as stars, these
    # stars are counted and parsed as a string number.
    # An empty table has no first row to inspect, so it is left untouched.
    if not df.empty and '5' not in df.rating.iloc[0]:
        df['rating'] = df.rating.apply(
            lambda x: "{0} of 5".format(x.count("★")))

    # st_birthtime is not available on every platform.
    # NOTE(review): the fallback is the modification time -- confirm that is
    # acceptable as a scrape timestamp.
    stat = os.stat(page)

    # adds name and stime columns; assign returns a new frame, so the result
    # has to be kept (discarding it made the column selection below fail)
    df = df.assign(
        name=str(page.split(os.sep)[-1].split('.')[0].split('?')[0]),
        stime=getattr(stat, 'st_birthtime', stat.st_mtime))

    # reorders columns in data frame
    return df[['name', 'stime', 'rating', 'feedback', 'item', 'freshness']]

In [None]:
def parse_feedbacks(file=None):
    """
    Collect the feedback tables of all silkroad2 vendor pages.

    This function operates sequentially:
    1.  It iterates over the dictionary containing the folder structure
        (markets whose name contains "silkroad2", then dates, then the
        "users" folder of each date)
    2.  It checks whether the node in the directory is a file on disk
    3.  It parses files using ('parse_vendor_feedbacks()')
    4.  It appends each dataframe to a container list
    5.  It concatenates all dataframes in the container once at the end

    Pages that fail to parse are skipped and reported through ``logging``
    so that corrupted files do not stop the run.

    Args:
        file: unused; accepted only so existing calls keep working

    Returns:
        dataframe with the feedbacks of all parsed pages

    Raises:
        ValueError: if not a single page could be parsed
    """
    log = logging.getLogger(__name__)

    # the folder structure mirrors this directory
    unpacked_root = os.path.join(MAINDIR, "data", "unpacked")
    tree = folder_structure['unpacked']

    # empty list for data frames
    container = []

    for market in clean(tree):
        if "silkroad2" not in market:
            continue

        # files are stored as None in the folder structure; treat them as empty
        dates = tree[market] or {}
        for date in clean(dates):
            users = (dates[date] or {}).get("users") or {}

            # for each user
            for sub in clean(users):
                path = os.path.join(unpacked_root, market, date, "users", sub)
                if not os.path.isfile(path):
                    continue
                try:
                    container.append(parse_vendor_feedbacks(path))
                except Exception as exc:
                    # best effort: keep going, but leave a trace of what was skipped
                    log.warning("skipping vendor page %s: %r", path, exc)

    if not container:
        raise ValueError("no vendor feedback pages could be parsed")

    # concatenate all dataframes in container
    return pd.concat(container)

# Parse all silkroad2 vendor pages and store the combined feedback table.
df = parse_feedbacks(file)
# to_pickle does not create missing directories, so make sure the target exists
os.makedirs(os.path.join(MAINDIR, 'data/parsed/silkroad2'), exist_ok=True)
df.to_pickle(os.path.join(MAINDIR, 'data/parsed/silkroad2', 'feedbacks.pickle'))

### Item information

In [None]:
def parse_item_information(file):
    """
    Parse the feedback table of one item page and attach the item meta data.

    The rows come from the second HTML table on the page (duplicates
    removed); the code relies on that table having a ``rating`` column.
    The following columns are then set to one value per page:
        item      = text of the first <h2> element
        price     = text of the 'price_big' div
        vendor    = the name of the vendor (text after ": " in the first <h3>)
        stime     = the time at which the item-page was scraped (file timestamp)
        stime_dt  = the date at which the item page was scraped
                    (third-from-last component of the path)
        loc       = the country of operation of the vendor (text after "from: ")
        area      = the area to which the vendor ships (text after "to: ")
        category  = third '/'-separated piece of the first link in the
                    'categories' div

    Ratings that are notated as stars are rewritten to "{#} of 5".

    Args:
        file: string of file path

    Returns:
        dataframe

    Raises:
        AttributeError / IndexError: if one of the expected page elements
            is missing.
    """

    # get item meta data
    with open(file, 'r') as f:
        soup = BeautifulSoup(f.read(), "lxml")

    price = soup.find('div', attrs={'class': 'price_big'}).text.strip()
    vendor = soup.find('h3').text.split(": ")[1]

    # st_birthtime is not available on every platform.
    # NOTE(review): the fallback is the modification time -- confirm that is
    # acceptable as a scrape timestamp.
    stat = os.stat(file)
    stime = getattr(stat, 'st_birthtime', stat.st_mtime)
    stime_dt = file.split(os.sep)[-3]

    # the second <p> holds the "from: ..." / "to: ..." shipping lines
    shipping = soup.find_all('p', limit=2)[1].text.strip()
    area = re.findall(".*to: (.*)$", shipping)[0]
    loc = re.findall("from: (.*)\\n.*", shipping)[0]

    item = soup.find('h2').text
    category = soup.find('div', attrs={'class': 'categories'})
    category = str(category.find('a', href=True)).split('/')[2].strip()

    # parses feedback information for item
    df = pd.read_html(file)[1].drop_duplicates()

    # concats meta data to feedback information
    df = df.assign(item=item,
                   price=price,
                   vendor=vendor,
                   stime=stime,
                   stime_dt=stime_dt,
                   loc=loc,
                   area=area,
                   category=category)

    # star ratings are counted and rewritten; an empty table has no first
    # row to inspect, so it is left untouched
    if not df.empty and '5' not in df.rating.iloc[0]:
        df['rating'] = df.rating.apply(
            lambda x: "{0} of 5".format(x.count("★")))

    return df

In [None]:
# set out-file path

data_folder = os.path.join(MAINDIR, "data/parsed/silkroad2/items")
# the per-date pickles are written here; to_pickle does not create directories
os.makedirs(data_folder, exist_ok=True)

def file_name(data_folder, date):
    """Return the pickle path for one date: ``<data_folder>/items_<date without '-'>.pickle``."""
    compact_date = date.replace('-', '')
    basename = 'items_' + compact_date + '.pickle'
    return os.path.join(data_folder, basename)

# iterates over complex folder structure
def parse_items():
    """
    Parse all silkroad2 item pages and write one pickle per scrape date.

    This function operates sequentially:
    1. It iterates over the dictionary containing the folder structure
       (markets whose name contains "silkroad2", then dates, then the
       "items" folder of each date)
    2. It checks whether the node in the directory is a file on disk
    3. It parses files using ('parse_item_information()')
    4. It appends each dataframe to a container list
    5. For each date it concatenates the dataframes in the container,
       drops duplicate rows and pickles the result to
       ``file_name(data_folder, date)``; the container is rebuilt per date
       to save working memory

    Args:
      None

    Returns:
      None

    Raises:
      Nothing for individual pages: a page that fails to parse (corrupted
      file, encoding problem, unexpected layout) is skipped and reported
      through ``logging`` so the run continues. A date without any parsed
      page is skipped as well instead of aborting the whole run.
    """
    log = logging.getLogger(__name__)

    # the folder structure mirrors this directory
    unpacked_root = os.path.join(MAINDIR, "data", "unpacked")
    tree = folder_structure['unpacked']

    for market in clean(tree):
        if "silkroad2" not in market:
            continue

        # files are stored as None in the folder structure; treat them as empty
        dates = tree[market] or {}

        # for each date
        for date in clean(dates):
            items = (dates[date] or {}).get("items") or {}

            # empty list for data frames
            container = []

            # for each item
            for sub in clean(items):
                path = os.path.join(unpacked_root, market, date, "items", sub)
                if not os.path.isfile(path):
                    continue
                try:
                    container.append(parse_item_information(path))
                except Exception as exc:
                    # best effort: keep going, but leave a trace of what was skipped
                    log.warning("skipping item page %s: %r", path, exc)

            # pd.concat raises on an empty list, so skip dates without data
            if not container:
                log.warning("no item pages parsed for %s/%s", market, date)
                continue

            # store the unique rows of this date
            df = pd.concat(container, sort=True).drop_duplicates()
            df.to_pickle(file_name(data_folder, date))

# Run the item parser; it pickles one dataframe per date to file_name(data_folder, date).
parse_items()

In [None]:
# Gather every pickle found in data_folder into one list of dataframes.
container = []

for file in os.listdir(data_folder):
    if '.pickle' not in file:
        continue
    df = pd.read_pickle(os.path.join(data_folder, file))
    container.append(df)

# Merge the frames, drop duplicate rows, and move the old index into a column.
df = pd.concat(container)
df = df.drop_duplicates().reset_index()


In [None]:
data_file = os.path.join(MAINDIR, 'data/parsed/silkroad2', 'items.pickle')
# to_pickle does not create missing directories
os.makedirs(os.path.dirname(data_file), exist_ok=True)

# A `%time` line on its own times nothing (the magic only covers code on the
# same line) and is not valid Python outside IPython, so time the write explicitly.
_t0 = time.perf_counter()
df.to_pickle(data_file)
print("Wall time: {0:.2f} s".format(time.perf_counter() - _t0))

### Category Information

In [None]:
def parse_category_information(file):
    """
    Parse the listings shown on one category page.

    Each 'item_body' div is paired, in page order, with a 'price_big' div.
    The 'item_details' text of a listing is split into three lines
    (vendor, location, area), of which the part after the last ": " is kept.

    Args:
        file: string of file path; its parent folder name is used as category

    Returns:
        dataframe with the columns title, vendor, location, area, price and
        category, without duplicate listings
    """
    # the category is the name of the folder that holds the page
    category = file.split(os.sep)[-2]

    # read in html text
    with open(file, 'r') as handle:
        soup = BeautifulSoup(handle.read(), 'lxml')

    bodies = soup.find_all('div', {'class': 'item_body'})
    price_tags = soup.find_all('div', {'class': 'price_big'})

    def value_of(field):
        # "label: value" -> "value"
        return field.split(': ')[-1]

    # one row per listing
    rows = []
    for body, price_tag in zip(bodies, price_tags):
        title = body.find('div', {'class': 'item_title'}).text
        details = body.find('div', {'class': 'item_details'}).text.strip()
        vendor, location, area = details.split('\n      ')
        rows.append([title,
                     value_of(vendor),
                     value_of(location),
                     value_of(area),
                     price_tag.text])

    listings = pd.DataFrame.from_records(
        rows, columns=['title', 'vendor', 'location', 'area', 'price'])
    listings = listings.drop_duplicates().reset_index(drop=True)

    return listings.assign(category=category)

In [None]:
def parse_categories():
    """
    Collect the listings of all silkroad2 category pages.

    This function operates sequentially:
    1.  It iterates over the dictionary containing the folder structure
        (markets whose name contains "silkroad2", then dates, then every
        sub-folder of the "categories" folder of each date)
    2.  It checks whether the node in the directory is a file on disk
    3.  It parses files using ('parse_category_information()')
    4.  It appends each dataframe to a container list
    5.  It concatenates all dataframes in the container once at the end

    Pages that fail to parse are skipped and reported through ``logging``
    so that corrupted files do not stop the run.

    Args:
        None

    Returns:
        dataframe with the listings of all parsed pages

    Raises:
        ValueError: if not a single page could be parsed
    """
    log = logging.getLogger(__name__)

    # the folder structure mirrors this directory
    unpacked_root = os.path.join(MAINDIR, "data", "unpacked")
    tree = folder_structure['unpacked']

    # empty list for data frames
    container = []

    for market in clean(tree):
        if "silkroad2" not in market:
            continue

        # files are stored as None in the folder structure; treat them as empty
        dates = tree[market] or {}
        for date in clean(dates):
            categories = (dates[date] or {}).get("categories") or {}

            # for each category folder
            for c in clean(categories):
                pages = categories[c] or {}

                for sub in clean(pages):
                    path = os.path.join(
                        unpacked_root, market, date, "categories", c, sub)
                    if not os.path.isfile(path):
                        continue
                    try:
                        container.append(parse_category_information(path))
                    except Exception as exc:
                        # best effort: keep going, but leave a trace of what was skipped
                        log.warning("skipping category page %s: %r", path, exc)

    if not container:
        raise ValueError("no category pages could be parsed")

    # concatenate all dataframes in container
    return pd.concat(container)

# Parse all silkroad2 category pages and store the combined listing table.
df = parse_categories()
# to_pickle does not create missing directories, so make sure the target exists
os.makedirs(os.path.join(MAINDIR, 'data/parsed/silkroad2'), exist_ok=True)
df.to_pickle(os.path.join(MAINDIR, 'data/parsed/silkroad2', 'categories.pickle'))