In [1]:
import re
import os
import logging
import datetime
from bs4 import BeautifulSoup

In [106]:
import json

In [7]:
# from https://github.com/infosecanon/dminer/blob/master/dminer/ingestion/dreammarket/dreammarket.py
def extract_listings(soup, timestamp):
    """
    Yield one dict per DreamMarket listing found in `soup`.

    Args:
        soup: Parsed market page (`bs4.BeautifulSoup`, or any object that
            exposes the same `find` / `find_all` API).
        timestamp: Value stored unchanged under each item's "timestamp" key.

    Yields:
        dict with the keys "market_name", "listing_name", "timestamp",
        "listing_price_btc" (float), "listing_escrow", "vendor_name",
        "vendor_transactions" (int) and "vendor_rating" (float; 0.0 when the
        vendor block has no `userRating` span).

    It is notable that there is currently one situation in which a listing
    will be skipped, and that is if there is no `oOfferBody` div inside the
    listing's `div.around` container.

    Raises:
        IndexError: if a listing's vendor block has no link whose href
            starts with "./".
        AttributeError: if another expected element (title, detail cell,
            price, escrow box, vendor block, transaction count) is missing.
        ValueError: if the price, transaction count or rating text is not
            numeric.
    """

    for listing_element in soup.find_all("div", class_="around"):
        primary_div = listing_element.find("div", class_="oOfferBody")
        if not primary_div:
            # No offer body means none of the fields below can be located.
            continue

        title_div = listing_element.find("div", class_="oTitle")
        listing_meta_div = primary_div.find("td", class_="oOfTextDetail")
        vendor_div = listing_meta_div.find("div", class_="oVendor")

        item = {}
        item["market_name"] = "DreamMarket"
        item["listing_name"] = title_div.find("a").text.strip()
        item["timestamp"] = timestamp

        # starting at 1 to avoid the bitcoin icon
        price_text = listing_meta_div.find("div", class_="oPrice").text.strip()
        item["listing_price_btc"] = float(price_text[1:])
        item["listing_escrow"] = primary_div.find("div", class_="escrowBox").text

        # The vendor's own link is the first anchor with a relative ("./")
        # href. `.get` is used instead of `tag["href"]` so that anchors
        # without an href attribute are ignored rather than raising KeyError.
        vendor_links = [
            tag for tag in vendor_div.find_all("a")
            if tag.get("href", "").startswith("./")
        ]
        vendor_name = vendor_links[0].text.strip()

        # The transaction count is rendered wrapped in parentheses, e.g. "(182)".
        transactions_span = vendor_div.find("span", title="Successful transactions")
        vendor_transactions = int(transactions_span.text.lstrip("(").rstrip(")"))

        # Vendors without a rating span are reported with a rating of 0.0.
        rating_span = vendor_div.find("span", class_="userRating")
        if rating_span:
            vendor_rating = float(rating_span.text.strip())
        else:
            vendor_rating = float(0)

        item["vendor_name"] = vendor_name
        item["vendor_transactions"] = vendor_transactions
        item["vendor_rating"] = vendor_rating

        yield item


In [127]:
# Parse the saved 2015-07-03 market index page: the file is decoded as
# ISO-8859-1 and handed to BeautifulSoup with the lxml parser.
with open(
    "dreammarket/2015-07-03/market/index.html",
    encoding="ISO-8859-1",
) as fp:
    soup = BeautifulSoup(fp, "lxml")

In [128]:
# extract_listings is a generator function, so this call only creates the
# generator; no listing is parsed until sample_listings is iterated.
sample_listings = extract_listings(soup, '2015-07-03')

In [129]:
# Show the raw markup of the first listing container (<div class="around">)
# to compare it with the selectors extract_listings relies on. In the output
# recorded below this div has no `oOfferBody` child, which is the condition
# under which extract_listings skips a listing.
soup.find_all("div", class_="around")[0]

<div class="around">
<div class="shopItem shopItem_4485">
<div class="oImage">
<a class="productThumbImage_4485" href="./viewProduct?offer=857560.783794" target="_blank">
<img alt="100g 10 PIN BOWLING HASH" height="132" src="uploadedImage?id=119513.793985&amp;type=thumb" width="176"/>
</a>
</div>
<div class="escrowInfo"><div class="escrowBox withoutEscrow">NO ESCROW</div></div> <div class="hoverInfo ">
<div class="pdd">
<div class="text">
<p>Vendor<br/>
<a href="./contactMember?member=monetka">monetka</a>
</p></div>
</div>
</div>
<div class="text oTitle">
<a class="productThumbImage_4485" href="./viewProduct?offer=857560.783794">
  			  					100g 10 PIN BOWLING HASH	  			  				</a>
</div>
<div class="bottom oPrice">
						฿3.34					</div>
</div>
</div>

In [130]:
# Cut the listing data out of the third <script> element (index 2): take the
# second chunk separated by "\n\n\t\n\n\t", keep what precedes its first
# "\n\t", and return the text between the first and second "= ".
(
    soup.find_all('script')[2]
    .get_text()
    .split("\n\n\t\n\n\t")[1]
    .split('\n\t')[0]
    .split('= ')[1]
)


'[{"lid":"4485","lnk":"857560.783794","title":"100g 10 PIN BOWLING HASH","imageId":"119513.793985","sellerOutLink":-1377523590,"btcPriceText":3.34,"localPriceText":"&pound;550","shipsFrom":"UK","isTrustedSeller":true,"domesticCurrencyValue":null,"escrowMethod":"No"},{"lid":"6281","lnk":"946223.929536","title":"50g Budget Rocky Slabs","imageId":"962934.717087","sellerOutLink":-1377523590,"btcPriceText":"0.79","localPriceText":"&pound;130","shipsFrom":"UK","isTrustedSeller":true,"domesticCurrencyValue":null,"escrowMethod":"No"},{"lid":"5436","lnk":"700734.885048","title":"Freezeland 8g Free Shipping ","imageId":"609519.863451","sellerOutLink":513788116,"btcPriceText":"0.354","localPriceText":"&euro;82","shipsFrom":"Germany","isTrustedSeller":true,"domesticCurrencyValue":null,"escrowMethod":"No"},{"lid":"3622","lnk":"527640.525668","title":"PlayBoy Plus.com - [LIFETIME PORN PREMIUM ACCOUNT]","imageId":"119050.912668","sellerOutLink":-1350947825,"btcPriceText":"0.0394","localPriceText":"$1

In [135]:
# Same slice of the third <script> element as in the previous cell, now
# decoded with json.loads; the recorded output is a list of per-listing dicts.
json.loads(
    soup.find_all('script')[2]
    .get_text()
    .split("\n\n\t\n\n\t")[1]
    .split('\n\t')[0]
    .split('= ')[1]
)

[{'lid': '4485',
  'lnk': '857560.783794',
  'title': '100g 10 PIN BOWLING HASH',
  'imageId': '119513.793985',
  'sellerOutLink': -1377523590,
  'btcPriceText': 3.34,
  'localPriceText': '&pound;550',
  'shipsFrom': 'UK',
  'isTrustedSeller': True,
  'domesticCurrencyValue': None,
  'escrowMethod': 'No'},
 {'lid': '6281',
  'lnk': '946223.929536',
  'title': '50g Budget Rocky Slabs',
  'imageId': '962934.717087',
  'sellerOutLink': -1377523590,
  'btcPriceText': '0.79',
  'localPriceText': '&pound;130',
  'shipsFrom': 'UK',
  'isTrustedSeller': True,
  'domesticCurrencyValue': None,
  'escrowMethod': 'No'},
 {'lid': '5436',
  'lnk': '700734.885048',
  'title': 'Freezeland 8g Free Shipping ',
  'imageId': '609519.863451',
  'sellerOutLink': 513788116,
  'btcPriceText': '0.354',
  'localPriceText': '&euro;82',
  'shipsFrom': 'Germany',
  'isTrustedSeller': True,
  'domesticCurrencyValue': None,
  'escrowMethod': 'No'},
 {'lid': '3622',
  'lnk': '527640.525668',
  'title': 'PlayBoy Plus.

In [143]:
# Decode the first "\n\n\t\n\n\t"-separated chunk of the third <script>
# element: take the text between the first and second "= " and drop its last
# character (presumably a trailing ";" - confirm against the page source).
# The recorded output is a dict whose values hold `jsVendorLink` HTML and
# `jsVendorShipsFrom`.
json.loads(
    soup.find_all('script')[2]
    .get_text()
    .split("\n\n\t\n\n\t")[0]
    .split('= ')[1][:-1]
)

{'-1377523590': {'jsVendorLink': '\n\t\t\t\t\t<a href="./contactMember?member=monetka"> monetka   </a>  <a class="viewRatings" href="./memberFeedback?member=monetka" target="_blank">  <span title="Successful transactions">(182)</span> (<span class="userRating gold" \n\t\t\ttitle="Average seller rating out of 182 rating[s] (1-5 stars)" >4.9&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span>)</a> (<span class="agoraLinkedUserRating"\n\t\t\t\t\t\t\t\ttitle="Vendor rating on Agora (Deal count, Average Rating out of 5 stars)">55, 4.81/5</span>) (<span class="blackBankLinkedUserRating"\n\t\t\t\t\t\t\t\ttitle="Vendor rating on Black Bank (positive/neutral/negative)">27/1/0</span>) (<span class="nucleusLinkedUserRating"\n\t\t\t\t\t\t\ttitle="Nucleus vendor rating (Deal count, Average Rating out of 5 stars)">46, 4.8/5</span>)',
  'jsVendorShipsFrom': 'United Kingdom'},
 '513788116': {'jsVendorLink': '\n\t\t\t\t\t<a href="./contactMember?member=TheHeissenberg"> TheHeissenberg   </a>  <a class="viewRatings" hr