In [2]:
import pickle
import bs4
import gzip
import tarfile
import matplotlib.pyplot as plt
import numpy as np
import os
from bs4 import BeautifulSoup
from tqdm import tqdm
from os import listdir
from os.path import isfile, join
from pprint import pprint
from operator import itemgetter

# import matplotlib
# matplotlib.rcParams.update({'font.size': 13})
# plt.style.use("ggplot")

In [5]:
#
# Create the directories needed
#
# Create every output directory this notebook writes to.
# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
for directory in ("data", "data/img", "data/meta", "data/meta/svd", "data/meta/bow"):
    os.makedirs(directory, exist_ok=True)

In [3]:
#
# Remove numbers from a sample
#
def remove_numbers(str):
    """Return a copy of `str` with every digit character dropped.

    A character counts as a digit when its ``isdigit()`` method is true.
    """
    kept = [ch for ch in str if not ch.isdigit()]
    return ''.join(kept)

In [4]:
#
# Removes script and html-tags from a sample - returns plain text
#
def clean_html(str):
    """Convert an HTML document to plain text.

    The markup is parsed with BeautifulSoup's "lxml" parser, all <script>
    and <style> elements are removed, and the remaining text is returned
    with non-breaking spaces (U+00A0) deleted.
    """
    parsed = BeautifulSoup(str, "lxml")

    # Detach script/style elements so their contents never reach get_text().
    for node in parsed(["script", "style"]):
        node.extract()

    # Number stripping is intentionally not applied here.
    plain = parsed.get_text(strip=False)
    return plain.replace(u'\xa0', u'')

In [5]:
#
# Get substring start index
#

def substr(str0, subs, start=0):
    """Return the index of the first occurrence of `subs` in `str0` at or after `start`.

    Parameters:
        str0:  the string to search.
        subs:  the substring to look for.
        start: index at which the search begins (default 0).

    Raises:
        ValueError: if `subs` does not occur at or after `start`.
    """
    # str.find replaces the hand-rolled scan, which also kept probing
    # indices beyond the end of the string.
    index = str0.find(subs, start)
    if index == -1:
        raise ValueError('No substring:', subs)
    return index
    
def rev_substr(str0, subs, start):
    """Return the highest index <= `start` at which `subs` begins in `str0`.

    The search runs backwards from `start` down to and including index 0.
    (The previous loop stopped at index 1, so a match at the very beginning
    of the string was never found.)

    Raises:
        ValueError: if `subs` does not begin anywhere in ``[0, start]``.
    """
    if start >= 0:
        # The end bound start + len(subs) lets a match begin at `start` at the latest.
        index = str0.rfind(subs, 0, start + len(subs))
        if index != -1:
            return index
    raise ValueError('No rev substring:', subs)

def str_between(str_s, str1, str2):
    """Return the text between the first `str1` and the next `str2` after it.

    `str_s` is converted with ``str()`` first, so non-string objects are
    searched through their string form.

    Raises:
        ValueError: (from `substr`) if either marker is missing.
    """
    str0 = str(str_s)
    start = substr(str0, str1)
    content_start = start + len(str1)
    # Search for the end marker only after the start marker. Searching from
    # `start` itself found `str2` inside or equal to `str1` and returned an
    # empty or wrong slice.
    stop = substr(str0, str2, content_start)
    return str0[content_start:stop]

def rev_str_between(str_s, str1, str2):
    """Return the text that sits between the nearest preceding `str2` and `str1`.

    The string form of `str_s` is searched forward for the first `str1`,
    then backwards from that position for `str2`. The slice between the
    end of that `str2` and the start of `str1` is returned.
    """
    text = str(str_s)
    right = substr(text, str1)
    left = rev_substr(text, str2, right)
    return text[left + len(str2):right]
    
# test = "<TarInfo 'alphabay/2015-04-30/search.php?s_terms=ua&pg=18' at 0x7f9dc97d2368>"
# test = "<TarInfo 'alphabay/2015-06-12/user.php?id=2680&tab=5&pg4=1' at 0x7f9dbec302a0>"
# test = " <TarInfo 'silkroad2-forums/2014-02-21/index.php?topic=11429.0' at 0x7f6098dfef20>,"
# rev_str_between(test,".php?","/")

In [6]:
#
# Read all tar-files in folder (parent)
#
def get_tars(parent="/home/hades/exjobb/dumps/dnm/"):
    """List the regular files that sit directly inside `parent`.

    Parameters:
        parent: directory to scan. Defaults to the dump directory this
                notebook was written against.

    Returns:
        A list of paths (``parent`` joined with each file name).
        Sub-directories are skipped. The order is whatever ``os.listdir``
        yields and is not guaranteed to be stable across machines.
    """
    candidates = (join(parent, name) for name in listdir(parent))
    return [path for path in candidates if isfile(path)]

# pprint(get_tars())

In [7]:
#
# Untar a file and return its members
# returns: tar, members
#
def get_tar_members(tar_fn):
    """Return the member list of an xz-compressed tar file, cached on disk.

    The first call for an archive reads its members and pickles
    ``(tar_fn, members)`` to ``data/meta/<name>_members.pickle``, where
    ``<name>`` is the archive's file name up to its first dot. Later calls
    load the members from that pickle instead of re-reading the archive.

    Parameters:
        tar_fn: path to a ``.tar.xz`` archive.

    Returns:
        The list of TarInfo members of the archive.
    """
    fn = str(tar_fn).split("/")[-1].split(".")[0]
    cache_fn = "data/meta/{}_members.pickle".format(fn)

    if os.path.exists(cache_fn):
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files this notebook produced itself.
        with open(cache_fn, "rb") as cache_file:
            _, members = pickle.load(cache_file)
    else:
        # Context managers close the archive and the cache file even if
        # reading or pickling fails.
        with tarfile.open(tar_fn, "r:xz") as tar:
            members = tar.getmembers()
        # Do not rely on an earlier cell having created the cache directory.
        os.makedirs(os.path.dirname(cache_fn), exist_ok=True)
        with open(cache_fn, "wb") as cache_file:
            pickle.dump((tar_fn, members), cache_file, protocol=2)

    return members

In [8]:
#
# Read the given members from the tar file and return their decoded raw contents (HTML stripping is currently disabled)
#
def untar_members(tar_fn, mem):
    """Read the given members from an xz-compressed tar file as text.

    Parameters:
        tar_fn: path to the ``.tar.xz`` archive.
        mem:    iterable of members (as accepted by ``TarFile.extractfile``).

    Returns:
        A list with one string per entry of `mem`, in the same order. Bytes
        are decoded with the default codec and undecodable bytes are
        dropped. Members that have no file content (``extractfile`` returns
        None, e.g. directories) yield an empty string, so the result stays
        aligned with `mem` instead of crashing on ``None.read()``.
    """
    raw = []
    # The context manager closes the archive even if a read fails midway.
    with tarfile.open(tar_fn, "r:xz") as tar:
        for m in tqdm(mem):
            handle = tar.extractfile(m)
            if handle is None:
                raw.append("")
                continue
            with handle:
                # HTML cleaning is deliberately left to the caller.
                raw.append(handle.read().decode(errors='ignore'))
    return raw

In [9]:
#
# Look for a substr in tars
#
# Print the position and path of every tar file whose path contains "silk".
for idx, path in enumerate(get_tars()):
    if "silk" not in path:
        continue
    print(idx, path)

5 /home/hades/exjobb/dumps/dnm/silkroad1-20130915-aldridgehetu.tar.xz
11 /home/hades/exjobb/dumps/dnm/silkroad1-20120723-christin-censored.tar.xz
21 /home/hades/exjobb/dumps/dnm/silkroad2-20140129-sohhlz-vendors.tar.xz
30 /home/hades/exjobb/dumps/dnm/silkroad2-20140927-daryllau.tar.xz
31 /home/hades/exjobb/dumps/dnm/silkroad1-forums-stexo.tar.xz
32 /home/hades/exjobb/dumps/dnm/silkroad2-forums-2014093020141016-rasmusandersen.tar.xz
49 /home/hades/exjobb/dumps/dnm/silkroad1-forums-anonymous.tar.xz
56 /home/hades/exjobb/dumps/dnm/silkroad2.tar.xz
63 /home/hades/exjobb/dumps/dnm/silkroad1-wiki.tar.xz
64 /home/hades/exjobb/dumps/dnm/silkroad1-20130703-anonymous.tar.xz
65 /home/hades/exjobb/dumps/dnm/silkroad1-vendorprofiles-stexo.tar.xz
87 /home/hades/exjobb/dumps/dnm/silkroad2-forums.tar.xz
90 /home/hades/exjobb/dumps/dnm/silkroad1-20111103-delyankratunov.tar.xz
102 /home/hades/exjobb/dumps/dnm/silkroad2-forums-20140419-whom-astorposts.tar.xz
108 /home/hades/exjobb/dumps/dnm/silkroad1-for

In [10]:
#
# 1 - Get a list of tar-files (ret: tars)
# 2 - Get members from arg
# 3 - Get cleaned html-text from arg
#
tars = get_tars()

# Select the archive by file name rather than by position: the order of
# `tars` comes from os.listdir and is arbitrary, so a fixed index such as
# tars[35] can point at a different archive on another machine or after
# files are added. Change `target_name` to analyse another dump.
target_name = "alphabay.tar.xz"
matches = [t for t in tars if os.path.basename(t) == target_name]
if not matches:
    raise FileNotFoundError(
        "No archive named {} among the {} tar files".format(target_name, len(tars))
    )
tar_fn = matches[0]

print(tar_fn)
print("Tar-files:",len(tars))

members = get_tar_members(tar_fn)
print("Tar-file members:",len(members))

/home/hades/exjobb/dumps/dnm/alphabay.tar.xz
Tar-files: 165
Tar-file members: 2024819


In [11]:
#
# See distribution: Listings vs. non-Listings
#
def get_dist(domain='alphabay'):
    """Select the listing pages among the global `members` and print the split.

    Parameters:
        domain: 'alphabay' (default) keeps members whose name contains
                "listing" and no "&"; 'poseidon' keeps members whose name
                contains "item?id=".

    Returns:
        The list of members classified as listings.

    Raises:
        ValueError: if `domain` is not one of the supported names.
    """
    # Compare strings with ==. `is` tests object identity and only worked by
    # accident of string interning (SyntaxWarning on Python >= 3.8).
    if domain == 'poseidon':
        term = "item?id="
        listings = [x for x in tqdm(members) if term in str(x.name)]
    elif domain == 'alphabay':
        listings = [x for x in tqdm(members) if "listing" in str(x.name) and "&" not in str(x.name)]
    else:
        # Previously an unknown domain ended in an UnboundLocalError below.
        raise ValueError("Unknown domain: {!r}".format(domain))

    print("LISTINGS:", len(listings))
    print("NONLISTINGS:",len(members)-len(listings))
    return listings
    
# Select the listing pages for the default domain ('alphabay'); `listings`
# is the input of the de-duplication step below.
listings = get_dist()

100%|██████████| 2024819/2024819 [00:01<00:00, 1864309.33it/s]

LISTINGS: 144455
NONLISTINGS: 1880364





In [12]:
#
# Filter out duplicate listings from Alphabay
#

def get_unique(ls):
    """Keep the first member seen for each listing identifier.

    The identifier of a member is the last path component of its name,
    cut at the first "&" (e.g. ``listing.php?id=1``).

    Parameters:
        ls: iterable of objects with a ``name`` attribute.

    Returns:
        A list of the first member per identifier, in input order.
    """
    # A set gives O(1) membership tests. The previous list made this loop
    # quadratic in the number of listings.
    seen = set()
    res = []
    for m in tqdm(ls):
        idt = m.name.split("&")[0].split("/")[-1]
        if idt not in seen:
            seen.add(idt)
            res.append(m)
    return res

# De-duplicate the listings, then print the number of unique listings
# followed by the size of set(listings).
res = get_unique(listings)
print(len(res))
print(len(set(listings)))

100%|██████████| 144455/144455 [00:21<00:00, 6756.34it/s]

16822
144455





In [14]:
#
# Store content from tar in list (HEAVY LOAD)
#

# Only the de-duplicated members (`res`) are extracted from the archive.
members_ = res
# NOTE(review): resetting `content` first presumably releases a previous
# result before the new list is built when this cell is re-run — confirm.
content = None
content = untar_members(tar_fn, members_)

100%|██████████| 16822/16822 [07:56<00:00, 13.05s/it] 


In [61]:
#
# Retrieve Alphabay listing category. 
#

def get_category(str):
    """Extract the category breadcrumb from the plain text of a listing page.

    The category is the text between the "Home /" marker and the last
    "/" + line break (CRLF, falling back to LF) found at or before the
    following "Listing Options" marker. Newlines and spaces are removed
    from the result.

    Parameters:
        str: plain text of the page (e.g. the output of `clean_html`).

    Returns:
        The category string, e.g. "Drugs/Cannabis".

    Raises:
        ValueError: (from `substr` / `rev_substr`) if a marker is missing.
    """
    ind_start = substr(str, "Home /") + 6  # 6 == len("Home /")
    mid_stop = substr(str, "Listing Options", ind_start)
    try:
        ind_stop = rev_substr(str, "/\r\n", mid_stop)
    except ValueError:
        # No CRLF-terminated breadcrumb found; try the LF-only form.
        ind_stop = rev_substr(str, "/\n", mid_stop)

    cate = str[ind_start:ind_stop]
    cate = cate.replace("\n", "")
    cate = cate.replace(" ", "")
    return cate

# Clean every extracted page and split the pages into those with a
# parsable category (`data` / `category`, index-aligned) and those
# without one (`err`).
category = []
data = []
err = []

for fn in tqdm(content):
    clean = clean_html(fn)
    try:
        cat = get_category(clean)
    except ValueError:
        # get_category signals a missing breadcrumb marker with ValueError.
        # Anything else is a real bug and should surface instead of being
        # silently counted as an error page.
        err.append(clean)
    else:
        category.append(cat)
        data.append(clean)

print("OK :", len(category))
print("Err:", len(err))

100%|██████████| 16822/16822 [18:07<00:00, 15.47it/s] 

OK : 15222
Err: 1600





In [13]:
# namn = "alphabay"
# pickle.dump((data, category), open("data/meta/{}_dataset.pickle".format(namn), "wb"), protocol=2)

In [16]:
#
# Retrieve Alphabay/Silkroad category. 
#

def get_category_silk():
    """Pretty-print the distinct page types found among the global `members`.

    For each member, the text between the last "/" and ".php?" in the
    member's string form is used as its type; "listings" is normalised to
    "listing". Members for which `rev_str_between` raises ValueError are
    counted as "other". Nothing is returned; the set is printed with pprint.
    """
    found = set()
    for x in tqdm(members):
        try:
            pr = rev_str_between(x, ".php?", "/")
        except ValueError:
            # Marker not found. Only this expected failure is swallowed, so
            # unrelated errors are no longer hidden by a bare except.
            found.add("other")
            continue
        if pr == 'listings':
            pr = pr[:-1]
        found.add(pr)
    pprint(found)

In [14]:
def get_files(root):
    """Collect the paths of all files below `root`, skipping paginated pages.

    The tree is walked with ``os.walk``. A file is skipped when its name
    contains "page=". Returned paths are the directory path joined with
    the file name, in walk order.
    """
    collected = []
    for dirpath, _dirnames, filenames in tqdm(os.walk(root)):
        collected.extend(
            os.path.join(dirpath, fname)
            for fname in filenames
            if "page=" not in fname
        )
    return collected

# Collect every file under the silkroad dump directory (names containing
# "page=" are skipped by get_files) and report how many were found.
fn_list = get_files("/home/hades/exjobb/dumps/dnm/silkroad/")
print("Files:", len(fn_list))

1923it [00:01, 1185.23it/s]

Files: 32016





In [53]:
# categories = {"error":0}

# for ind in tqdm(content):
#     tmp = {}
#     data = clean_html(ind)
#     try:
#         cat = str_between(data, "Search", "Alcohol")
#         cat = cat.replace(" ", "")
#         id = [x for x in cat.split("\n") if len(x)>2][0]
        
#         try:
#             categories[id] += 1 
#         except:
#             categories[id] = 1 
#     except:
#         categories['error'] += 1

# pprint(categories)