In [1]:
import pandas as pd
import numpy as np
import re
import json
import matplotlib.pyplot as plt
import glob, os, csv
import gc

from pprint import pprint
from langdetect import detect

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models
from gensim.models import CoherenceModel

from bs4 import BeautifulSoup

import ast
from collections import Counter
from operator import add

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
import math
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier, StackingClassifier, AdaBoostClassifier
import seaborn as sns
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.corpus import wordnet
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'http', 'javascript'])

pd.options.mode.chained_assignment = None

[nltk_data] Downloading package stopwords to /home/mwsb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mwsb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/mwsb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mwsb/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /home/mwsb/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Preprocessing

In [3]:
def language_detect(ls):
    text = ' '.join(ls)
    result = detect(text)
    return result

In [4]:
stemmer = PorterStemmer()
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text))

def preprocess(sentences):
    result = []
    
    for sent in sentences:
        lemmas = []
        # tokenize
        tokens = gensim.utils.simple_preprocess(sent)
        for token in tokens:
            lemma = lemmatize_stemming(token)
            # remove stopwords
            if lemma not in stop_words:
                lemmas.append(lemma)
#         # POS tagging
#         nltk_tagged = nltk.pos_tag(lemmas)
#         for word, tag in nltk_tagged:
#             # only keep nouns
#             if tag.startswith('N'):
#                 result.append(word)
        # Without POS tagging
        result += lemmas
    return result

In [5]:
# Preprocess texts in the list
def preprocess_texts(ls):
    parsed_ls = []
    if len(ls) > 0:
        for i in range(len(ls)):
            txt = str(ls[i])
            # Only include non-empty sentences with length > 1
            if len(txt) > 1 and txt != ' ':
                txt = remove_sp_char(txt)
                txt = remove_links(txt)
                txt = remove_email(txt)
                txt = remove_single_char(txt)
                txt = remove_multi_spaces(txt)
                txt = txt.strip()
                if len(txt) > 1:
                    txt = remove_noneng(txt)
                    if txt != '':
                        parsed_ls.append(txt)
        return parsed_ls
    else:
        return []

In [6]:
def remove_multi_spaces(text):
    return re.sub(' +', ' ', text)

In [7]:
def remove_single_char(text):
    return re.sub('(^| ).( |$)', '', text)

In [8]:
# Remove special characters in the text
def remove_sp_char(text):
    return re.sub('[^0-9a-zA-Z]+', ' ', text)

In [9]:
# Remove URL links in the text
def remove_links(text):
    text = re.sub('(?:https?://)?(?:www)?(\S*?\.onion)\b', '', text)
    text = re.sub('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', text)
    return text

In [10]:
# Remove email addressed in the text
def remove_email(text):
    return re.sub('\S*@\S*\s?', '', text)

In [11]:
# Replace non-english sentences with ''
def remove_noneng(text):
    try:
        lang = detect(text)
        if lang != 'en':
            return ''
        else:
            return text
    except:
        return ''

In [12]:
def get_domain(url):
    url = str(url)
    result = re.findall('^(?:https?\:\/\/)?[\w\-\.]+\.onion', url)
    if len(result) < 1:
        return 'NA'
    else:
        return result[0]

In [13]:
## Categories ##
# 1: Market: Drug, gun, 
# 2: Counterfeit: counterfeit credit cards, money, ID
# 3: Services: Hosting service, forum, email, pastebin, file-sharing
# 4: Security: Security-related information, tutorials or services, leaked data
# 5: Porn: Hosting pornographic material
# 6: Cryptocurrency
# 7: NoAccess: Login, Down, Empty
# 8: Other: Cannot be classified in any other category (e.g. personal blog)

In [14]:
# Dictionary of conversion of categories
conversion_dict = {
    'Art': 'Other',
    'Casino': 'Services',
    'Counterfeit Credit-Cards': 'Counterfeit',
    'Counterfeit Money': 'Counterfeit',
    'Counterfeit Personal-Identification': 'Counterfeit',
    'Cryptocurrency': 'Cryptocurrency',
    'Cryptolocker': 'Security',
    'Down': 'NoAccess',
    'Drugs': 'Market',
    'Empty': 'NoAccess',
    'Forum': 'Services',
    'Hacking': 'Security',
    'Hosting': 'Services',
    'Leaked-Data': 'Services',
    'Library': 'Other',
    'Locked': 'NoAccess',
    'Marketplace': 'Market',
#     'Onion Directory/Wiki': ['directory', 'dir', 'wiki'],
    'Personal': 'Other',
    'Politics': 'Other',
    'Porno': 'Porn',
    'Religion': 'Other',
    'Services': 'Services',
    'Social-Network': 'Services',
    'Violence': 'Market',
    'Other': 'Other'
}

In [15]:
# Read input data
model_training_dataset_domain = pd.read_csv('../data/model_training_dataset_domain.csv')
model_training_dataset_domain['body_text'] = model_training_dataset_domain['body_text'].apply(ast.literal_eval)
existing_domains = list(model_training_dataset_domain['domain_url'])
del model_training_dataset_domain
gc.collect()

datasets = []
dates = ['jan-15-0', 'jan-15-1', 'feb-19-0', 'feb-19-1']
for date in dates:
    df = pd.read_csv('../data/dataset-'+date+'.csv')
    df['body_text'] = df['body_text'].apply(ast.literal_eval)
    datasets.append(df)

In [16]:
datasets[0]

Unnamed: 0,domain_url,title,body_text
0,22222222n77jskuw.onion,Porn Videos - XONIONS - THE BEST ONION PORN SI...,"[Amateur anal, 5, 7 minMomcikoper - 292.3k Vi..."
1,22222223nm4siaje.onion,Porn Videos - XONIONS - THE BEST ONION PORN SI...,"[Amateur anal, 5, 7 minMomcikoper - 292.3k Vi..."
2,2222222afubjlbhm.onion,Porn Videos - XONIONS - THE BEST ONION PORN SI...,"[Amateur anal, 5, 7 minMomcikoper - 292.3k Vi..."
3,22oxht5ep3hvyboc.onion,Onion Dir - Adult,"[http://bitcoi6dzn2c24oa.onion, CP - Teens - J..."
4,22pp2nrnjcmtlzja.onion,PayPal Plaza | The Tor Marketplace For Buying ...,"[Order Now ($76.12), $721.60, Beware Of Fake S..."
...,...,...,...
16408,oscarn4se6ji4leq.onion,Stolen Credit Card Informations,"[ContactOscar, Privacy Policy, United States C..."
16409,oscaroo67vmbwsf3.onion,BANK ACCOUNT - 1000$ - BANK OF AMERICA,[Warning: unlink(/var/www/oscars/oscar2_storag...
16410,oscarw4be4xs2pct.onion,BANK ACCOUNT - 1000$ - BANK OF AMERICA,"[Credit Cards (CC's)11HomeAccounts(BANK,PAYPAL..."
16411,satforumnoo6sxgk.onion,download archives,[6 \t qX`F!A��–V� ����`J��P�#H�u._���k]�:S�p7...


In [17]:
df_combined = pd.concat(datasets)
df_combined.reset_index(inplace=True)
df_combined.drop(columns=['index'], inplace=True)
# Remove duplicated domains
df_combined = df_combined.drop_duplicates(subset ='domain_url', keep = 'first')
df_combined.reset_index(inplace=True)
df_combined.drop(columns=['index'], inplace=True)
# Remove existing domains in trainset
df_combined = df_combined[~df_combined['domain_url'].isin(existing_domains)]
df_combined.reset_index(inplace=True)
df_combined.drop(columns=['index'], inplace=True)
# Only keep unique sentences
df_combined['body_text'] = df_combined['body_text'].apply(lambda x: list(set(x)))
df_combined

Unnamed: 0,domain_url,title,body_text
0,22oxht5ep3hvyboc.onion,Onion Dir - Adult,[Baby Bitch CP is an unique new portal. We are...
1,22pp2nrnjcmtlzja.onion,PayPal Plaza | The Tor Marketplace For Buying ...,"[$47.02, Order Now ($11.05), $75.09, What You ..."
2,2464b3fu462tx2en.onion,CHILD PORN CENTER,"[6, CHILD PORN CENTER, http://x5y2b4xjf46idxzp..."
3,24adsavjl3u6tylr.onion,"CP Video HD 2021, PTHC, Loli, child porn, pret...","[70 Gb, Video HD 4K, 400 Gb, CC, © Copyright 2..."
4,2ajtkan56n6aiyl6.onion,Hard Porn Forum,"[Category, 36, Email, value 01, 268, 78, video..."
...,...,...,...
12551,rkvwooasau2goqeamkqbade4yv2hnt6dc4ol5kgnaaoihd...,Moneys spider - financial pyramid,"[Create account, Email address, No wallet? get..."
12552,e26whn2524322mkxb3cbyk27ev2ihhq2biz35hty7gzgsy...,BenTasker.co.uk - The Home of Ben Tasker - www...,"[Home / Uncategorised, Latest Posts, This site..."
12553,4vzhn5j644aa523g.onion,Canny,"[4vzhn5j644aa523g.onion, Buy the access to the..."
12554,cashgodr53umth4z.onion,Cash God - Real Cash Sellers,"[No Risk for You, Our Method, Contact us, 2020..."


In [18]:
df_combined.to_csv('../data/dataset-combined-jan-feb.csv', index=False)