In [1]:
import pandas as pd
import numpy as np
import re
import json
import matplotlib.pyplot as plt
import glob, os, csv

from langdetect import detect

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models
from gensim.models import CoherenceModel

from pprint import pprint

import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'http', 'javascript'])

from bs4 import BeautifulSoup

import pyLDAvis
import pyLDAvis.gensim

# import stanza
# # stanza.download('en')
# nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma')

In [2]:
# Convert html file to title and body texts
def body2text(file):
    html = open(file, "rb")
    soup = BeautifulSoup(html, "html.parser")
    body = []
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    
    for line in lines:
        if line:
            body.append(line)
    title = soup.title
    if title is None:
        title_text = None
    else:
        title_text = title.string
    html.close()
    
    return [title_text, body]

In [3]:
# convert html to string
def body2text_body(html):
    soup = BeautifulSoup(html)
    result = []
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    for line in lines:
        if line:
            result.append(line)

    return result

In [4]:
stemmer = PorterStemmer()
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text))

def preprocess(sentences):
    result = []
    
    for sent in sentences:
        lemmas = []
        # tokenize
        tokens = gensim.utils.simple_preprocess(sent)
        for token in tokens:
            lemma = lemmatize_stemming(token)
            # remove stopwords
            if lemma not in stop_words:
                lemmas.append(lemma)
        # POS tagging
        nltk_tagged = nltk.pos_tag(lemmas)
        for word, tag in nltk_tagged:
            # only keep nouns
            if tag.startswith('N'):
                result.append(word)
    return result

In [5]:
def language_detect(ls):
    text = ' '.join(ls)
    result = detect(text)
    return result

In [6]:
def get_cluster_name(x, y):
    return x[y]

def custom_strip(x):
    return x.strip().strip('\n\t')

In [7]:
def most_frequent(List): 
    counter = 0
    num = List[0] 
      
    for i in List: 
        curr_frequency = List.count(i) 
        if(curr_frequency> counter): 
            counter = curr_frequency 
            num = i 
  
    return num 

In [8]:
# Produce trainset

In [9]:
# Import labelled urls
df_url_labelled = pd.read_excel('../data/DUTA_10K.xls')
df_url_labelled.drop(columns='Unnamed: 0', inplace=True)
df_url_labelled

In [10]:
# Import all the crawled html files
df_html_ls = []

path = '../data/universe-labelled/'
all_files = glob.glob(os.path.join(path, '*.html'))

for filename in all_files:
    url = filename.strip('.html').replace(path, '')
    title_text = body2text(filename)
    title = title_text[0]
    text = title_text[1]
    tup = [url, title, text]
    df_html_ls.append(tup)

df_html = pd.DataFrame(df_html_ls, columns=['url', 'title', 'body_text'])
df_html

In [11]:
# Remove dead urls
df_html = df_html[df_html['body_text'].str.len() != 0]
df_html

In [12]:
df_train = pd.merge(left=df_html, right=df_url_labelled, left_on='url', right_on='Onion_Address')
df_train.drop(columns='Onion_Address', inplace=True)
df_train.rename(columns={"url": "url", "body_text": "body_text", "Main_Class": "main_class", 
                        "Sub_Class": "sub_class", "lang": "language"}, inplace=True)
# preprocess
df_train['body_token'] = df_train['body_text'].map(preprocess)
df_train = df_train[df_train['body_token'].str.len() != 0]

df_train

In [13]:
df_train_eng = df_train[df_train['language'] == 'en']
df_train_eng

In [14]:
df_train_noneng = df_train[df_train['language'] != 'en']
df_train_noneng

In [15]:
# Produce testset

In [16]:
# read in the urls with titles and htmls
df_url = pd.read_csv('../data/universe-url-sample.csv', index_col=None, lineterminator='\n')
df_url.drop(columns=['Unnamed: 0'], inplace=True)
df_url

Unnamed: 0,url,title,body,status,time,domain
0,2zljltoqfdkqcdmf.onion,Light Money,<!doctype html>\n<html>\n\n<head>\n <meta c...,live,2020-09-29T03:31:47.370962464Z,2zljltoqfdkqcdmf
1,4zyfu5qr3xg66ape.onion,Black Shop,"<!DOCTYPE html>\n<head>\n <meta charset=""ut...",live,2020-09-29T03:31:47.79917865Z,4zyfu5qr3xg66ape
2,6wlf4cyhq4u6rlqzngh3g6qrun4l3sf6yf54ez7eo2547k...,CHEAP EUROS,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",live,2020-09-29T03:31:47.85466791Z,6wlf4cyhq4u6rlqzngh3g6qrun4l3sf6yf54ez7eo2547k...
3,6khquuorwuucomnxp5egqfolpeddgjfbacpxyrjkbgftng...,Walt Cards - Gift Card seller since 2013,<!DOCTYPE html>\r\n<html>\r\n\t<!-- Mirrored f...,live,2020-09-29T03:31:46.54407746Z,6khquuorwuucomnxp5egqfolpeddgjfbacpxyrjkbgftng...
4,5oesgn7p2zu6wpaavujjpold2sccpu7oia4afp24adujzt...,Light Money,<!doctype html>\n<html>\n\n<head>\n <meta c...,live,2020-09-29T03:31:46.748995354Z,5oesgn7p2zu6wpaavujjpold2sccpu7oia4afp24adujzt...
...,...,...,...,...,...,...
14660,3qr37zaumlym2jhcwkkqrarihdze2zp2xmmer4btkfqjrn...,,"!function(n){window.isEditMode=!1,n(window).on...",live,2020-09-30T02:09:56.213964118Z,3qr37zaumlym2jhcwkkqrarihdze2zp2xmmer4btkfqjrn...
14661,greenegbqkyk3ois.onion/wp-content/uploads/2020...,,����,live,2020-09-30T02:10:00.029866451Z,greenegbqkyk3ois
14662,greenegbqkyk3ois.onion/wp-content/plugins/cont...,,.wpcf7 .screen-reader-response {\n\tposition: ...,live,2020-09-30T02:09:59.250307811Z,greenegbqkyk3ois
14663,ctulnc6kyxkgt7fg.onion/carder-forum/14886-easy...,Easy ways to earn money as a teenager,"<!DOCTYPE html>\n<html lang=""en"">\n\t<head>\n\...",live,2020-09-30T02:09:56.863479772Z,ctulnc6kyxkgt7fg


In [17]:
# convert html to string
df_url = df_url[df_url['title'].notna()]
df_url.drop(columns=['status', 'time'], inplace=True)
# df_url = df_url[['url', 'title', 'body', 'domain']]
df_url = df_url[df_url['body'].str.len() != 0]
df_url['body_text'] = df_url['body'].map(body2text_body)

df_url

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Unnamed: 0,url,title,body,domain,body_text
0,2zljltoqfdkqcdmf.onion,Light Money,<!doctype html>\n<html>\n\n<head>\n <meta c...,2zljltoqfdkqcdmf,"[Light Money, Discount, FAQ, Proofs, Contact, ..."
1,4zyfu5qr3xg66ape.onion,Black Shop,"<!DOCTYPE html>\n<head>\n <meta charset=""ut...",4zyfu5qr3xg66ape,"[Black Shop, Home, Services, Pricing, Black Sh..."
2,6wlf4cyhq4u6rlqzngh3g6qrun4l3sf6yf54ez7eo2547k...,CHEAP EUROS,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",6wlf4cyhq4u6rlqzngh3g6qrun4l3sf6yf54ez7eo2547k...,"[CHEAP EUROS, INFO, High quality 20EUR bills w..."
3,6khquuorwuucomnxp5egqfolpeddgjfbacpxyrjkbgftng...,Walt Cards - Gift Card seller since 2013,<!DOCTYPE html>\r\n<html>\r\n\t<!-- Mirrored f...,6khquuorwuucomnxp5egqfolpeddgjfbacpxyrjkbgftng...,[Walt Cards - Gift Card seller since 2013]
4,5oesgn7p2zu6wpaavujjpold2sccpu7oia4afp24adujzt...,Light Money,<!doctype html>\n<html>\n\n<head>\n <meta c...,5oesgn7p2zu6wpaavujjpold2sccpu7oia4afp24adujzt...,"[Light Money, Discount, FAQ, Proofs, Contact, ..."
...,...,...,...,...,...
14656,hi6puvkvistna6us.onion/Order4.php,SELL CREDIT CARD & Transer WESTERN UNION & Tra...,<!DOCTYPE html>\r\n<html><head>\r\n<meta http-...,hi6puvkvistna6us,[SELL CREDIT CARD & Transer WESTERN UNION & Tr...
14657,imperi2m2v2ebxey.onion/products/master/prepaid...,Prepaid Credit Card MasterCard X1 - Imperial S...,"<!DOCTYPE html>\n<html class=""nojs html css_ve...",imperi2m2v2ebxey,[Prepaid Credit Card MasterCard X1 - Imperial ...
14658,uozu643uzpijef7f.onion/BuyMoneyGram3.php,Transfer MONEYGRAM & PAYPAL ACCOUNT & DUMPS & ...,"<!DOCTYPE html>\r\n<html lang=""en"">\r\n\r\n<he...",uozu643uzpijef7f,[Transfer MONEYGRAM & PAYPAL ACCOUNT & DUMPS &...
14663,ctulnc6kyxkgt7fg.onion/carder-forum/14886-easy...,Easy ways to earn money as a teenager,"<!DOCTYPE html>\n<html lang=""en"">\n\t<head>\n\...",ctulnc6kyxkgt7fg,"[Easy ways to earn money as a teenager, REGIST..."


In [18]:
# group urls, titles and body texts by domain
df_test = df_url.groupby('domain').agg(list)
df_test.reset_index(inplace=True)
df_test.drop(columns=['body'], inplace=True)
# df_test = df_test[['domain', 'url', 'title', 'body_text']]
df_test['body_text'] = df_test['body_text'].apply(lambda x: [z for y in x for z in y])

df_test

Unnamed: 0,domain,url,title,body_text
0,2.weedmetoas22rdij,[2.weedmetoas22rdij.onion],[Weed Me - Cannabis Weed Shop | UPS Express Sh...,[Weed Me - Cannabis Weed Shop | UPS Express Sh...
1,2222222222gwddfp,[2222222222gwddfp.onion],[Bitcoin Generator Exploit - Make Free Bitcoins!],[Bitcoin Generator Exploit - Make Free Bitcoin...
2,2222222222vf2a2e,[2222222222vf2a2e.onion],[FREECITY - Anonymous Escrow Platform],"[FREECITY - Anonymous Escrow Platform, Please ..."
3,222222222xn2ozdb2mjnkjrvcopf5thb6la6yj24jvyjqr...,[222222222xn2ozdb2mjnkjrvcopf5thb6la6yj24jvyjq...,"[Best Financial Market, Proofs, VISA Gift Card...","[Best Financial Market, × Blockonomics ..."
4,22222222dldy4lh7,[22222222dldy4lh7.onion],[100x Your Coins in 24 Hours - Officially Hidd...,[100x Your Coins in 24 Hours - Officially Hidd...
...,...,...,...,...
7650,zsolxunfmbfuq7wf,[zsolxunfmbfuq7wf.onion],[mail.riseup.net :: Welcome to mail.riseup.net],[mail.riseup.net :: Welcome to mail.riseup.net...
7651,zuvv2pd5znhhz47d,[zuvv2pd5znhhz47d.onion],"[money house: trusted, automatic Visa credit c...","[money house: trusted, automatic Visa credit c..."
7652,zw5snxarvt66el3ej5rdjjzk7frgf5tvk2enwc3yx2qq45...,[zw5snxarvt66el3ej5rdjjzk7frgf5tvk2enwc3yx2qq4...,"[Shop cloned cards, paypal, western union]","[Shop cloned cards, paypal, western union, CLO..."
7653,zwf5i7hiwmffq2bl7euedg6y5ydzze3ljiyrjmm7o42vhe...,[zwf5i7hiwmffq2bl7euedg6y5ydzze3ljiyrjmm7o42vh...,[OnionWallet Anonymous and secure Bitcoin Wall...,[OnionWallet Anonymous and secure Bitcoin Wall...


In [19]:
# preprocess
df_test['body_token'] = df_test['body_text'].map(preprocess)
df_test = df_test[df_test['body_token'].str.len() != 0]
df_test['title_token'] = df_test['title'].map(preprocess)

# detect language
df_test['language'] = df_test['body_token'].map(language_detect)
# df_test['language'] = df_test['title_token'].map(language_detect)
df_test.reset_index(inplace=True)
df_test.drop(columns='index' ,inplace=True)

df_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,domain,url,title,body_text,body_token,title_token,language
0,2.weedmetoas22rdij,[2.weedmetoas22rdij.onion],[Weed Me - Cannabis Weed Shop | UPS Express Sh...,[Weed Me - Cannabis Weed Shop | UPS Express Sh...,"[weed, cannabi, shop, express, ship, home, sho...","[weed, cannabi, shop, express, ship]",en
1,2222222222gwddfp,[2222222222gwddfp.onion],[Bitcoin Generator Exploit - Make Free Bitcoins!],[Bitcoin Generator Exploit - Make Free Bitcoin...,"[bitcoin, gener, exploit, bitcoin, time, limit...","[bitcoin, gener, exploit, bitcoin]",en
2,2222222222vf2a2e,[2222222222vf2a2e.onion],[FREECITY - Anonymous Escrow Platform],"[FREECITY - Anonymous Escrow Platform, Please ...","[anonym, escrow, platform, pleas]","[anonym, escrow, platform]",en
3,222222222xn2ozdb2mjnkjrvcopf5thb6la6yj24jvyjqr...,[222222222xn2ozdb2mjnkjrvcopf5thb6la6yj24jvyjq...,"[Best Financial Market, Proofs, VISA Gift Card...","[Best Financial Market, × Blockonomics ...","[financi, market, pay, btc, pay, exactli, btc,...","[financi, market, proof, gift, card, imperi, s...",en
4,22222222dldy4lh7,[22222222dldy4lh7.onion],[100x Your Coins in 24 Hours - Officially Hidd...,[100x Your Coins in 24 Hours - Officially Hidd...,"[hour, servic, anonym, hour, multipli, bitcoin...","[hour, servic, anonym]",en
...,...,...,...,...,...,...,...
7648,zsolxunfmbfuq7wf,[zsolxunfmbfuq7wf.onion],[mail.riseup.net :: Welcome to mail.riseup.net],[mail.riseup.net :: Welcome to mail.riseup.net...,"[mail, riseup, net, welcom, mail, riseup, net,...","[mail, riseup, net, welcom, mail, riseup, net]",en
7649,zuvv2pd5znhhz47d,[zuvv2pd5znhhz47d.onion],"[money house: trusted, automatic Visa credit c...","[money house: trusted, automatic Visa credit c...","[money, trust, automat, visa, credit, card, ma...","[money, trust, automat, visa, credit, card, ma...",en
7650,zw5snxarvt66el3ej5rdjjzk7frgf5tvk2enwc3yx2qq45...,[zw5snxarvt66el3ej5rdjjzk7frgf5tvk2enwc3yx2qq4...,"[Shop cloned cards, paypal, western union]","[Shop cloned cards, paypal, western union, CLO...","[shop, clone, card, paypal, union, clone, card...","[shop, clone, card, paypal, union]",en
7651,zwf5i7hiwmffq2bl7euedg6y5ydzze3ljiyrjmm7o42vhe...,[zwf5i7hiwmffq2bl7euedg6y5ydzze3ljiyrjmm7o42vh...,[OnionWallet Anonymous and secure Bitcoin Wall...,[OnionWallet Anonymous and secure Bitcoin Wall...,"[onionwallet, anonym, secur, bitcoin, wallet, ...","[onionwallet, anonym, secur, bitcoin, wallet, ...",en


In [20]:
df_test['url_array'] = df_test['url'].apply(lambda x: np.array(x))
df_test['name_index'] = df_test['url_array'].apply(lambda x: np.argmin(x))
df_test['cluster_name'] = df_test.apply(lambda x: get_cluster_name(x['title'], x['name_index']), axis=1)
df_test['cluster_name'] = df_test['cluster_name'].apply(lambda x: custom_strip(x))
df_test.drop(columns=['url_array', 'name_index'], inplace=True)
df_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

Unnamed: 0,domain,url,title,body_text,body_token,title_token,language,cluster_name
0,2.weedmetoas22rdij,[2.weedmetoas22rdij.onion],[Weed Me - Cannabis Weed Shop | UPS Express Sh...,[Weed Me - Cannabis Weed Shop | UPS Express Sh...,"[weed, cannabi, shop, express, ship, home, sho...","[weed, cannabi, shop, express, ship]",en,Weed Me - Cannabis Weed Shop | UPS Express Shi...
1,2222222222gwddfp,[2222222222gwddfp.onion],[Bitcoin Generator Exploit - Make Free Bitcoins!],[Bitcoin Generator Exploit - Make Free Bitcoin...,"[bitcoin, gener, exploit, bitcoin, time, limit...","[bitcoin, gener, exploit, bitcoin]",en,Bitcoin Generator Exploit - Make Free Bitcoins!
2,2222222222vf2a2e,[2222222222vf2a2e.onion],[FREECITY - Anonymous Escrow Platform],"[FREECITY - Anonymous Escrow Platform, Please ...","[anonym, escrow, platform, pleas]","[anonym, escrow, platform]",en,FREECITY - Anonymous Escrow Platform
3,222222222xn2ozdb2mjnkjrvcopf5thb6la6yj24jvyjqr...,[222222222xn2ozdb2mjnkjrvcopf5thb6la6yj24jvyjq...,"[Best Financial Market, Proofs, VISA Gift Card...","[Best Financial Market, × Blockonomics ...","[financi, market, pay, btc, pay, exactli, btc,...","[financi, market, proof, gift, card, imperi, s...",en,Best Financial Market
4,22222222dldy4lh7,[22222222dldy4lh7.onion],[100x Your Coins in 24 Hours - Officially Hidd...,[100x Your Coins in 24 Hours - Officially Hidd...,"[hour, servic, anonym, hour, multipli, bitcoin...","[hour, servic, anonym]",en,100x Your Coins in 24 Hours - Officially Hidde...
...,...,...,...,...,...,...,...,...
7648,zsolxunfmbfuq7wf,[zsolxunfmbfuq7wf.onion],[mail.riseup.net :: Welcome to mail.riseup.net],[mail.riseup.net :: Welcome to mail.riseup.net...,"[mail, riseup, net, welcom, mail, riseup, net,...","[mail, riseup, net, welcom, mail, riseup, net]",en,mail.riseup.net :: Welcome to mail.riseup.net
7649,zuvv2pd5znhhz47d,[zuvv2pd5znhhz47d.onion],"[money house: trusted, automatic Visa credit c...","[money house: trusted, automatic Visa credit c...","[money, trust, automat, visa, credit, card, ma...","[money, trust, automat, visa, credit, card, ma...",en,"money house: trusted, automatic Visa credit ca..."
7650,zw5snxarvt66el3ej5rdjjzk7frgf5tvk2enwc3yx2qq45...,[zw5snxarvt66el3ej5rdjjzk7frgf5tvk2enwc3yx2qq4...,"[Shop cloned cards, paypal, western union]","[Shop cloned cards, paypal, western union, CLO...","[shop, clone, card, paypal, union, clone, card...","[shop, clone, card, paypal, union]",en,"Shop cloned cards, paypal, western union"
7651,zwf5i7hiwmffq2bl7euedg6y5ydzze3ljiyrjmm7o42vhe...,[zwf5i7hiwmffq2bl7euedg6y5ydzze3ljiyrjmm7o42vh...,[OnionWallet Anonymous and secure Bitcoin Wall...,[OnionWallet Anonymous and secure Bitcoin Wall...,"[onionwallet, anonym, secur, bitcoin, wallet, ...","[onionwallet, anonym, secur, bitcoin, wallet, ...",en,OnionWallet Anonymous and secure Bitcoin Walle...


In [21]:
# Groupby domain name
df_test = df_test.groupby('cluster_name').agg({'domain': list, 'url': 'sum', 'title': 'sum', 'body_text': 'sum', 
                                             'body_token': 'sum', 'title_token': 'sum', 'language': list})
df_test.reset_index(inplace=True)
df_test

Unnamed: 0,cluster_name,domain,url,title,body_text,body_token,title_token,language
0,! CLONED CARDS !,[mhcge66dpacblyp55fv2i7bf37qg22xe34b4hcz2vyxpk...,[mhcge66dpacblyp55fv2i7bf37qg22xe34b4hcz2vyxpk...,[! CLONED CARDS !],"[! CLONED CARDS !, Home, Price, Shipping, Ques...","[clone, card, home, price, ship, question, con...","[clone, card]",[en]
1,$$ The Green Machine $$ - Error,[he22pncoselnm54h],[he22pncoselnm54h.onion],[$$ The Green Machine $$ - Error ],[$$ The Green Machine $$ - Error You have been...,"[machin, error, log, session, ha, expir, pleas]","[machin, error]",[en]
2,$$$ Credit Cards | PayPal | Wester Union | Mon...,[2cardsowr7u7uvpyrnc5lxuclhb4noj6q2cqf2so7ezg2...,[2cardsowr7u7uvpyrnc5lxuclhb4noj6q2cqf2so7ezg2...,[$$$ Credit Cards | PayPal | Wester Union | Mo...,[$$$ Credit Cards | PayPal | Wester Union | Mo...,"[credit, card, wester, union, moneygram, accou...","[credit, card, wester, union, moneygram, accou...",[en]
3,&#x64;&#65279;&#x61;&#x72;&#x6B;&#x2E;&#x66;&#...,[darkfailllnkf4vf],[darkfailllnkf4vf.onion],[&#x64;&#65279;&#x61;&#x72;&#x6B;&#x2E;&#x66;&...,"[d﻿ark.fai﻿l: Which darknet sites are online?,...","[ark, darknet, site, onlin, darknet, site, onl...","[darknet, site, onlin]",[en]
4,&lt;&lt;&lt;/&gt;&gt; Chaos Computer Club Dres...,[rvy6qmlqfstv6rlz],[rvy6qmlqfstv6rlz.onion],[&lt;&lt;&lt;/&gt;&gt; Chaos Computer Club Dre...,"[<<</>> Chaos Computer Club Dresden | c3d2, <<...","[chao, comput, club, dresden, chao, comput, cl...","[lt, lt, lt, gt, gt, chao, comput, club, dresden]",[en]
...,...,...,...,...,...,...,...,...
1330,淡路島,[a5h4uuqoovwvcbiecjkjlu7ew7z4lszfv67dodqol2ive...,[a5h4uuqoovwvcbiecjkjlu7ew7z4lszfv67dodqol2ive...,[淡路島],"[淡路島, 淡路島 - コシニテ人びとへの手紙 -, 玉ねぎを育もう, Update, 20...","[淡路島, 淡路島, コシニテ人びとへの手紙, 玉ねぎを育もう, updat, scale,...",[淡路島],[ja]
1331,约伊兹的萌狼乡手札,"[horoiomuy6xignjv, qty3gmiq3zhs7whorla4ynvmndp...","[horoiomuy6xignjv.onion, qty3gmiq3zhs7whorla4y...","[约伊兹的萌狼乡手札, 约伊兹的萌狼乡手札]","[约伊兹的萌狼乡手札, vertical_align_top重新开始？🙃, menu, 约伊...","[约伊兹的萌狼乡手札, menu, 约伊兹的萌狼乡手札, more_vert, ホロ, 且被...","[约伊兹的萌狼乡手札, 约伊兹的萌狼乡手札]","[en, en]"
1332,茶马古道,[7zj4oshsyhokgus6fyk7pmdiubu4mkjpjjprjkvopnhnw...,[7zj4oshsyhokgus6fyk7pmdiubu4mkjpjjprjkvopnhnw...,[茶马古道],"[茶马古道, 茶马护盾已开启欢迎来到茶马古道，页面跳转中...]","[茶马古道, 茶马护盾已开启欢迎来到茶马古道, 页面跳转中]",[茶马古道],[ko]
1333,藤原太一 犯罪者 窃盗 脅迫 たかり 亘心綜合音楽事務所 太眉,[6tert2z5hydswwvt],[6tert2z5hydswwvt.onion],[藤原太一 犯罪者 窃盗 脅迫 たかり 亘心綜合音楽事務所 太眉],[藤原太一 犯罪者 窃盗 脅迫 たかり 亘心綜合音楽事務所 太眉],"[犯罪者, 窃盗, 脅迫, たかり, 亘心綜合音楽事務所, 太眉]","[犯罪者, 窃盗, 脅迫, たかり, 亘心綜合音楽事務所, 太眉]",[ko]


In [22]:
df_test['language'] = df_test['language'].apply(lambda x: most_frequent(x))
df_test

Unnamed: 0,cluster_name,domain,url,title,body_text,body_token,title_token,language
0,! CLONED CARDS !,[mhcge66dpacblyp55fv2i7bf37qg22xe34b4hcz2vyxpk...,[mhcge66dpacblyp55fv2i7bf37qg22xe34b4hcz2vyxpk...,[! CLONED CARDS !],"[! CLONED CARDS !, Home, Price, Shipping, Ques...","[clone, card, home, price, ship, question, con...","[clone, card]",en
1,$$ The Green Machine $$ - Error,[he22pncoselnm54h],[he22pncoselnm54h.onion],[$$ The Green Machine $$ - Error ],[$$ The Green Machine $$ - Error You have been...,"[machin, error, log, session, ha, expir, pleas]","[machin, error]",en
2,$$$ Credit Cards | PayPal | Wester Union | Mon...,[2cardsowr7u7uvpyrnc5lxuclhb4noj6q2cqf2so7ezg2...,[2cardsowr7u7uvpyrnc5lxuclhb4noj6q2cqf2so7ezg2...,[$$$ Credit Cards | PayPal | Wester Union | Mo...,[$$$ Credit Cards | PayPal | Wester Union | Mo...,"[credit, card, wester, union, moneygram, accou...","[credit, card, wester, union, moneygram, accou...",en
3,&#x64;&#65279;&#x61;&#x72;&#x6B;&#x2E;&#x66;&#...,[darkfailllnkf4vf],[darkfailllnkf4vf.onion],[&#x64;&#65279;&#x61;&#x72;&#x6B;&#x2E;&#x66;&...,"[d﻿ark.fai﻿l: Which darknet sites are online?,...","[ark, darknet, site, onlin, darknet, site, onl...","[darknet, site, onlin]",en
4,&lt;&lt;&lt;/&gt;&gt; Chaos Computer Club Dres...,[rvy6qmlqfstv6rlz],[rvy6qmlqfstv6rlz.onion],[&lt;&lt;&lt;/&gt;&gt; Chaos Computer Club Dre...,"[<<</>> Chaos Computer Club Dresden | c3d2, <<...","[chao, comput, club, dresden, chao, comput, cl...","[lt, lt, lt, gt, gt, chao, comput, club, dresden]",en
...,...,...,...,...,...,...,...,...
1330,淡路島,[a5h4uuqoovwvcbiecjkjlu7ew7z4lszfv67dodqol2ive...,[a5h4uuqoovwvcbiecjkjlu7ew7z4lszfv67dodqol2ive...,[淡路島],"[淡路島, 淡路島 - コシニテ人びとへの手紙 -, 玉ねぎを育もう, Update, 20...","[淡路島, 淡路島, コシニテ人びとへの手紙, 玉ねぎを育もう, updat, scale,...",[淡路島],ja
1331,约伊兹的萌狼乡手札,"[horoiomuy6xignjv, qty3gmiq3zhs7whorla4ynvmndp...","[horoiomuy6xignjv.onion, qty3gmiq3zhs7whorla4y...","[约伊兹的萌狼乡手札, 约伊兹的萌狼乡手札]","[约伊兹的萌狼乡手札, vertical_align_top重新开始？🙃, menu, 约伊...","[约伊兹的萌狼乡手札, menu, 约伊兹的萌狼乡手札, more_vert, ホロ, 且被...","[约伊兹的萌狼乡手札, 约伊兹的萌狼乡手札]",en
1332,茶马古道,[7zj4oshsyhokgus6fyk7pmdiubu4mkjpjjprjkvopnhnw...,[7zj4oshsyhokgus6fyk7pmdiubu4mkjpjjprjkvopnhnw...,[茶马古道],"[茶马古道, 茶马护盾已开启欢迎来到茶马古道，页面跳转中...]","[茶马古道, 茶马护盾已开启欢迎来到茶马古道, 页面跳转中]",[茶马古道],ko
1333,藤原太一 犯罪者 窃盗 脅迫 たかり 亘心綜合音楽事務所 太眉,[6tert2z5hydswwvt],[6tert2z5hydswwvt.onion],[藤原太一 犯罪者 窃盗 脅迫 たかり 亘心綜合音楽事務所 太眉],[藤原太一 犯罪者 窃盗 脅迫 たかり 亘心綜合音楽事務所 太眉],"[犯罪者, 窃盗, 脅迫, たかり, 亘心綜合音楽事務所, 太眉]","[犯罪者, 窃盗, 脅迫, たかり, 亘心綜合音楽事務所, 太眉]",ko


In [23]:
df_test_eng = df_test[df_test['language'] == 'en']
df_test_eng

Unnamed: 0,cluster_name,domain,url,title,body_text,body_token,title_token,language
0,! CLONED CARDS !,[mhcge66dpacblyp55fv2i7bf37qg22xe34b4hcz2vyxpk...,[mhcge66dpacblyp55fv2i7bf37qg22xe34b4hcz2vyxpk...,[! CLONED CARDS !],"[! CLONED CARDS !, Home, Price, Shipping, Ques...","[clone, card, home, price, ship, question, con...","[clone, card]",en
1,$$ The Green Machine $$ - Error,[he22pncoselnm54h],[he22pncoselnm54h.onion],[$$ The Green Machine $$ - Error ],[$$ The Green Machine $$ - Error You have been...,"[machin, error, log, session, ha, expir, pleas]","[machin, error]",en
2,$$$ Credit Cards | PayPal | Wester Union | Mon...,[2cardsowr7u7uvpyrnc5lxuclhb4noj6q2cqf2so7ezg2...,[2cardsowr7u7uvpyrnc5lxuclhb4noj6q2cqf2so7ezg2...,[$$$ Credit Cards | PayPal | Wester Union | Mo...,[$$$ Credit Cards | PayPal | Wester Union | Mo...,"[credit, card, wester, union, moneygram, accou...","[credit, card, wester, union, moneygram, accou...",en
3,&#x64;&#65279;&#x61;&#x72;&#x6B;&#x2E;&#x66;&#...,[darkfailllnkf4vf],[darkfailllnkf4vf.onion],[&#x64;&#65279;&#x61;&#x72;&#x6B;&#x2E;&#x66;&...,"[d﻿ark.fai﻿l: Which darknet sites are online?,...","[ark, darknet, site, onlin, darknet, site, onl...","[darknet, site, onlin]",en
4,&lt;&lt;&lt;/&gt;&gt; Chaos Computer Club Dres...,[rvy6qmlqfstv6rlz],[rvy6qmlqfstv6rlz.onion],[&lt;&lt;&lt;/&gt;&gt; Chaos Computer Club Dre...,"[<<</>> Chaos Computer Club Dresden | c3d2, <<...","[chao, comput, club, dresden, chao, comput, cl...","[lt, lt, lt, gt, gt, chao, comput, club, dresden]",en
...,...,...,...,...,...,...,...,...
1304,| TheYOSH.nl,[ctzzqqimlfamyhrc],[ctzzqqimlfamyhrc.onion],[| TheYOSH.nl],"[| TheYOSH.nl, Skip to main content, User acco...","[theyosh, nl, skip, content, user, account, me...","[theyosh, nl]",en
1321,✉ Guerrilla Mail on Tor,[grrmailb3fxpjbwm],[grrmailb3fxpjbwm.onion],[✉ Guerrilla Mail on Tor],"[✉ Guerrilla Mail on Tor, TorGuerrillaMail - D...","[guerrilla, mail, tor, dispos, mail, address, ...","[guerrilla, mail, tor]",en
1322,✔ Legit Carding Services ♛ Money Transfer Worl...,[greenegbqkyk3ois],[greenegbqkyk3ois.onion],[✔ Legit Carding Services ♛ Money Transfer Wor...,[✔ Legit Carding Services ♛ Money Transfer Wor...,"[card, servic, money, transfer, worldwid, webs...","[card, servic, money, transfer, worldwid, webs...",en
1331,约伊兹的萌狼乡手札,"[horoiomuy6xignjv, qty3gmiq3zhs7whorla4ynvmndp...","[horoiomuy6xignjv.onion, qty3gmiq3zhs7whorla4y...","[约伊兹的萌狼乡手札, 约伊兹的萌狼乡手札]","[约伊兹的萌狼乡手札, vertical_align_top重新开始？🙃, menu, 约伊...","[约伊兹的萌狼乡手札, menu, 约伊兹的萌狼乡手札, more_vert, ホロ, 且被...","[约伊兹的萌狼乡手札, 约伊兹的萌狼乡手札]",en


In [24]:
df_test_noneng = df_test[df_test['language'] != 'en']
df_test_noneng

Unnamed: 0,cluster_name,domain,url,title,body_text,body_token,title_token,language
6,*** JES 2020 *** Кшись и Компания. Частное рад...,[73ovw2epsc4kga45lhz4sm3ulfzemdzlhxlj6p2b7i4py...,[73ovw2epsc4kga45lhz4sm3ulfzemdzlhxlj6p2b7i4py...,[*** JES 2020 *** Кшись и Компания. Частное ра...,[*** JES 2020 *** Кшись и Компания. Частное ра...,"[je, кшись, компания, частное, радио, телевиде...","[je, кшись, компания, частное, радио, телевиде...",ru
22,24BOT,[24boths2mh6sxaz5],[24boths2mh6sxaz5.onion],[24BOT],"[24BOT, 24boths2mh6sxaz5.onion - моментальный ...","[bot, mh, sxaz, onion, моментальный, магазин, ...",[bot],ru
23,2channel,[2chagyntyms53ruy],[2chagyntyms53ruy.onion],[2channel],"[2channel, 2channel.moe, Добро пожаловать., О ...","[channel, channel, moe, добро, пожаловать, общ...",[channel],ru
53,Accueil,[djypjjvw532evfw3],[djypjjvw532evfw3.onion],[Accueil],"[Accueil, Accueil, Produits, Contact/Commande,...","[accueil, accueil, produit, contact, command, ...",[accueil],fr
56,Adult puzzle ~ Galleria,[2adud6uuxl2tg7aj],[2adud6uuxl2tg7aj.onion],[Adult puzzle ~ Galleria],"[Adult puzzle ~ Galleria, Adult puzzle ~ Galle...","[adult, puzzl, galleria, adult, puzzl, galleri...","[adult, puzzl, galleria]",ro
...,...,...,...,...,...,...,...,...
1328,暗网中文担保交易市场,[txxh3pmeihpcw4pe],[txxh3pmeihpcw4pe.onion],[暗网中文担保交易市场],"[暗网中文担保交易市场, 本市场并无分号，独此一家，其他均为假冒。价格显示为美元，支付结算时...","[暗网中文担保交易市场, 独此一家, 其他均为假冒, 价格显示为美元, 支付结算时将转换为比...",[暗网中文担保交易市场],zh-cn
1329,暗网中文担保交易市场 - 一个小高端中文安全买卖平台暗网中文担保交易市场,"[m6onsfnvo4iolyix, moi3v3c77y5ckdsh]","[m6onsfnvo4iolyix.onion, moi3v3c77y5ckdsh.onio...","[暗网中文担保交易市场 - 一个小高端中文安全买卖平台暗网中文担保交易市场, 暗网中文担保交...","[暗网中文担保交易市场 - 一个小高端中文安全买卖平台暗网中文担保交易市场, 本市场独此一家...","[暗网中文担保交易市场, 其他均为假冒, 价格显示为美元, 支付结算时将转换为比特币, 莱特...","[暗网中文担保交易市场, 暗网中文担保交易市场, 关于春节期间安排的通知, 买家规则, 担保...",zh-cn
1330,淡路島,[a5h4uuqoovwvcbiecjkjlu7ew7z4lszfv67dodqol2ive...,[a5h4uuqoovwvcbiecjkjlu7ew7z4lszfv67dodqol2ive...,[淡路島],"[淡路島, 淡路島 - コシニテ人びとへの手紙 -, 玉ねぎを育もう, Update, 20...","[淡路島, 淡路島, コシニテ人びとへの手紙, 玉ねぎを育もう, updat, scale,...",[淡路島],ja
1332,茶马古道,[7zj4oshsyhokgus6fyk7pmdiubu4mkjpjjprjkvopnhnw...,[7zj4oshsyhokgus6fyk7pmdiubu4mkjpjjprjkvopnhnw...,[茶马古道],"[茶马古道, 茶马护盾已开启欢迎来到茶马古道，页面跳转中...]","[茶马古道, 茶马护盾已开启欢迎来到茶马古道, 页面跳转中]",[茶马古道],ko


In [25]:
df_train.to_csv("../data/trainset.csv")
df_test.to_csv("../data/testset.csv")