# Good URLs extractor

---

This notebook explains exactly what was kept and what was removed from the
original datasets.

In [1]:
%pylab inline
import pandas as pd
from IPython.display import display

# Relative paths of the source CSV files read by the cells below.

# URLs-good: files under Data/URLs-good; the cells that read them only
# select/rename columns and do not filter rows by a label.
ALEXA1M_PATH = "Data/URLs-good/alexa1m.csv"
ISCX_BENIGN_PATH = "Data/URLs-good/ISCX_benign.csv"
POLAND100_PATH = "Data/URLs-good/poland100.csv"
MOZ500_PATH = "Data/URLs-good/Moz500.csv"
CATEGORIZED_DFE_PATH = "Data/URLs-good/categorized_dfe.csv"
GOV_POLAND_PATH = "Data/URLs-good/gov_poland.csv"

# URLs-mixed: files under Data/URLs-mixed; the cells that read them filter
# rows on the `label` column before the data is merged with the rest.
KAGGLE_LABELED_PATH = "Data/URLs-mixed/kaggle_labeled.csv"
MENDELEY_LABEL1M_PATH = "Data/URLs-mixed/mendeley_label1m_s.csv"
MENDELEY_LABEL300K_PATH = "Data/URLs-mixed/mendeley_label300k_s.csv"
PHISH_STORM_PATH = "Data/URLs-mixed/phish_storm.csv"

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Column names are supplied explicitly, so the first line of the file is read
# as data; only the second column (position 1, named 'domain') is loaded.
print('Alexa top 641k:')
alexa1m = pd.read_csv(ALEXA1M_PATH, names=['iD', 'domain'], usecols=[1])

# Preview the first six rows as the cell's output.
alexa1m.head(6)

Alexa top 641k:


Unnamed: 0,domain
0,google.com
1,youtube.com
2,tmall.com
3,baidu.com
4,qq.com
5,sohu.com


In [3]:
# The single column is named 'domain' here; because names are given
# explicitly, the first line of the file is read as data.
print('ISCX2016 benign 35k:')
iscx_benign = pd.read_csv(ISCX_BENIGN_PATH, names=['domain'])

# Preview the first six rows as the cell's output.
iscx_benign.head(6)

ISCX2016 benign 35k:


Unnamed: 0,domain
0,http://1337x.to/torrent/1048648/American-Snipe...
1,http://1337x.to/torrent/1110018/Blackhat-2015-...
2,http://1337x.to/torrent/1122940/Blackhat-2015-...
3,http://1337x.to/torrent/1124395/Fast-and-Furio...
4,http://1337x.to/torrent/1145504/Avengers-Age-o...
5,http://1337x.to/torrent/1160078/Avengers-age-o...


In [4]:
print('Poland top 100:')
# The first CSV column is used as the row index.
poland100_chaste = pd.read_csv(POLAND100_PATH, index_col=0)

# Remove the listed columns; the one remaining column is then named 'domain'.
drop_list = ['Type']
poland100 = poland100_chaste.drop(columns=drop_list)
poland100.columns = ['domain']

print("\nDropped columns:\n", drop_list, "\n\n'Domain' renamed to 'domain'\n")

# Show the untouched raw frame for reference, then release the temporaries.
display(poland100_chaste.head(6))
del poland100_chaste, drop_list

Poland top 100:

Dropped columns:
 ['Type'] 

'Domain' renamed to 'domain'



Unnamed: 0,Type,Domain
0,YouTube.com – druga pozycja też pokrywa się z ...,youtube.com
1,Facebook.com – serwis społecznościowy służący ...,facebook.com
2,Allegro.pl – rodzimy serwis aukcyjny i najpopu...,allegro.pl
3,Google.pl – polska wersja wyszukiwarki Google.,google.pl
4,Wp.pl – portal internetowy,wp.pl
5,Onet.pl – serwis internetowy z wiadomościami,onet.pl


In [5]:
print('MOZ top 500:')
moz500_chaste = pd.read_csv(MOZ500_PATH)

# Remove the listed columns; the one remaining column is then named 'domain'.
drop_list = ['Rank', 'Linking Root Domains', 'Domain Authority']
moz500 = moz500_chaste.drop(columns=drop_list)
moz500.columns = ['domain']

print("\nDropped columns:\n", drop_list, "\n\n'Root Domain' renamed to 'domain'\n")

# Show the untouched raw frame for reference, then release the temporaries.
display(moz500_chaste.head(6))
del moz500_chaste, drop_list

MOZ top 500:

Dropped columns:
 ['Rank', 'Linking Root Domains', 'Domain Authority'] 

'Root Domain' renamed to 'domain'



Unnamed: 0,Rank,Root Domain,Linking Root Domains,Domain Authority
0,1,youtube.com,18657325,100
1,2,apple.com,5518034,100
2,3,www.google.com,12442787,100
3,4,linkedin.com,9511419,99
4,5,play.google.com,3395675,99
5,6,support.google.com,4593245,99


In [6]:
print('Categorization dataset 31k:')
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the saved output of this cell contains a warning fragment that
# looks like pandas' mixed-type DtypeWarning; confirm it is gone after re-run.
categorized_dfe_chaste = pd.read_csv(
    CATEGORIZED_DFE_PATH,
    low_memory=False
)

# Only these columns are carried forward; 'url' is renamed by name so the
# result does not depend on column order and is a new, independent frame.
kept_list = ['main_category', 'main_category_confidence', 'url']
categorized_dfe = categorized_dfe_chaste[kept_list].rename(
    columns={'url': 'domain'}
)

print(
    "\nKept columns:\n",
    kept_list,
    "\n\n'url' renamed to 'domain'\n"
)

# Show the untouched raw frame for reference, then release the temporaries.
display(categorized_dfe_chaste.head(6))
del categorized_dfe_chaste, kept_list

Categorization dataset 31k:

Kept columns:
 ['main_category', 'main_category_confidence', 'url'] 

'url' renamed to 'domain'



  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,unit_id,golden,unit_state,trusted_judgments,last_judgment_at,main_category,main_category_confidence,sub_category_arts_and_entertainment,sub_category_arts_and_entertainment_confidence,sub_category_autos_and_vehicles,...,sub_category_news_and_media_gold,sub_category_people_and_society_gold,sub_category_pets_and_animals_gold,sub_category_recreation_and_hobbies_gold,sub_category_reference_gold,sub_category_science_gold,sub_category_shopping_gold,sub_category_sports_gold,sub_category_travel_gold,url
0,765574675,False,finalized,5,2015-08-04T07:47:00,Internet_and_Telecom,0.5581,,0.0,,...,,,,,,,,,,000webhost.com
1,765574676,False,finalized,5,2015-08-03T22:50:00,Not_working,0.6544,,0.0,,...,,,,,,,,,,007go.com
2,765574677,False,finalized,5,2015-08-04T00:31:00,Not_working,0.6565,,0.0,,...,,,,,,,,,,010.com
3,765574678,False,finalized,5,2015-08-04T14:43:00,Career_and_Education,1.0,,0.0,,...,,,,,,,,,,022menchuang.net
4,765574679,False,finalized,5,2015-08-03T18:52:00,Not_working,1.0,,0.0,,...,,,,,,,,,,050anshin.com
5,765574680,False,finalized,5,2015-08-03T22:34:00,News_and_Media,0.5524,,0.0,,...,,,,,,,,,,055firenze.it


In [7]:
print('Poland government pages dataset 69k:')
# This file is semicolon-separated.
gov_poland_chaste = pd.read_csv(GOV_POLAND_PATH, sep=';')

# Remove the listed columns; the one remaining column is then named 'domain'.
drop_list = ['Nazwa podmiotu publicznego']
gov_poland = gov_poland_chaste.drop(columns=drop_list)
gov_poland.columns = ['domain']

print(
    "\nDropped columns:\n", drop_list,
    "\n\n'Adres strony internetowej' renamed to 'domain'\n"
)

# Show the untouched raw frame for reference, then release the temporaries.
display(gov_poland_chaste.head(6))
del gov_poland_chaste, drop_list

Poland government pages dataset 69k:

Dropped columns:
 ['Nazwa podmiotu publicznego'] 

'Adres strony internetowej' renamed to 'domain'



Unnamed: 0,Nazwa podmiotu publicznego,Adres strony internetowej
0,"„EKSPLOATATOR"" SPÓŁKA Z OGRANICZONĄ ODPOWIEDZI...",eksploatator.bip.gov.pl
1,1 BATALION CZOŁGÓW,http://1bcz.wp.mil.pl
2,1 BATALION DROGOWO-MOSTOWY,http://1bdm.wp.mil.pl/pl/28.html
3,1 BATALION STRZELCÓW PODHALAŃSKICH,www.1bsp.wp.mil.pl
4,1 LICEUM OGÓLNOKSZTAŁCĄCE IM. KS. ADAMA JERZEG...,pulawypz.6.e-bip.pl
5,1 LICEUM OGÓLNOKSZTAŁCĄCE IM. WOJCIECHA KĘTRZY...,https://lo1.gizycko.edu.pl/


In [8]:
print('Kaggle labeled 345k:')
kaggle_labeled_chaste = pd.read_csv(KAGGLE_LABELED_PATH)

# Keep every row whose label is not 'bad', discard the label column and
# renumber the rows from 0; the one remaining column is then named 'domain'.
kaggle_labeled = (
    kaggle_labeled_chaste
    .loc[kaggle_labeled_chaste['label'] != 'bad']
    .drop(columns='label')
    .reset_index(drop=True)
)
kaggle_labeled.columns = ['domain']

# The last two literals are adjacent with no comma between them, so Python
# joins them into a single print argument.
print(
    "\nAll rows with 'bad' label were dropped.",
    "\n'label' column was also dropped."
    "\n\n'url' renamed to 'domain'\n"
)

# Show the untouched raw frame for reference, then release it.
display(kaggle_labeled_chaste.head(6))
del kaggle_labeled_chaste

Kaggle labeled 345k:

All rows with 'bad' label were dropped. 
'label' column was also dropped.

'url' renamed to 'domain'



Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad
5,toddscarwash.com,bad


In [9]:
print('Mendeley labeled 1.2m:')
# The first CSV column is used as the row index.
mendeley_label1m_chaste = pd.read_csv(
    MENDELEY_LABEL1M_PATH,
    index_col=0
)

drop_list = ['label'] # + 'content', 'tld', 'https', 'url_len'
# Build the cleaned frame in one non-mutating chain: rename 'url', keep rows
# whose label is not '-', drop the listed columns and renumber rows from 0.
# The raw frame is left untouched, so re-running the cell is safe.
mendeley_label1m = (
    mendeley_label1m_chaste
    .rename(columns={'url': 'domain'})
    .loc[lambda frame: frame['label'] != '-']
    .drop(columns=drop_list)
    .reset_index(drop=True)
)
# The message names the label value actually filtered on ('-').
print(
    "\nAll rows with '-' label were dropped.",
    "\n\nThe following columns were dropped:",
    drop_list,
    "\n\n'url' renamed to 'domain'\n"
)

# Show the untouched raw frame for reference, then release the temporaries.
display(mendeley_label1m_chaste.head(6))
del mendeley_label1m_chaste, drop_list

Mendeley labeled 1.2m:


  mask |= (ar1 == a)



All rows with 'bad' label were dropped. 

The following columns were dropped: ['label'] 

'url' renamed to 'domain'



Unnamed: 0,url,ip_add,geo_loc,who_is,js_len,js_obf_len,label
0,http://members.tripod.com/russiastation/,42.77.221.155,Taiwan,+,58.0,0.0,+
1,http://www.ddj.com/cpp/184403822,3.211.202.180,United States,+,52.5,0.0,+
2,http://www.naef-usa.com/,24.232.54.41,Argentina,+,103.5,0.0,+
3,http://www.ff-b2b.de/,147.22.38.45,United States,-,720.0,532.8,-
4,http://us.imdb.com/title/tt0176269/,205.30.239.85,United States,+,46.5,0.0,+
5,http://efilmcritic.com/hbs.cgi?movie=311,8.28.167.23,United States,+,39.5,0.0,+


In [10]:
print('Mendeley labeled 300k:')
# The first CSV column is used as the row index.
mendeley_label300k_chaste = pd.read_csv(
    MENDELEY_LABEL300K_PATH,
    index_col=0
)

drop_list = ['label'] # + 'content', 'tld', 'https', 'url_len'
# Build the cleaned frame in one non-mutating chain: rename 'url', keep rows
# whose label is not '-', drop the listed columns and renumber rows from 0.
# The raw frame is left untouched, so re-running the cell is safe.
mendeley_label300k = (
    mendeley_label300k_chaste
    .rename(columns={'url': 'domain'})
    .loc[lambda frame: frame['label'] != '-']
    .drop(columns=drop_list)
    .reset_index(drop=True)
)
# The message names the label value actually filtered on ('-').
print(
    "\nAll rows with '-' label were dropped.",
    "\n\nThe following columns were dropped:",
    drop_list,
    "\n\n'url' renamed to 'domain'\n"
)

# Show the untouched raw frame for reference, then release the temporaries.
display(mendeley_label300k_chaste.head(6))
del mendeley_label300k_chaste, drop_list

Mendeley labeled 300k:

All rows with 'bad' label were dropped. 

The following columns were dropped: ['label'] 

'url' renamed to 'domain'



Unnamed: 0,url,ip_add,geo_loc,who_is,js_len,js_obf_len,label
0,http://www.dutchthewiz.com/freeware/,175.67.214.68,China,+,38.5,0.0,+
1,http://www.collectiblejewels.com,188.120.171.121,Sweden,-,187.0,0.0,+
2,http://www.deadlinedata.com,193.51.170.1,France,+,31.0,0.0,+
3,http://www.mil.fi/maavoimat/kalustoesittely/00...,13.237.35.44,Australia,+,152.0,0.0,+
4,http://www.avclub.com/content/node/24539,220.193.62.89,China,+,150.0,0.0,+
5,http://www.yellowstone.net/baptist,13.122.80.132,United States,+,140.5,0.0,+


In [11]:
print('PhishStorm data 96k:')
phish_storm_chaste = pd.read_csv(
    PHISH_STORM_PATH
)

drop_list = ['label', 'ranking']
# Keep rows whose label is not 1.0, drop the listed columns and renumber the
# rows from 0 so the index has no gaps, matching the other filtered datasets
# in this notebook. The remaining feature columns are kept as they are.
phish_storm = (
    phish_storm_chaste
    .loc[phish_storm_chaste['label'] != 1.0]
    .drop(columns=drop_list)
    .reset_index(drop=True)
)
print(
    "\nAll rows with '1.0' label were dropped.",
    "\n\nThe following columns were dropped:",
    drop_list,
)

# Show the untouched raw frame for reference, then release the temporaries.
display(phish_storm_chaste.head(6))
del phish_storm_chaste, drop_list

PhisStorm data 96k:

All rows with '1.0' label were dropped. 

The following columns were dropped: ['label', 'ranking']


Unnamed: 0,domain,ranking,mld_res,mld.ps_res,card_rem,ratio_Rrem,ratio_Arem,jaccard_RR,jaccard_RA,jaccard_AR,jaccard_AA,jaccard_ARrd,jaccard_ARrem,label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,10000000,1.0,0.0,18,107.611111,107.277778,0.0,0.0,0.0,0.0,0.8,0.795729,1.0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,10000000,0.0,0.0,11,150.636364,152.272727,0.0,0.0,0.0,0.0,0.0,0.768577,1.0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,10000000,0.0,0.0,14,73.5,72.642857,0.0,0.0,0.0,0.0,0.0,0.726582,1.0
3,mail.printakid.com/www.online.americanexpress....,10000000,0.0,0.0,6,562.0,590.666667,0.0,0.0,0.0,0.0,0.0,0.85964,1.0
4,thewhiskeydregs.com/wp-content/themes/widescre...,10000000,0.0,0.0,8,29.0,24.125,0.0,0.0,0.0,0.0,0.0,0.748971,1.0
5,smilesvoegol.servebbs.org/voegol.php,10000000,0.0,0.0,2,223.5,234.0,0.0,0.0,0.0,0.0,0.0,0.852227,1.0


In [12]:
# Stack all cleaned frames row-wise, in this order. Columns are matched by
# name, so a frame that lacks a column gets NaN there, and the combined rows
# are renumbered from 0.
good_df_to_concat = [
    alexa1m, categorized_dfe, gov_poland, iscx_benign, kaggle_labeled,
    mendeley_label1m, mendeley_label300k, moz500, phish_storm, poland100,
]
URLs_good = pd.concat(good_df_to_concat, ignore_index=True)

# Preview the first rows as the cell's output.
URLs_good.head()

Unnamed: 0,domain,main_category,main_category_confidence,ip_add,geo_loc,who_is,js_len,js_obf_len,mld_res,mld.ps_res,card_rem,ratio_Rrem,ratio_Arem,jaccard_RR,jaccard_RA,jaccard_AR,jaccard_AA,jaccard_ARrd,jaccard_ARrem
0,google.com,,,,,,,,,,,,,,,,,,
1,youtube.com,,,,,,,,,,,,,,,,,,
2,tmall.com,,,,,,,,,,,,,,,,,,
3,baidu.com,,,,,,,,,,,,,,,,,,
4,qq.com,,,,,,,,,,,,,,,,,,
