In [1]:
import pandas as pd

# Malicious URLs

In [2]:
# Load the semicolon-delimited URL dataset from the working directory.
fraudulent = pd.read_csv('HiddenFraudulentURLS.csv', delimiter=';')

In [3]:
# Download the OpenPhish text feed. It is parsed without a header row, so
# pandas numbers the columns starting at 0.
openphish = 'https://openphish.com/feed.txt'
openphish_df = pd.read_csv(openphish, delimiter=' ', header=None)

In [4]:
# (rows, columns) of the loaded URL dataset.
fraudulent.shape

(185180, 8)

In [5]:
# (rows, columns) of the downloaded feed.
openphish_df.shape

(3467, 1)

In [6]:
# Peek at the first two rows to see the available columns.
fraudulent.iloc[:2]

Unnamed: 0,url,compromissionType,isHiddenFraudulent,contentLength,serverType,poweredBy,contentType,lastModified
0,http://www.sinduscongoias.com.br/index.html,defacement,False,2474,Apache/2.2,,text/html,"Sat, 05 Jan 2013 19:36:29 GMT"
1,http://www.sinduscongoias.com.br/index.php/ins...,defacement,False,0,Apache/2.2,,text/html; charset=utf-8,"Mon, 21 Jan 2013 19:30:53 GMT"


In [7]:
# Distinct class labels present in the compromissionType column.
pd.unique(fraudulent['compromissionType'])

array(['defacement', 'normal', 'phishing'], dtype=object)

In [8]:
# Per-column count of missing values.
fraudulent.isnull().sum()

url                       0
compromissionType         0
isHiddenFraudulent        0
contentLength             0
serverType             2082
poweredBy             83841
contentType            2306
lastModified          77592
dtype: int64

In [9]:
# Number of rows whose compromissionType is 'phishing'.
x = int(fraudulent['compromissionType'].eq('phishing').sum())

In [10]:
# Number of rows whose compromissionType is 'normal'.
y = int(fraudulent['compromissionType'].eq('normal').sum())

In [11]:
# Number of rows whose compromissionType is 'defacement'.
z = int(fraudulent['compromissionType'].eq('defacement').sum())

In [12]:
# Sanity check: the three per-class counts added together.
sum((x, y, z))

185180

In [13]:
# Number of rows whose isHiddenFraudulent flag is False.
int(fraudulent['isHiddenFraudulent'].eq(False).sum())

176471

In [14]:
# Keep only the URL column, as an independent copy of the loaded frame.
fraudulent_df = fraudulent.loc[:, ['url']].copy()

In [15]:
# Display the single-column frame (pandas abbreviates long frames when rendering).
fraudulent_df

Unnamed: 0,url
0,http://www.sinduscongoias.com.br/index.html
1,http://www.sinduscongoias.com.br/index.php/ins...
2,http://www.sinduscongoias.com.br/index.php/ins...
3,http://www.sinduscongoias.com.br/index.php/ins...
4,http://www.sinduscongoias.com.br/index.php/ins...
...,...
185175,http://pastehtml.com/info/1b8be47.html
185176,http://pastehtml.com/help/stats
185177,http://pastehtml.com/privacy_policy
185178,http://pastehtml.com/raw/1b8be47.html


In [16]:
# Peek at the first four feed entries.
openphish_df.iloc[:4]

Unnamed: 0,0
0,https://powersstridebattery.com/exch/exchmaili...
1,https://msggroup.azurefd.net/messages/
2,https://lloydshelp-me.com/Login.php
3,https://business-confirm.com/


In [17]:
# The feed was read without a header, so its only column is labelled 0;
# give it the same name as the URL column of the other source.
openphish_df = openphish_df.rename(columns={0: 'url'})

In [18]:
# Stack both malicious sources row-wise. Original row labels are kept, so
# index values repeat across the two inputs.
malicious_df = pd.concat(
    objs=[fraudulent_df, openphish_df],
    axis=0,
    ignore_index=False,
)

In [19]:
# Tag every row collected so far with label 1.
malicious_df = malicious_df.assign(label=1)

In [20]:
# Drop repeated URL strings, keeping the first occurrence of each.
malicious_df = malicious_df.drop_duplicates(subset=['url'], keep='first')

# Safe URLs

In [21]:
# Load the comma-separated Majestic Million file from the working directory.
safe = pd.read_csv('majestic_million.csv', sep=',')

In [22]:
# Peek at the first five rows to see the available columns.
safe.iloc[:5]

Unnamed: 0,GlobalRank,TldRank,Domain,TLD,RefSubNets,RefIPs,IDN_Domain,IDN_TLD,PrevGlobalRank,PrevTldRank,PrevRefSubNets,PrevRefIPs
0,1,1,facebook.com,com,486204,2751962,facebook.com,com,1,1,490805,2819081
1,2,2,google.com,com,478321,2498099,google.com,com,2,2,483612,2564692
2,3,3,youtube.com,com,437772,2222523,youtube.com,com,3,3,442283,2279514
3,4,4,twitter.com,com,428852,2196571,twitter.com,com,4,4,433274,2250593
4,5,5,instagram.com,com,343583,1625566,instagram.com,com,5,5,347531,1667231


In [23]:
# Keep only the Domain column, as an independent copy of the loaded frame.
safe_df = safe.loc[:, ['Domain']].copy()

In [24]:
# Tag every row of the safe list with label 0.
safe_df = safe_df.assign(label=0)

In [25]:
# Rename Domain -> url so the column names become ['url', 'label'].
safe_df = safe_df.rename(columns={'Domain': 'url'})

# Combine

In [26]:
# Stack the label-1 rows on top of the label-0 rows (row-wise concat,
# original row labels preserved).
combined_df = pd.concat(
    objs=[malicious_df, safe_df],
    axis='index',
)

In [27]:
#combined_df.head()

In [28]:
# Strip only a *leading* 'http://' or 'https://'. A plain substring replace
# also deletes the scheme wherever it occurs later in the string (for example
# a redirect target embedded in a path or query parameter), which alters the
# URL itself rather than just normalising its prefix.
combined_df['url'] = combined_df['url'].str.replace(r'^https?://', '', regex=True)

In [29]:
# Remove a literal leading 'www.' prefix. str.lstrip('www.') treats its
# argument as a *set of characters* and strips any leading run of 'w' and '.',
# which corrupts hosts such as 'wikipedia.org' -> 'ikipedia.org' or
# 'web.de' -> 'eb.de'. An anchored pattern removes the prefix and nothing else.
combined_df['url'] = combined_df['url'].str.replace(r'^www\.', '', regex=True)

In [30]:
#combined_df[combined_df['label'] == 1]

In [31]:
# Drop URL strings that became identical after normalisation. The first
# occurrence is kept, so rows earlier in the frame take precedence.
combined_df = combined_df.drop_duplicates(subset=['url'], keep='first')

In [32]:
# Display the combined, de-duplicated frame (pandas abbreviates long frames when rendering).
combined_df

Unnamed: 0,url,label
0,sinduscongoias.com.br/index.html,1
1,sinduscongoias.com.br/index.php/institucional.1,1
2,sinduscongoias.com.br/index.php/institucional/...,1
3,sinduscongoias.com.br/index.php/institucional/...,1
4,sinduscongoias.com.br/index.php/institucional/...,1
...,...,...
999995,deseretpress.net,0
999996,dieselbike.net,0
999997,hellogoodbye.net,0
999998,ikashmir.net,0


In [33]:
# Persist the url/label table; the row index is not written.
combined_df.to_csv(path_or_buf='urls.csv', index=False)