# Preprocessing Files 

## In this file, urls from the collected data are cleaned and structured

### 1. Import libraries

In [41]:
import pandas as pd

### 2. Loading collected data from files

The data is mainly collected from Phishtank and URL dataset ISCX-URL-2016 (present on kaggle), but some data is polluted or unnecessary (urls with malware or defacement). Data sources:
- https://www.kaggle.com/datasets/sid321axn/malicious-urls-dataset/data (benign data from: https://www.unb.ca/cic/datasets/url-2016.html)
- https://phishtank.org/ (online validated and https://phishtank.org/phish_search.php?valid=n&active=All&Search=Search)

In [42]:
phish_data_from_phishtank = pd.read_csv('./datasets/verified_online.csv')
non_phish_data_from_phishtank_scraped = pd.read_csv('./datasets/non-phishing-urls-phishtank-final.csv')
data_from_kaggle = pd.read_csv('./datasets/malicious_phish.csv')

### 3. First look on the data

In [43]:
phish_data_from_phishtank.head(10)

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,8546313,https://ollx.1792836.xyz/captcha,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:17+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
1,8546314,https://ufakick456.com/captcha,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:17+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
2,8546312,https://keyeseuropean.com/captcha,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:16+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
3,8546311,https://allegr0lokalnie.83473636.xyz/captcha,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:14+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
4,8546309,http://ufakick456.com/fb7pl5qw,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:13+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
5,8546310,http://ollx.1792836.xyz/fb7pl5qw,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:13+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
6,8546308,http://keyeseuropean.com/fb7pl5qw,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:11+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
7,8546307,http://allegr0lokalnie.83473636.xyz/fb7pl5qw,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:10+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
8,8546279,http://atrgptol.pages.dev/connection-module/,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:25:50+00:00,yes,2024-04-22T10:33:21+00:00,yes,Other
9,8546280,https://atrgptol.pages.dev/connection-module/,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:25:50+00:00,yes,2024-04-22T10:33:21+00:00,yes,Other


In [44]:
non_phish_data_from_phishtank_scraped.head(10)

Unnamed: 0,ID,PHISH_URL,ADDITIONAL_INFO
0,8550621,http://login.live.com.office.autooffc29536.eup...,added on Apr 25th 2024 1:01 AM by cleanmx INVA...
1,8535110,https://releases.jquery.com/,added on Apr 13th 2024 1:57 PM by verifrom INV...
2,8535109,https://code.jquery.com/,added on Apr 13th 2024 1:57 PM by verifrom INV...
3,8533875,https://app.ether.fi/blocked,added on Apr 12th 2024 3:27 PM by Felix0101 IN...
4,8530207,https://fonts.googleapis.com/css2?family=Open+...,added on Apr 10th 2024 8:25 AM by tchatzi INVA...
5,8529188,https://urlscan.io/result/31a21a49-f77d-406d-a...,added on Apr 9th 2024 8:09 PM by lhernandez IN...
6,8518134,https://buff.163.com/news/?news_game=dota2&sou...,added on Apr 3rd 2024 2:40 AM by Felix0101 INV...
7,8511079,https://login.microsoftonline.com/638fcbaf-ba4...,added on Mar 30th 2024 7:03 PM by cleanmx INVA...
8,8511077,https://login.microsoftonline.com/638fcbaf-ba4...,added on Mar 30th 2024 7:03 PM by cleanmx INVA...
9,8511075,https://login.microsoftonline.com/638fcbaf-ba4...,added on Mar 30th 2024 7:03 PM by cleanmx INVA...


In [45]:
data_from_kaggle.head(10)

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign
6,espn.go.com/nba/player/_/id/3457/brandon-rush,benign
7,yourbittorrent.com/?q=anthony-hamilton-soulife,benign
8,http://www.pashminaonline.com/pure-pashminas,defacement
9,allmusic.com/album/crazy-from-the-heat-r16990,benign


##### Length of these datasets:

In [46]:
len(phish_data_from_phishtank)

60804

In [47]:
len(non_phish_data_from_phishtank_scraped)

127320

In [48]:
len(data_from_kaggle)

651191

In [49]:
phish_data_from_phishtank.shape

(60804, 8)

In [50]:
non_phish_data_from_phishtank_scraped.shape

(127320, 3)

In [51]:
data_from_kaggle.shape

(651191, 2)

### 4. Prepare data to save in new files

#### 4.1 Preprocessing phish urls from phishtank
##### In this part the index is being reset urls are saved to new DataFrame.

In [52]:
phish_data_from_phishtank.head()

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,8546313,https://ollx.1792836.xyz/captcha,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:17+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
1,8546314,https://ufakick456.com/captcha,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:17+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
2,8546312,https://keyeseuropean.com/captcha,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:16+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
3,8546311,https://allegr0lokalnie.83473636.xyz/captcha,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:14+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other
4,8546309,http://ufakick456.com/fb7pl5qw,http://www.phishtank.com/phish_detail.php?phis...,2024-04-22T10:55:13+00:00,yes,2024-04-22T11:04:04+00:00,yes,Other


In [53]:
phish_data_df1 = phish_data_from_phishtank['url']
phish_data_df1

0                         https://ollx.1792836.xyz/captcha
1                           https://ufakick456.com/captcha
2                        https://keyeseuropean.com/captcha
3             https://allegr0lokalnie.83473636.xyz/captcha
4                           http://ufakick456.com/fb7pl5qw
                               ...                        
60799     http://www.ezblox.site/free/jennifer111/helpdesk
60800    http://www.formbuddy.com/cgi-bin/formdisp.pl?u...
60801    http://www.formbuddy.com/cgi-bin/formdisp.pl?u...
60802         http://www.habbocreditosparati.blogspot.com/
60803    http://creditiperhabbogratissicuro100.blogspot...
Name: url, Length: 60804, dtype: object

#### 4.2 Preprocessing non phish urls from phishtank (scraped)
##### In this part the index is being reset urls are saved to new DataFrame. Data is saved to two DataFrames. Df2 contains urls with deleted dots '...' occurring on the end of url (not all url scraped). Df3 contains urls without this urls (they are dropped).

##### Shortening data to not to old (oldest November 2015)

In [54]:
non_phish_data_from_phishtank_scraped = non_phish_data_from_phishtank_scraped[:(len(non_phish_data_from_phishtank_scraped)//4)]

#### Dropping dots: 

In [55]:
non_phish_data_df2 = non_phish_data_from_phishtank_scraped['PHISH_URL'].apply(lambda x: x[:-3] if x.endswith('...') else x).reset_index(drop=True)
non_phish_data_df2.head()

0    http://login.live.com.office.autooffc29536.eup...
1                         https://releases.jquery.com/
2                             https://code.jquery.com/
3                         https://app.ether.fi/blocked
4    https://fonts.googleapis.com/css2?family=Open+...
Name: PHISH_URL, dtype: object

#### Deleting urls with dots

In [56]:
full_url_filter = ~non_phish_data_from_phishtank_scraped['PHISH_URL'].str.endswith('...')
non_phish_data_df3 = non_phish_data_from_phishtank_scraped['PHISH_URL'].where(full_url_filter).dropna().reset_index(drop=True)
non_phish_data_df3.head()

0                https://releases.jquery.com/
1                    https://code.jquery.com/
2                https://app.ether.fi/blocked
3    https://efty.com/d/2zfr2/NeonClaims.com/
4           https://www.yahoo.com/?&sid=74508
Name: PHISH_URL, dtype: object

#### 4.3 Preprocessing urls from kaggle
##### In this part the index is being reset urls are saved to new DataFrame, adding missing scheme and filtering the only benign data.


In [57]:
data_from_kaggle.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [58]:
filter_benign_data = data_from_kaggle['type'] == 'benign'
benign_data = data_from_kaggle.where(filter_benign_data).dropna().reset_index(drop=True)
benign_data.head()

Unnamed: 0,url,type
0,mp3raid.com/music/krizz_kaliko.html,benign
1,bopsecrets.org/rexroth/cr/1.htm,benign
2,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign
3,espn.go.com/nba/player/_/id/3457/brandon-rush,benign
4,yourbittorrent.com/?q=anthony-hamilton-soulife,benign


In [59]:
filter_phish_data = data_from_kaggle['type'] =='phishing'
phish_data = data_from_kaggle.where(filter_phish_data).dropna().reset_index(drop=True)
phish_data.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,signin.eby.de.zukruygxctzmmqi.civpro.co.za,phishing
2,http://www.marketingbyinternet.com/mo/e56508df...,phishing
3,https://docs.google.com/spreadsheet/viewform?f...,phishing
4,retajconsultancy.com,phishing


##### Adding missing https scheme:

In [60]:
benign_data_from_kaggle_df4 = benign_data['url'].apply(lambda x: 'https://' + x if not x.startswith('https://') else x)

In [61]:
phish_data_from_kaggle_df4_1 = phish_data['url']

In [62]:
benign_data_from_kaggle_df4.head()

0          https://mp3raid.com/music/krizz_kaliko.html
1              https://bopsecrets.org/rexroth/cr/1.htm
2    https://http://buzzfil.net/m/show-art/ils-etai...
3    https://espn.go.com/nba/player/_/id/3457/brand...
4    https://yourbittorrent.com/?q=anthony-hamilton...
Name: url, dtype: object

In [63]:
phish_data_from_kaggle_df4_1.head()

0                                     br-icloud.com.br
1           signin.eby.de.zukruygxctzmmqi.civpro.co.za
2    http://www.marketingbyinternet.com/mo/e56508df...
3    https://docs.google.com/spreadsheet/viewform?f...
4                                 retajconsultancy.com
Name: url, dtype: object

### 5. Save data to files

In [64]:
phish_data_df1 = pd.DataFrame(phish_data_df1)
phish_data_df1.rename({'': 'ID', 'url': 'PHISH_URL'}, axis=1, inplace=True)
phish_data_df1.head()

Unnamed: 0,PHISH_URL
0,https://ollx.1792836.xyz/captcha
1,https://ufakick456.com/captcha
2,https://keyeseuropean.com/captcha
3,https://allegr0lokalnie.83473636.xyz/captcha
4,http://ufakick456.com/fb7pl5qw


In [65]:
non_phish_data_df2 = pd.DataFrame(non_phish_data_df2)
non_phish_data_df2.rename({'': 'ID', 'PHISH_URL': 'NON_PHISH_URL'}, axis=1, inplace=True)
non_phish_data_df2.head()

Unnamed: 0,NON_PHISH_URL
0,http://login.live.com.office.autooffc29536.eup...
1,https://releases.jquery.com/
2,https://code.jquery.com/
3,https://app.ether.fi/blocked
4,https://fonts.googleapis.com/css2?family=Open+...


In [66]:
non_phish_data_df3 = pd.DataFrame(non_phish_data_df3)
non_phish_data_df3.rename({'': 'ID', 'PHISH_URL': 'NON_PHISH_URL'}, axis=1, inplace=True)
non_phish_data_df3.head()

Unnamed: 0,NON_PHISH_URL
0,https://releases.jquery.com/
1,https://code.jquery.com/
2,https://app.ether.fi/blocked
3,https://efty.com/d/2zfr2/NeonClaims.com/
4,https://www.yahoo.com/?&sid=74508


In [67]:
benign_data_from_kaggle_df4 = pd.DataFrame(benign_data_from_kaggle_df4)
benign_data_from_kaggle_df4.rename({'': 'ID', 'url': 'NON_PHISH_URL'}, axis=1, inplace=True)
benign_data_from_kaggle_df4.head()

Unnamed: 0,NON_PHISH_URL
0,https://mp3raid.com/music/krizz_kaliko.html
1,https://bopsecrets.org/rexroth/cr/1.htm
2,https://http://buzzfil.net/m/show-art/ils-etai...
3,https://espn.go.com/nba/player/_/id/3457/brand...
4,https://yourbittorrent.com/?q=anthony-hamilton...


In [68]:
phish_data_from_kaggle_df4_1 = pd.DataFrame(phish_data_from_kaggle_df4_1)
phish_data_from_kaggle_df4_1.rename({'': 'ID', 'url': 'PHISH_URL'}, axis=1, inplace=True)
phish_data_from_kaggle_df4_1.head()

Unnamed: 0,PHISH_URL
0,br-icloud.com.br
1,signin.eby.de.zukruygxctzmmqi.civpro.co.za
2,http://www.marketingbyinternet.com/mo/e56508df...
3,https://docs.google.com/spreadsheet/viewform?f...
4,retajconsultancy.com


In [69]:
legitimate_concatenated_with_deleted_dots_df5 = pd.concat([non_phish_data_df2, benign_data_from_kaggle_df4])
legitimate_concatenated_with_deleted_dots_df5.head()

Unnamed: 0,NON_PHISH_URL
0,http://login.live.com.office.autooffc29536.eup...
1,https://releases.jquery.com/
2,https://code.jquery.com/
3,https://app.ether.fi/blocked
4,https://fonts.googleapis.com/css2?family=Open+...


In [75]:
legitimate_concatenated_with_doped_urls_with_dots_df6 = pd.concat([non_phish_data_df3, benign_data_from_kaggle_df4]).reset_index(drop=True)
legitimate_concatenated_with_doped_urls_with_dots_df6.head()

Unnamed: 0,NON_PHISH_URL
0,https://releases.jquery.com/
1,https://code.jquery.com/
2,https://app.ether.fi/blocked
3,https://efty.com/d/2zfr2/NeonClaims.com/
4,https://www.yahoo.com/?&sid=74508


In [76]:
phish_concatenated_urls_df7 = pd.concat([phish_data_df1, phish_data_from_kaggle_df4_1]).reset_index(drop=True)
phish_concatenated_urls_df7.tail()

Unnamed: 0,PHISH_URL
154910,xbox360.ign.com/objects/850/850402.html
154911,games.teamxbox.com/xbox-360/1860/Dead-Space/
154912,www.gamespot.com/xbox360/action/deadspace/
154913,en.wikipedia.org/wiki/Dead_Space_(video_game)
154914,www.angelfire.com/goth/devilmaycrytonite/


##### Length of concatenated DataFrames:

In [83]:
phish_concatenated_urls_df7.shape

(154832, 1)

In [92]:
phish_concatenated_urls_df7 = pd.DataFrame(phish_concatenated_urls_df7['PHISH_URL'].drop_duplicates())
phish_concatenated_urls_df7 = phish_concatenated_urls_df7.assign(label=pd.Series([1]*len(phish_concatenated_urls_df7), index=phish_concatenated_urls_df7.index))
phish_concatenated_urls_df7.shape

(154832, 2)

In [93]:
phish_concatenated_urls_df7.head()

Unnamed: 0,PHISH_URL,label
0,https://ollx.1792836.xyz/captcha,1
1,https://ufakick456.com/captcha,1
2,https://keyeseuropean.com/captcha,1
3,https://allegr0lokalnie.83473636.xyz/captcha,1
4,http://ufakick456.com/fb7pl5qw,1


In [85]:
legitimate_concatenated_with_deleted_dots_df5.shape

(457807, 1)

In [94]:
legitimate_concatenated_with_deleted_dots_df5 = pd.DataFrame(legitimate_concatenated_with_deleted_dots_df5['NON_PHISH_URL'].drop_duplicates())
legitimate_concatenated_with_deleted_dots_df5 = legitimate_concatenated_with_deleted_dots_df5.assign(
    label=pd.Series([0]*len(legitimate_concatenated_with_deleted_dots_df5), index=legitimate_concatenated_with_deleted_dots_df5.index))
legitimate_concatenated_with_deleted_dots_df5.shape

(457807, 2)

In [95]:
legitimate_concatenated_with_deleted_dots_df5.head()

Unnamed: 0,NON_PHISH_URL,label
0,http://login.live.com.office.autooffc29536.eup...,0
1,https://releases.jquery.com/,0
2,https://code.jquery.com/,0
3,https://app.ether.fi/blocked,0
4,https://fonts.googleapis.com/css2?family=Open+...,0


In [87]:
legitimate_concatenated_with_doped_urls_with_dots_df6.shape

(443531, 1)

In [98]:
legitimate_concatenated_with_doped_urls_with_dots_df6 = pd.DataFrame(legitimate_concatenated_with_doped_urls_with_dots_df6['NON_PHISH_URL'].drop_duplicates())
legitimate_concatenated_with_doped_urls_with_dots_df6 = legitimate_concatenated_with_doped_urls_with_dots_df6.assign(
    label=pd.Series([0]*len(legitimate_concatenated_with_doped_urls_with_dots_df6), index=legitimate_concatenated_with_doped_urls_with_dots_df6.index))
legitimate_concatenated_with_doped_urls_with_dots_df6.shape

(443531, 2)

In [99]:
legitimate_concatenated_with_doped_urls_with_dots_df6.head()

Unnamed: 0,NON_PHISH_URL,label
0,https://releases.jquery.com/,0
1,https://code.jquery.com/,0
2,https://app.ether.fi/blocked,0
3,https://efty.com/d/2zfr2/NeonClaims.com/,0
4,https://www.yahoo.com/?&sid=74508,0


#### Saving into files

In [100]:
phish_concatenated_urls_df7.to_csv('preprocessed_data/phish_data.csv')

In [101]:
legitimate_concatenated_with_deleted_dots_df5.to_csv('preprocessed_data/non_phish_data_longer.csv')

In [102]:
legitimate_concatenated_with_doped_urls_with_dots_df6.to_csv('preprocessed_data/non_phish_data_shorter.csv')

### 6. Data are prepared for analysis and ML