# Preprocessing Files 

## In this file, urls from the collected data are cleaned and structured

### 1. Import libraries

In [None]:
import pandas as pd

### 2. Loading collected data from files

The data is mainly collected from PhishTank and the ISCX-URL-2016 URL dataset (available on Kaggle), but some of the data is polluted or unnecessary (URLs labelled as malware or defacement). Data sources:
- https://www.kaggle.com/datasets/sid321axn/malicious-urls-dataset/data (benign data from: https://www.unb.ca/cic/datasets/url-2016.html)
- https://phishtank.org/ (online validated and https://phishtank.org/phish_search.php?valid=n&active=All&Search=Search)

In [None]:
# Load the three raw CSV files from the local ./datasets directory.
# Verified phishing URLs (PhishTank export).
phish_data_from_phishtank = pd.read_csv(
    filepath_or_buffer='./datasets/verified_online.csv',
)
# URLs scraped from PhishTank's "not a phish" search results.
non_phish_data_from_phishtank_scraped = pd.read_csv(
    filepath_or_buffer='./datasets/non-phishing-urls-phishtank-final.csv',
)
# Labelled URL dataset downloaded from Kaggle.
data_from_kaggle = pd.read_csv(
    filepath_or_buffer='./datasets/malicious_phish.csv',
)

### 3. First look at the data

In [None]:
# Show the first 10 rows of the PhishTank phishing data.
phish_data_from_phishtank.head(n=10)

In [None]:
# Show the first 10 rows of the scraped non-phishing data.
non_phish_data_from_phishtank_scraped.head(n=10)

In [None]:
# Show the first 10 rows of the Kaggle data.
data_from_kaggle.head(n=10)

##### Length of these datasets:

In [None]:
# Number of rows in the PhishTank phishing data.
phish_data_from_phishtank.shape[0]

In [None]:
# Number of rows in the scraped non-phishing data.
non_phish_data_from_phishtank_scraped.shape[0]

In [None]:
# Number of rows in the Kaggle data.
data_from_kaggle.shape[0]

In [None]:
# (rows, columns) of the PhishTank phishing data.
phish_data_from_phishtank.shape

In [None]:
# (rows, columns) of the scraped non-phishing data.
non_phish_data_from_phishtank_scraped.shape

In [None]:
# (rows, columns) of the Kaggle data.
data_from_kaggle.shape

### 4. Prepare data to save in new files

#### 4.1 Preprocessing phish urls from phishtank
##### In this part the `url` column is selected from the PhishTank data; it is converted to a new DataFrame in section 5.

In [None]:
# Show the first 5 rows (the default of head()).
phish_data_from_phishtank.head(n=5)

In [None]:
# Keep only the 'url' column of the PhishTank data; the other columns are not used below.
phish_data_df1 = phish_data_from_phishtank.loc[:, 'url']
phish_data_df1

#### 4.2 Preprocessing non phish urls from phishtank (scraped)
##### In this part the index is reset and the URLs are saved to two new variables. Df2 contains all URLs, with the trailing dots '...' removed from URLs that end with them (URLs that were not scraped in full). Df3 contains only the URLs that do not end with '...' (the truncated URLs are dropped).

##### Shortening the data so that it is not too old (oldest entries from November 2015)

In [None]:
# Keep only the first quarter of the rows (integer division).
# NOTE: the variable is overwritten, so re-running this cell shrinks the data again.
non_phish_data_from_phishtank_scraped = non_phish_data_from_phishtank_scraped.iloc[
    :len(non_phish_data_from_phishtank_scraped) // 4
]

#### Dropping dots: 

In [None]:
# Df2: every scraped URL is kept; a URL ending in '...' loses exactly those
# three trailing characters, all other URLs are left untouched.
non_phish_data_df2 = (
    non_phish_data_from_phishtank_scraped['PHISH_URL']
    .apply(lambda url: url if not url.endswith('...') else url[:-3])
    .reset_index(drop=True)
)
non_phish_data_df2.head()

#### Deleting urls with dots

In [None]:
# Mask that is True for URLs that do NOT end with '...'.
full_url_filter = ~non_phish_data_from_phishtank_scraped['PHISH_URL'].str.endswith('...')
# Df3: keep only the rows selected by the mask and renumber the index from 0.
non_phish_data_df3 = (
    non_phish_data_from_phishtank_scraped
    .loc[full_url_filter, 'PHISH_URL']
    .reset_index(drop=True)
)
non_phish_data_df3.head()

#### 4.3 Preprocessing urls from kaggle
##### In this part only the benign data is kept, the index is reset, the missing scheme is added, and the URLs are saved to a new DataFrame.


In [None]:
# Show the first 5 rows (the default of head()).
data_from_kaggle.head(n=5)

In [None]:
# Mask that is True only for rows whose 'type' column equals 'benign'.
filter_benign_data = data_from_kaggle['type'].eq('benign')
# where() replaces every non-benign row with NaN, dropna() then removes those
# rows (as well as any row containing a missing value), and finally the index
# is renumbered from 0.
benign_data = (
    data_from_kaggle
    .where(filter_benign_data)
    .dropna()
    .reset_index(drop=True)
)
benign_data

##### Adding missing https scheme:

In [None]:
# Prepend 'https://' only to URLs that have no HTTP(S) scheme yet.
# The previous check looked for 'https://' alone, so a URL such as
# 'http://example.com' was turned into the malformed
# 'https://http://example.com'. The comparison is case-insensitive so that
# 'HTTP://...' is also recognised as already having a scheme.
benign_data_from_kaggle_df4 = benign_data['url'].apply(
    lambda url: url
    if url.lower().startswith(('http://', 'https://'))
    else 'https://' + url
)

In [None]:
# Show the first 5 rows (the default of head()).
benign_data_from_kaggle_df4.head(n=5)

### 5. Save data to files

In [None]:
# Wrap the URLs in a DataFrame and rename the column 'url' -> 'PHISH_URL'.
# The '' -> 'ID' entry only has an effect if a column with an empty name exists.
phish_data_df1 = pd.DataFrame(phish_data_df1).rename(
    columns={'': 'ID', 'url': 'PHISH_URL'},
)
phish_data_df1.head()

In [None]:
# Wrap the URLs in a DataFrame and rename the column 'PHISH_URL' -> 'NON_PHISH_URL'.
# The '' -> 'ID' entry only has an effect if a column with an empty name exists.
non_phish_data_df2 = pd.DataFrame(non_phish_data_df2).rename(
    columns={'': 'ID', 'PHISH_URL': 'NON_PHISH_URL'},
)
non_phish_data_df2.head()

In [None]:
# Wrap the URLs in a DataFrame and rename the column 'PHISH_URL' -> 'NON_PHISH_URL'.
# The '' -> 'ID' entry only has an effect if a column with an empty name exists.
non_phish_data_df3 = pd.DataFrame(non_phish_data_df3).rename(
    columns={'': 'ID', 'PHISH_URL': 'NON_PHISH_URL'},
)
non_phish_data_df3.head()

In [None]:
# Wrap the URLs in a DataFrame and rename the column 'url' -> 'NON_PHISH_URL'.
# The '' -> 'ID' entry only has an effect if a column with an empty name exists.
benign_data_from_kaggle_df4 = pd.DataFrame(benign_data_from_kaggle_df4).rename(
    columns={'': 'ID', 'url': 'NON_PHISH_URL'},
)
benign_data_from_kaggle_df4.head()

In [None]:
# Stack the scraped URLs (trailing dots removed) on top of the benign Kaggle URLs.
# ignore_index=True renumbers the rows 0..N-1; without it both inputs keep
# their own 0-based labels, so the combined index contains duplicates and the
# index column later written to the CSV would not identify rows uniquely.
legitimate_concatenated_with_deleted_dots_df5 = pd.concat(
    [non_phish_data_df2, benign_data_from_kaggle_df4],
    ignore_index=True,
)
legitimate_concatenated_with_deleted_dots_df5.head()

In [None]:
# Stack the scraped URLs (truncated ones dropped) on top of the benign Kaggle URLs.
# ignore_index=True renumbers the rows 0..N-1; without it both inputs keep
# their own 0-based labels, so the combined index contains duplicates and the
# index column later written to the CSV would not identify rows uniquely.
legitimate_concatenated_with_doped_urls_with_dots_df6 = pd.concat(
    [non_phish_data_df3, benign_data_from_kaggle_df4],
    ignore_index=True,
)
legitimate_concatenated_with_doped_urls_with_dots_df6

##### Length of concatenated DataFrames:

In [None]:
# (rows, columns) of the phishing URLs before duplicates are removed.
phish_data_df1.shape

In [None]:
# Remove repeated phishing URLs (the first occurrence is kept) and turn the
# resulting Series back into a one-column DataFrame.
phish_data_df1 = phish_data_df1['PHISH_URL'].drop_duplicates().to_frame()
phish_data_df1.shape

In [None]:
# (rows, columns) of the "longer" legitimate set before duplicates are removed.
legitimate_concatenated_with_deleted_dots_df5.shape

In [None]:
# Remove repeated legitimate URLs (the first occurrence is kept) and turn the
# resulting Series back into a one-column DataFrame.
legitimate_concatenated_with_deleted_dots_df5 = (
    legitimate_concatenated_with_deleted_dots_df5['NON_PHISH_URL']
    .drop_duplicates()
    .to_frame()
)
legitimate_concatenated_with_deleted_dots_df5.shape

In [None]:
# (rows, columns) of the "shorter" legitimate set before duplicates are removed.
legitimate_concatenated_with_doped_urls_with_dots_df6.shape

In [None]:
# Remove repeated legitimate URLs (the first occurrence is kept) and turn the
# resulting Series back into a one-column DataFrame.
legitimate_concatenated_with_doped_urls_with_dots_df6 = (
    legitimate_concatenated_with_doped_urls_with_dots_df6['NON_PHISH_URL']
    .drop_duplicates()
    .to_frame()
)
legitimate_concatenated_with_doped_urls_with_dots_df6.shape

#### Saving into files

In [None]:
# Write the de-duplicated phishing URLs to disk; the index is written as the
# first (unnamed) column, which is the to_csv default.
phish_data_df1.to_csv(
    path_or_buf='preprocessed_data/phish_data.csv',
    index=True,
)

In [None]:
# Write the "longer" legitimate set (trailing dots removed) to disk; the index
# is written as the first (unnamed) column, which is the to_csv default.
legitimate_concatenated_with_deleted_dots_df5.to_csv(
    path_or_buf='preprocessed_data/non_phish_data_longer.csv',
    index=True,
)

In [None]:
# Write the "shorter" legitimate set (truncated URLs dropped) to disk; the
# index is written as the first (unnamed) column, which is the to_csv default.
legitimate_concatenated_with_doped_urls_with_dots_df6.to_csv(
    path_or_buf='preprocessed_data/non_phish_data_shorter.csv',
    index=True,
)

### 6. Data prepared for analysis and ML