In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn  as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline 

In [2]:
#data cleaning
def load_dataset(url):
    """Read a CSV file, normalise its header names and drop three columns.

    Parameters
    ----------
    url : str, path-like or file-like
        Passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with lower-cased, whitespace-stripped column names
        and without the ``filename``, ``url`` and ``domain`` columns.

    Raises
    ------
    KeyError
        If any of the three columns to drop is absent after the header
        names have been normalised.
    """
    raw = pd.read_csv(url)

    # Normalise the header: lower-case first, then trim surrounding blanks.
    normalised = raw.columns.str.lower().str.strip()
    raw.columns = normalised

    unused = ['filename', 'url', 'domain']
    return raw.drop(columns=unused)
    

In [3]:
# Load the CSV that sits next to the notebook through load_dataset() and keep
# the result in the notebook-level name `df`.
df = load_dataset("./PhiUSIIL_Phishing_URL_Dataset.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,urllength,domainlength,isdomainip,tld,urlsimilarityindex,charcontinuationrate,tldlegitimateprob,urlcharprob,tldlength,noofsubdomain,...,pay,crypto,hascopyrightinfo,noofimage,noofcss,noofjs,noofselfref,noofemptyref,noofexternalref,label
0,31,24,0,com,100.000000,1.000000,0.522907,0.061933,3,1,...,0,0,1,34,20,28,119,0,124,1
1,23,16,0,de,100.000000,0.666667,0.032650,0.050207,2,1,...,0,0,1,50,9,8,39,0,217,1
2,29,22,0,uk,100.000000,0.866667,0.028555,0.064129,2,2,...,0,0,1,10,2,7,42,2,5,1
3,26,19,0,com,100.000000,1.000000,0.522907,0.057606,3,1,...,1,1,1,3,27,15,22,1,31,1
4,33,26,0,org,100.000000,1.000000,0.079963,0.059441,3,1,...,1,0,1,244,15,34,72,1,85,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235790,29,22,0,com,100.000000,1.000000,0.522907,0.058739,3,1,...,1,0,1,51,7,21,187,2,191,1
235791,28,21,0,uk,100.000000,0.785714,0.028555,0.053834,2,2,...,1,0,0,50,1,7,88,0,31,1
235792,30,23,0,be,100.000000,1.000000,0.003319,0.063093,2,1,...,0,0,1,27,10,30,58,2,67,1
235793,55,47,0,dev,28.157537,0.465116,0.000961,0.050211,3,2,...,0,0,0,0,0,3,0,0,0,0


In [4]:
#feature scaling
def scale_features(data=None, num_features=None):
    """Standardise selected columns of a DataFrame in place.

    Each selected column is replaced by the output of
    ``StandardScaler.fit_transform`` (zero mean, unit variance).

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame whose columns are overwritten. When omitted, the notebook-level
        ``df`` is used, so the existing zero-argument call keeps working.
    num_features : list of str, optional
        Names of the columns to scale. When omitted, the seven columns listed
        in the body are used.

    Returns
    -------
    pandas.DataFrame
        The same object that was scaled (``data`` or the global ``df``).

    Raises
    ------
    KeyError
        If one of the requested columns is not present in the frame.

    Notes
    -----
    The scaler is fitted on every row it is given. If this runs before the
    train/test split, statistics of the held-out rows leak into the training
    features; pass only the training rows when that matters.
    """
    # Imported here so the function works even if the import cell lacks it.
    from sklearn.preprocessing import StandardScaler

    if data is None:
        # Backward-compatible default: operate on the module-level frame.
        data = df

    if num_features is None:
        num_features = ['urllength','domainlength','noofsubdomain',
                        'obfuscationratio','charcontinuationrate','letterratioinurl',
                        'spacialcharratioinurl']
    else:
        num_features = list(num_features)

    # Fail early with a message that names the offending columns.
    missing = [col for col in num_features if col not in data.columns]
    if missing:
        raise KeyError(f"columns not found in data: {missing}")

    scaler = StandardScaler()
    data[num_features] = scaler.fit_transform(data[num_features])
    return data

In [5]:
# scale_features() overwrites the selected columns of the global `df` in place.
scale_features()
# Display the frame again so the scaled values are visible in the output.
df

Unnamed: 0,urllength,domainlength,isdomainip,tld,urlsimilarityindex,charcontinuationrate,tldlegitimateprob,urlcharprob,tldlength,noofsubdomain,...,pay,crypto,hascopyrightinfo,noofimage,noofcss,noofjs,noofselfref,noofemptyref,noofexternalref,label
0,-0.086486,0.276436,0,com,100.000000,0.713153,0.522907,0.061933,3,-0.274154,...,0,0,1,34,20,28,119,0,124,1
1,-0.280125,-0.597807,0,de,100.000000,-0.825555,0.032650,0.050207,2,-0.274154,...,0,0,1,50,9,8,39,0,217,1
2,-0.134896,0.057875,0,uk,100.000000,0.097670,0.028555,0.064129,2,1.389828,...,0,0,1,10,2,7,42,2,5,1
3,-0.207510,-0.269966,0,com,100.000000,0.713153,0.522907,0.057606,3,-0.274154,...,1,1,1,3,27,15,22,1,31,1
4,-0.038077,0.494997,0,org,100.000000,0.713153,0.079963,0.059441,3,-0.274154,...,1,0,1,244,15,34,72,1,85,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235790,-0.134896,0.057875,0,com,100.000000,0.713153,0.522907,0.058739,3,-0.274154,...,1,0,1,51,7,21,187,2,191,1
235791,-0.159101,-0.051405,0,uk,100.000000,-0.276016,0.028555,0.053834,2,1.389828,...,1,0,0,50,1,7,88,0,31,1
235792,-0.110691,0.167156,0,be,100.000000,0.713153,0.003319,0.063093,2,-0.274154,...,0,0,1,27,10,30,58,2,67,1
235793,0.494430,2.789885,0,dev,28.157537,-1.755936,0.000961,0.050211,3,1.389828,...,0,0,0,0,0,3,0,0,0,0


In [None]:
#train-test split
def train_test(data, test_size=0.2, val_size=0.25, random_state=42, stratify=False):
    """Split a frame into train, validation and test features and labels.

    The last column of ``data`` is taken as the label; every other column is
    a feature. The split happens in two steps: first a test set is held out,
    then the remainder is divided into training and validation sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the target.
    test_size : float or int, default 0.2
        Forwarded to the first ``train_test_split`` call.
    val_size : float or int, default 0.25
        Forwarded to the second ``train_test_split`` call. As a fraction it is
        relative to the rows left after the test set was removed, so the
        defaults give a 60 / 20 / 20 split of the original rows.
    random_state : int or None, default 42
        Seed used for both splits.
    stratify : bool, default False
        When True, both splits preserve the label proportions.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Copies keep later edits to the splits from touching `data`.
    X = data.iloc[:, :-1].copy()
    y = data.iloc[:, -1].copy()

    X_main, X_test, y_main, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_main, y_main,
        test_size=val_size,
        random_state=random_state,
        stratify=y_main if stratify else None,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


In [7]:
# Split the notebook-level `df` into train / validation / test features and labels.
X_train, X_val, X_test, y_train, y_val, y_test = train_test(df)
# Print the shape of every piece as a quick check of the split sizes.
print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)

(141477, 52) (47159, 52) (47159, 52) (141477,) (47159,) (47159,)


In [9]:
#feature selection/importance
def feature_selection(x_train, y_train, random_state=42):
    """Fit a random forest and report the importance of every feature.

    The importances are printed one per line (by column position) and drawn
    as a bar chart.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Training features. scikit-learn estimators expect numeric input, so
        string columns have to be encoded or dropped before calling this.
    y_train : array-like of shape (n_samples,)
        Training labels.
    random_state : int or None, default 42
        Seed for the forest, so repeated runs give the same importances.

    Returns
    -------
    numpy.ndarray
        ``feature_importances_`` of the fitted model, in column order.
    """
    model = RandomForestClassifier(random_state=random_state)
    model.fit(x_train, y_train)

    importance = model.feature_importances_

    for i, v in enumerate(importance):
        print(f"Feature:{i}, importance{v}")

    # seaborn >= 0.12 accepts x and y only as keywords; passing them
    # positionally raises a TypeError.
    positions = list(range(len(importance)))
    fig, ax = plt.subplots()
    sns.barplot(x=positions, y=importance, ax=ax)
    ax.set(xlabel="feature index", ylabel="importance",
           title="Random forest feature importance")
    plt.show()

    return importance

In [13]:
# List the training-feature columns that are stored with object dtype.
X_train.select_dtypes(include=["object"]).columns

Index(['tld', 'title'], dtype='object')

In [17]:
# Count how often each distinct 'tld' value occurs in the training features.
X_train['tld'].value_counts()

tld
com    67364
org    11250
net     4293
uk      3903
app     3854
       ...  
237        1
233        1
bj         1
sb         1
240        1
Name: count, Length: 606, dtype: int64