<a href="https://colab.research.google.com/github/jy9922/URLMachineLearing/blob/main/URL_detection_work1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
import seaborn as sns
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the labelled URL dataset. The path is absolute (Colab-style '/content');
# adjust it when running the notebook elsewhere.
data = pd.read_csv('/content/url_data_2.csv', encoding='latin1')
# DataFrame.info() prints its summary and returns None, so call it directly:
# wrapping it in display() only added a stray "None" to the cell output.
data.info()

In [None]:
# Report how many rows the loaded dataset contains.
sample_count = len(data)
print('총 샘플의 수 :', sample_count)

In [None]:
# Preview the first five rows of the dataset.
data.head(5)

In [None]:
# Keep only the URL, the 'label' / 'result' columns and the feature columns.
selected_columns = [
    'url', 'label', 'result',
    'url_length', 'hostname_length', 'path_length',
    'count-', 'count@', 'count?', 'count%', 'count,', 'count=',
    'count-http', 'count-https', 'count-www', 'no_ip',
]
# .copy() makes the selection an independent frame, so the column assignments
# and de-duplication in later cells do not operate on a slice of the raw frame
# (which triggers pandas' SettingWithCopyWarning).
data = data[selected_columns].copy()

data.head(5)

In [None]:
# Encode the string labels numerically: 'url' -> 1 and 'murl' -> 0.
label_codes = {'url': 1, 'murl': 0}
data['label'] = data['label'].replace(label_codes)

In [None]:
# Number of distinct URLs and number of distinct label values.
unique_urls = data['url'].nunique()
unique_labels = data['label'].nunique()
unique_urls, unique_labels

In [None]:
# Keep only the first row for every distinct URL.
# Reassigning the result (instead of inplace=True) avoids mutating a frame that
# was derived from another one, which makes pandas emit SettingWithCopyWarning,
# and leaves `data` bound to a fresh, independent frame.
data = data.drop_duplicates(subset=['url'])
print('총 샘플의 수 : ', len(data))

In [None]:
from urllib.parse import urlparse

In [None]:
import ipaddress

In [None]:
def having_ip_address(url):
    """Return 1.0 when the URL's host is a literal IP address, otherwise 0.0.

    The original implementation passed the whole URL to
    ``ipaddress.ip_address``, which only succeeds for a bare address such as
    ``"1.2.3.4"``; any URL with a scheme, port or path was always reported as
    0.0. The host is now extracted first and only that part is validated.

    Args:
        url: URL (or bare host) to inspect; it is converted with ``str()``.

    Returns:
        float: 1.0 if the host is an IPv4/IPv6 literal, else 0.0. Note that the
        notebook stores this value in a column named ``'no_ip'``.
    """
    text = str(url)

    # Fast path: the input itself is a bare IP address.
    try:
        ipaddress.ip_address(text)
        return 1.0
    except ValueError:
        pass

    try:
        parsed = urlparse(text)
        if not parsed.netloc:
            # Without a leading "//" urlparse treats the host as part of the
            # path (or as a scheme), so retry as a scheme-relative URL.
            parsed = urlparse('//' + text)
        host = parsed.hostname
        if not host:
            return 0.0
        ipaddress.ip_address(host)
    except ValueError:
        # Raised both for malformed URLs and for hosts that are not IPs.
        return 0.0
    return 1.0

In [None]:
# Recompute the lexical features from the URL text.
url_text = data['url'].astype(str)

# Length-based features.
data['url_length'] = data['url'].apply(lambda raw: len(str(raw)))
data['hostname_length'] = url_text.apply(lambda u: len(urlparse(u).netloc))
data['path_length'] = url_text.apply(lambda u: len(urlparse(u).path))

# Occurrence counts: feature column -> substring counted in the URL.
substring_features = {
    'count-': '-',
    'count@': '@',
    'count?': '?',
    'count%': '%',
    'count,': ',',
    'count=': '=',
    'count-http': 'http',
    'count-https': 'https',
    'count-www': 'www',
}
for column, needle in substring_features.items():
    data[column] = url_text.apply(lambda u, needle=needle: u.count(needle))

# IP-address indicator computed by having_ip_address.
data['no_ip'] = url_text.apply(having_ip_address)
data.head(5)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Feature matrix (12 columns) and target column used for modelling.
feature_columns = [
    'hostname_length', 'path_length',
    'count-', 'count@', 'count?', 'count%', 'count,', 'count=',
    'count-http', 'count-https', 'count-www', 'no_ip',
]
x = data[feature_columns]
y = data['result']

print(x.shape)
print(y.shape)

# Share of each 'label' value (0 is reported as malicious, 1 as benign).
total_urls = len(data['label'])
malicious_count = int((data['label'] == 0.0).sum())
benign_count = int((data['label'] == 1.0).sum())
print("Percent Of Malicious URLs:{:.2f} %".format(malicious_count / total_urls * 100))
print("Percent Of Benign URLs:{:.2f} %".format(benign_count / total_urls * 100))


In [None]:
# Bar chart of how many rows carry each 'label' value.
fig, ax = plt.subplots(figsize=(15,5))
sns.countplot(x="label", data=data, ax=ax)
ax.set_title("Count Of URLs", fontsize="15")
ax.set_xlabel("Type of URLS", fontsize="13")
ax.set_ylabel("Number of URLs", fontsize="13")

In [None]:
# Wrap the features and the target in DataFrames.
x_df, y_df = (pd.DataFrame(part) for part in (x, y))

In [None]:
# DataFrame.dropna returns a new frame; the original cell discarded both
# results, so no rows were actually removed. Rows are now filtered with one
# shared mask so x_df and y_df keep the same index after cleaning.
# NOTE(review): later cells train on x / y rather than x_df / y_df - confirm
# which pair is meant to feed the model.
complete_rows = x_df.notna().all(axis=1) & y_df.notna().all(axis=1)
x_df = x_df.loc[complete_rows]
y_df = y_df.loc[complete_rows]
y_df

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE. A fixed random_state makes the synthetic samples
# reproducible across "Restart & Run All".
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
# 1-D array, so y_sample keeps its previous shape and column naming.
# NOTE(review): the train/test split cell uses x / y, so x_sample / y_sample
# are currently not consumed by the model - confirm the intent.
smote = SMOTE(random_state=42)
x_sample, y_sample = smote.fit_resample(x, y.to_numpy())

x_sample = pd.DataFrame(x_sample)
y_sample = pd.DataFrame(y_sample)

print("Size of x_sample:", x_sample.shape)
print("Size of y_sample:", y_sample.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set. The seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
print("Shape of x_train:", x_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of x_test:", x_test.shape)
print("Shape of y_test:", y_test.shape)

# Carve 20% of the remaining training rows out as the validation set.
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
# The x_test / y_test lines were previously mislabelled as "train".
print("Shape of x_train:", x_train.shape)
print("Shape of x_test:", x_test.shape)
print("Shape of x_valid:", x_valid.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
print("Shape of y_valid:", y_valid.shape)



In [None]:
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

In [None]:
# Fully connected binary classifier: 12 input features -> 32 -> 16 -> 8 -> 1.
# The sigmoid output pairs with the binary cross-entropy loss used at compile time.
model = Sequential([
    Dense(32, activation='relu', input_shape=(12, )),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
from tensorflow import keras

# `lr` is a deprecated alias that current Keras releases reject; the supported
# keyword is `learning_rate`.
opt = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])

# Save the weights whenever validation accuracy reaches a new maximum.
checkpointer = ModelCheckpoint('best_model.h5', monitor = 'val_acc', mode = 'max', verbose = 2, save_best_only = True)

In [None]:
# Train for 50 epochs, validating on the held-out validation split and
# checkpointing through `checkpointer`.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),
    epochs=50,
    batch_size=265,
    callbacks=[checkpointer],
)

In [None]:
# evaluate() returns [loss, acc] for the metrics configured at compile time;
# index 1 is the accuracy.
test_metrics = model.evaluate(x_test, y_test)
print("\n 테스트 정확도: %.4f" % (test_metrics[1]))

In [None]:
# Model outputs for the test set (sigmoid activations from the final Dense(1) layer).
pred_test = model.predict(x_test)

In [None]:
# Flatten the prediction array to one dimension.
preds_1d = pred_test.reshape(-1).copy()
# Threshold at 0.5: class 1 when the score is above it, class 0 otherwise.
pred_class = (preds_1d > 0.5).astype(int)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

def get_clf_eval(y_test, pred):
    """Print the confusion matrix, accuracy, precision and recall.

    Args:
        y_test: Ground-truth class labels.
        pred: Predicted class labels, aligned with ``y_test``.

    Returns:
        None. The metrics are written to standard output.
    """
    matrix = confusion_matrix(y_test, pred)
    # Compute every score before printing so a failing metric prints nothing.
    scores = tuple(
        metric(y_test, pred)
        for metric in (accuracy_score, precision_score, recall_score)
    )
    print("Confusion Matrix")
    print(matrix)
    print('정확도:{}, 정밀도:{}, 재현율:{}'.format(*scores))

# Report the test-set metrics for the thresholded predictions.
get_clf_eval(y_test=y_test, pred=pred_class)

# Persist the trained model to disk.
model.save('model.h5')