# Modelling
---
Build a simple model with logistic regression and simple feature eng

In [1]:
import pandas as pd

In [113]:
df = pd.read_csv("../data/malicious_phish.csv")
df.head(2)

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign


# Feature Engineering

In [50]:
for htt in ['http',"https"]:
    df[htt] = df['url'].apply(lambda x: htt in x)
    df['url'] = df['url'].apply(lambda x: x.split("://")[-1])

In [55]:
df['url'] = df['url'].apply(lambda x: x.replace("www.",""))

In [62]:
for end in [".com",".net",".gov",".edu"]:
    df[end] = df['url'].apply(lambda x: end in x)

In [64]:
df['dotCount'] = df['url'].apply(lambda x: x.count("."))
df['SlashCount'] = df['url'].apply(lambda x: x.count("/"))
df['InterrogationCount'] = df['url'].apply(lambda x: x.count("?"))
df['lenUrl'] = df['url'].apply(lambda x: len(x))

In [75]:
df.sample(10)

Unnamed: 0,url,type,http,https,.com,.net,.gov,.edu,dotCount,SlashCount,InterrogationCount,lenUrl
176977,jtbgreece.com/europe-short-escapes,defacement,True,False,True,False,False,False,1,1,0,34
328242,swimming.about.com/od/olympicswimmingmedalists...,benign,False,False,True,False,False,False,3,4,0,68
40747,biola.edu/,benign,False,False,False,False,False,True,1,1,0,10
350630,peaceoneday.org/,benign,False,False,False,False,False,False,1,1,0,16
457719,skyrock.com/videos/eDJjcno0dDoxMw==-Jason-Sude...,benign,True,False,True,False,False,False,2,2,1,122
572390,antica.co/en/wp-content//themes/antica/js/dir/...,benign,False,False,False,False,False,False,1,9,0,79
175783,wijzijnnietgevaarlijk.nl/honden-fotos/honden-v...,defacement,True,False,False,False,False,False,2,2,0,58
458675,blog.angryasianman.com/2011/10/california-asse...,benign,False,False,True,False,False,False,3,3,0,73
569016,eu.diablo.net.ms.sy-login.in/login.html?app=wa...,benign,False,False,True,True,False,False,7,4,1,107
78696,people.famouswhy.com/henry_blodgett/,benign,False,False,True,False,False,False,2,2,0,36


In [111]:
def url_model_format(url="mp3raid.com/music/krizz_kaliko.html"):
    df = {}
    for htt in ['http',"https"]:
        df[htt] = int(htt in url)
        url = url.split("://")[-1]    
    
    url = url.replace("www.","")
    
    for end in [".com",".net",".gov",".edu"]:
        df[end] = int(end in url)
    
    df['dotCount'] = url.count(".")
    df['SlashCount'] = url.count("/")
    df['InterrogationCount'] = url.count("?")
    df['lenUrl'] = len(url)
    return df
url_model_format()

{'http': 0,
 'https': 0,
 '.com': 1,
 '.net': 0,
 '.gov': 0,
 '.edu': 0,
 'dotCount': 2,
 'SlashCount': 2,
 'InterrogationCount': 0,
 'lenUrl': 35}

# Data Selection

In [76]:
df['target'] = df['type'] != "benign"
df.drop(['url','type'],axis=1,inplace=True)
df.head(2)

Unnamed: 0,http,https,.com,.net,.gov,.edu,dotCount,SlashCount,InterrogationCount,lenUrl,target
0,False,False,True,False,False,False,2,0,0,16,True
1,False,False,True,False,False,False,2,2,0,35,False


In [78]:
bools_to_int = df.select_dtypes(bool).columns
df[bools_to_int] = df[bools_to_int].astype(int)
df.head(2)

Unnamed: 0,http,https,.com,.net,.gov,.edu,dotCount,SlashCount,InterrogationCount,lenUrl,target
0,0,0,1,0,0,0,2,0,0,16,1
1,0,0,1,0,0,0,2,2,0,35,0


EDA 
---
Space to build eda further and add new features

In [82]:
df.corr()['target']

http                  0.624526
https                 0.001853
.com                 -0.275113
.net                 -0.022036
.gov                 -0.024181
.edu                 -0.000241
dotCount              0.101167
SlashCount           -0.172996
InterrogationCount    0.131495
lenUrl                0.011327
target                1.000000
Name: target, dtype: float64

# Models

In [84]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [85]:
x_cols = df.columns.drop("target").tolist()

X_train, X_test, y_train, y_test = train_test_split(
    df[x_cols], df['target'], test_size=0.33, 
    random_state=42, stratify=df['target']
)

In [88]:
model = LogisticRegression(max_iter=1e4)
model.fit(X_train,y_train)

LogisticRegression(max_iter=10000.0)

# Evaluation

In [96]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [104]:
confusion_matrix(y_true=y_test,y_pred=model.predict(X_test),normalize='true')

array([[0.93526102, 0.06473898],
       [0.33775248, 0.66224752]])

# Save Model

In [250]:
from joblib import dump, load
import os

In [252]:
if not os.path.exists("../models/"):
    os.mkdir("../models")
dump(model,"../models/mallicius-detector.pkl")

In [224]:
model = load("../models/mallicius-detector.pkl")

# Test Flask API

In [227]:
import requests
url_sample = df['url'].sample(10).iloc[0]
res = requests.post("http://127.0.0.1:5000/",data={"url":url_sample})
res.json()
print(url_sample)
#print(url_model_format(url_sample))
print(model.predict_proba([list(url_model_format(url_sample).values())])[0][1])

http://www.polska.travel.pl/pogoda-polska/gdansk/pogoda-polska/gdynia/1?miasto=27
0.8325411992989592
