# Import Dependencies

In [1]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import re
import random
from sklearn.metrics import accuracy_score

In [2]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:

def getTokens(input):
    tokensBySlash = str(input.encode('utf-8')).split('/')
    allTokens = []
    for i in tokensBySlash:
        tokens = str(i).split('-')
        tokensByDot = []
        for j in range(0,len(tokens)):
            tempTokens = str(tokens[j]).split('.')
            tokensByDot = tokensByDot + tempTokens
        allTokens = allTokens + tokens + tokensByDot
    allTokens = list(set(allTokens))
    if 'com' in allTokens:
        allTokens.remove('com')
    return allTokens

#function to remove "http://" from URL
def trim(url):
    return re.match(r'(?:\w*://)?(?:.*\.)?([a-zA-Z-1-9]*\.[a-zA-Z]{1,}).*', url).groups()[0]

# Prepare Dataset

In [5]:
#read from a file
data = pd.read_csv("../data/dataNN.csv",',',error_bad_lines=False)	#reading file
data['url'].values

array(['diaryofagameaddict.com', 'espdesign.com.au', 'iamagameaddict.com',
       ..., 'owens.edu/news-releases/?p=2052',
       '1.safesecureweb.com/egale/index.asp?item=1173',
       'yurika.otakuthon.com/reg/main.pl/en/'], dtype=object)

In [6]:
len(data)

388447

In [7]:
#convert it into numpy array and shuffle the dataset
data = np.array(data)
random.shuffle(data)


In [None]:
#convert text data into numerical data for machine learning models
y = [d[1] for d in data]
corpus = [d[0] for d in data]
vectorizer = TfidfVectorizer(tokenizer=getTokens)
X = vectorizer.fit_transform(corpus)



In [None]:
#split the data set inot train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train Machine Learning Models 

In [17]:
#1 - Logistic Regression
model = LogisticRegression(C=1)
model.fit(X_train, y_train)


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
print(model.score(X_test,y_test))

0.984245076586


In [10]:
#save the model and vectorizer
joblib.dump(model, "mal-logireg1.pkl", protocol=2)
joblib.dump(vectorizer, "vectorizer1.pkl", protocol=2)

['vectorizer1.pkl']

In [19]:
#make prediction
a = "http://www.savanvisalpara.com"
aa = vectorizer.transform([trim(a)])
s = model.predict(aa)
s[0] #0 for good

0

In [None]:
#2 - SVM
from sklearn.svm import SVC
svcModel = SVC()
svcModel.fit(X_train, y_train)
# lsvcModel = svm.LinearSVC.fit(X_train, y_train)

In [None]:
svcModel.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
m = RandomForestClassifier(n_estimators=100)
m.fit(X_train, y_train)

# Further experiment

In [None]:
index = int(0.3 * len(data))

In [13]:
from sklearn.utils import shuffle
data = pd.read_csv("data/data.csv",',',error_bad_lines=False)

data = shuffle(data)
url_train = data['url'][index:].values
label_train = data['label'][index:].values
url_test = data['url'][:index].values
label_test = data['label'][:index].values

In [15]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
        ("vectorizer", TfidfVectorizer(tokenizer=getTokens)),
        ("classifier", LogisticRegression())])

pipeline.fit(url_train, label_train)


Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [16]:
pipeline.score(url_test, label_test)

0.97097842689687131

I have stopped working on this. You may want to use advanced methods to achieve higher accuracy(i.e LSTM). Also, a very critical part here is the feature engineering. Simlpy taking input URL as an input is not a good idea. We may find other features,which are more useful than only URL string, from host info, ip, or page content.