In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

In [3]:
db = pd.read_csv('../Spam_URL_Detection/data-set/spam_URL.csv') 

In [4]:
db

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True
...,...,...
148298,"https://cdn.substack.com/image/fetch/f_auto,q_...",True
148299,https://numlock.substack.com/subscribe,True
148300,https://docs.google.com/forms/d/e/1FAIpQLSfDLO...,True
148301,https://bookclub.substack.com/,True


In [5]:
db.shape

(148303, 2)

In [6]:
db.describe()

Unnamed: 0,url,is_spam
count,148303,148303
unique,87581,2
top,https://www.bloomberg.com/tosv2.html,False
freq,1086,101021


In [7]:
db.groupby(db['is_spam']).size() #total of 148.303 data. There are 101.021 safe and 47.282 spam.

is_spam
False    101021
True      47282
dtype: int64

In [8]:
db['is_spam'] = db.is_spam.apply(str) # Convert boolean column to string column

In [9]:
db['is_spam'] = db['is_spam'].apply(lambda x : 1 if x == "True" in x else 0)

In [10]:
db

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,1
1,https://www.hvper.com/,1
2,https://briefingday.com/m/v4n3i4f3,1
3,https://briefingday.com/n/20200618/m#commentform,0
4,https://briefingday.com/fan,1
...,...,...
148298,"https://cdn.substack.com/image/fetch/f_auto,q_...",1
148299,https://numlock.substack.com/subscribe,1
148300,https://docs.google.com/forms/d/e/1FAIpQLSfDLO...,1
148301,https://bookclub.substack.com/,1


In [11]:
urls = db.iloc[:,0]

In [12]:
urls.head()

0    https://briefingday.us8.list-manage.com/unsubs...
1                               https://www.hvper.com/
2                   https://briefingday.com/m/v4n3i4f3
3     https://briefingday.com/n/20200618/m#commentform
4                          https://briefingday.com/fan
Name: url, dtype: object

In [13]:
ifSpam = db.iloc[:,1]

In [14]:
ifSpam.head()

0    1
1    1
2    1
3    0
4    1
Name: is_spam, dtype: int64

In [None]:
#Tokenizer

In [15]:
def extractUrl(data):
    db = str(data)
    extractSlash = db.split('/')
    result = []
    
    for i in extractSlash:
        extractDash = str(i).split('-')
        dotExtract = []
        
        for j in range(0,len(extractDash)):
            extractDot = str(extractDash[j]).split('.')
            dotExtract += extractDot
            
        result += extractDash + dotExtract
    result = list(set(result))

    return result


In [16]:
example = "http://www.emrahyldrm-1234.net"
a = extractUrl(example)
print(a)

['', 'www.emrahyldrm', 'http:', '1234.net', 'net', 'emrahyldrm', '1234', 'www']


In [17]:
urls_train, urls_test, ifSpam_train, ifSpam_test = train_test_split(urls, ifSpam, test_size=0.25)

In [18]:
cv = CountVectorizer(tokenizer=extractUrl) # tokenizer in CountVectorizer

In [19]:
features = cv.fit_transform(urls_train)



In [20]:
features_test = cv.transform(urls_test)

In [None]:
#Learning and Predicts

In [21]:
dtModel = tree.DecisionTreeClassifier() 

In [22]:
dtModel.fit(features, ifSpam_train)

In [23]:
dtPredict = dtModel.predict(features_test)

In [24]:
lsvcModel = LinearSVC()

In [25]:
lsvcModel.fit(features, ifSpam_train)

In [26]:
lsvcPredict = lsvcModel.predict(features_test)

In [27]:
sgdcModel = SGDClassifier()

In [28]:
sgdcModel.fit(features, ifSpam_train) 

In [29]:
sgdcPredict = sgdcModel.predict(features_test)

In [30]:
nbModel = MultinomialNB()

In [31]:
nbModel.fit(features, ifSpam_train)

In [32]:
nbPredict = nbModel.predict(features_test)

In [None]:
#Visualization

In [33]:
pip install scikit-learn==1.0

Collecting scikit-learn==1.0
  Using cached scikit-learn-1.0.tar.gz (7.8 MB)
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  Preparing metadata (pyproject.toml) did not run successfully.
  exit code: 1
  
  [23 lines of output]
  Partial import of sklearn during the build process.
  Traceback (most recent call last):
    File "C:\Users\user\anaconda3\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py", line 351, in <module>
      main()
    File "C:\Users\user\anaconda3\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py", line 333, in main
      json_out['return_val'] = hook(**hook_input['kwargs'])
    File "C:\Users\user\anaconda3\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py", line 152, in prepare_metadata_for_build_wheel
      return hook(metadata_directory, config_settings)
    File "C:\Users\user\AppData\Local\Temp\pip-build-env-8_ewy2zm\overlay\Lib\site-packages\setuptools\build_meta.py", line 374, in prepare_metadata_for_build_wheel
      self.run_setup()
    File "C:\Users\user\AppData\Local\Temp\pip-build-env-8_ewy2z

In [36]:
import sklearn.metrics 
import plot_confusion_matrix,plot_precision_recall_curve,plot_roc_curve


ModuleNotFoundError: No module named 'plot_confusion_matrix'

In [37]:
import sklearn.metrics
from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve


ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (C:\Users\user\anaconda3\lib\site-packages\sklearn\metrics\__init__.py)

In [39]:
def visualization(model):
    predict = model.predict(features_test)
    plot_confusion_matrix(model,features_test,ifSpam_test)
    plot_precision_recall_curve(model,features_test,ifSpam_test)
    plot_roc_curve(model,features_test,ifSpam_test)

In [None]:
# Results

In [None]:
#Decision Tree

In [40]:
print("Number of mislabeled out of a total of %d test entries: %d" % (urls_test.shape[0], 
                                                                      (ifSpam_test != dtPredict).sum()))

Number of mislabeled out of a total of 37076 test entries: 444


In [41]:
Accuracy = 100.0 * f1_score(ifSpam_test, dtPredict, average='micro')

In [42]:
print("Accuracy % : " + str(Accuracy) + " with Decision Tree")

Accuracy % : 98.80245981227749 with Decision Tree


In [43]:
visualization(dtModel)

NameError: name 'plot_confusion_matrix' is not defined

In [48]:
# Linear Support Vector

In [49]:
print("Number of mislabeled out of a total of %d test entries: %d" %  (urls_test.shape[0], 
                                                                       (ifSpam_test != lsvcPredict).sum()))

Number of mislabeled out of a total of 37076 test entries: 535


In [50]:
Accuracy = 100.0 * f1_score(ifSpam_test, lsvcPredict, average='micro')

In [52]:
print("Accuracy % : " + str(Accuracy) + " with LinearSVC")

Accuracy % : 98.55701801704608 with LinearSVC


In [53]:
visualization(lsvcModel)

NameError: name 'plot_confusion_matrix' is not defined

In [None]:
#Stochastic Gradient Descent

In [54]:
print("Number of mislabeled out of a total of %d test entries: %d" %  (urls_test.shape[0], 
                                                                       (ifSpam_test != sgdcPredict).sum()))

Number of mislabeled out of a total of 37076 test entries: 1660


In [55]:
Accuracy = 100.0 * f1_score(ifSpam_test, sgdcPredict, average='micro')

In [57]:
print("Accuracy % : " + str(Accuracy) + " with Stochastic Gradient Descent")

Accuracy % : 95.52271010896538 with Stochastic Gradient Descent


In [58]:
visualization(sgdcModel)

NameError: name 'plot_confusion_matrix' is not defined

In [59]:
#Multinomial Naive Bayes

In [60]:
print("Number of mislabeled out of a total of %d test entries: %d" % (urls_test.shape[0], 
                                                                      (ifSpam_test != nbPredict).sum()))

Number of mislabeled out of a total of 37076 test entries: 3327


In [62]:
Accuracy = 100.0 * f1_score(ifSpam_test, nbPredict, average='micro')

In [63]:
print("Accuracy % : " + str(Accuracy) + " with Multinomial Naive Bayes")

Accuracy % : 91.02654007983601 with Multinomial Naive Bayes


In [64]:
visualization(nbModel)

NameError: name 'plot_confusion_matrix' is not defined

In [45]:
import pickle

In [None]:
#DT MODEL PICKLE

In [46]:
with open('classifier.pkl','wb') as file:
    pickle.dump(dtModel,file)

In [47]:
accuracy = 100.0 * f1_score(ifSpam_test, dtPredict, average='micro')

In [48]:
data_to_save = {
    'model': dtModel,
    'accuracy': accuracy
}

In [49]:
with open('DTclassifier.pkl', 'wb') as file:
    pickle.dump(data_to_save, file)

In [None]:
## Linear Support Vector MODEL PICKLE

In [50]:
Accuracy = 100.0 * f1_score(ifSpam_test, lsvcPredict, average='micro')

In [51]:
data_to_save = {
    'model': lsvcModel,
    'accuracy': accuracy
}

In [52]:
with open('LSVclassifier.pkl', 'wb') as file:
    pickle.dump(data_to_save, file)