In [1]:
import pandas as pd #Biblioteca responsavel pela manipulação dos dados
import numpy as np #Biblioteca pra trabalhar com Array e Matriz
import warnings #Biblioteca que sera usado pra geramos alertas no programa
import seaborn as sns #Biblioteca de Plot
import matplotlib.pyplot as plt #Tambem uma biblioteca de plot(Trabalha em conjunto com seaborn)

#Vamos vendo com o tempo o que sera usado e descartado, como exemplo vamos deixar esses 3 como principais pra teste de acuracia.
from sklearn.ensemble import RandomForestClassifier #Modelo de Machine Learning - TESTE 1
from sklearn.linear_model import LinearRegression, LogisticRegression #Modelo de Machine Learning - TESTE 2
from sklearn.naive_bayes import MultinomialNB #use to predict

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score #Anda em conjunto com o ML, 3 tipos de métricas para aprendizagem da maquina e testes

#modelo de seleção
from sklearn.model_selection import train_test_split #Modelo de Machine Learning - TESTE 3

from nltk.tokenize import RegexpTokenizer #split words frin text
from nltk.stem.snowball import SnowballStemmer #stemmes words
from sklearn.feature_extraction.text import CountVectorizer #create sparse matrix of words usind regexp

from sklearn.tree import export_graphviz
import graphviz

In [2]:
df = pd.read_csv('malicious_phish.csv') #Coletando o primeiro dataset

In [3]:
df.fillna(df.mode().iloc[0], inplace=True) #substituir os campos nulos por zero
print(df.info()) #conteudo do dataset 1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651191 entries, 0 to 651190
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     651191 non-null  object
 1   type    651191 non-null  object
dtypes: object(2)
memory usage: 9.9+ MB
None


In [4]:
df.head(10)

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign
6,espn.go.com/nba/player/_/id/3457/brandon-rush,benign
7,yourbittorrent.com/?q=anthony-hamilton-soulife,benign
8,http://www.pashminaonline.com/pure-pashminas,defacement
9,allmusic.com/album/crazy-from-the-heat-r16990,benign


In [5]:
df.isnull().sum() #mostra os valores nulos no dataset 1

url     0
type    0
dtype: int64

In [6]:
df.describe() #Conteudo do dataset 1: 641.191 dados e 4 tipos de classificação

Unnamed: 0,url,type
count,651191,651191
unique,641119,4
top,http://style.org.hc360.com/css/detail/mysite/s...,benign
freq,180,428103


In [7]:
print(f"Colunas do dataframe DF {df.columns}, \nValores para ML: {df['type'].value_counts()}, \nConteúdo: {df.index}\n" )

Colunas do dataframe DF Index(['url', 'type'], dtype='object'), 
Valores para ML: type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64, 
Conteúdo: RangeIndex(start=0, stop=651191, step=1)



In [8]:
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

In [10]:
print(df.url[0])

tokenizer.tokenize(df.url[0])

br-icloud.com.br


['br', 'icloud', 'com', 'br']

In [11]:
df['text_tokenized'] = df.url.map(lambda t: tokenizer.tokenize(t))

In [12]:
df.sample(5)

Unnamed: 0,url,type,text_tokenized
32742,youtube.com/watch?v=RMwy-wCSlig,benign,"[youtube, com, watch, v, RMwy, wCSlig]"
31068,legrandcharles.wordpress.com/,benign,"[legrandcharles, wordpress, com]"
3820,central-pages-security.ga,phishing,"[central, pages, security, ga]"
234623,euescuto.com.br/2009/08/06/howlin-rain-download/,benign,"[euescuto, com, br, howlin, rain, download]"
554716,https://onedrive.live.com/download?cid=DD0A0EC...,malware,"[https, onedrive, live, com, download, cid, DD..."


In [13]:
stemmer = SnowballStemmer("english")

In [14]:
df['text_stemmed'] = df['text_tokenized'].map(lambda l: [stemmer.stem(word) for word in l])

In [15]:
df.sample(5)

Unnamed: 0,url,type,text_tokenized,text_stemmed
40851,cannon-digital-cameras-1.blogspot.com/2008_11_...,benign,"[cannon, digital, cameras, blogspot, com, arch...","[cannon, digit, camera, blogspot, com, archiv,..."
297148,cbc.ca/news/canadavotes/riding/217/candidate.html,benign,"[cbc, ca, news, canadavotes, riding, candidate...","[cbc, ca, news, canadavot, ride, candid, html]"
246705,http://9779.info/ppt%E5%89%AA%E8%B4%B4%E7%94%B...,malware,"[http, info, ppt, E, AA, E, B, B, E, BB, E, B,...","[http, info, ppt, e, aa, e, b, b, e, bb, e, b,..."
611599,www.amec.es/amec/ServletControler?accion=amtex...,phishing,"[www, amec, es, amec, ServletControler, accion...","[www, amec, es, amec, servletcontrol, accion, ..."
165150,http://rubybrand.com/index.php?option=com_cont...,defacement,"[http, rubybrand, com, index, php, option, com...","[http, rubybrand, com, index, php, option, com..."


In [16]:
df['text_sent'] = df['text_stemmed'].map(lambda l: ' '.join(l))

In [17]:
df.sample(5)

Unnamed: 0,url,type,text_tokenized,text_stemmed,text_sent
429751,guitar-rigs.com/,benign,"[guitar, rigs, com]","[guitar, rig, com]",guitar rig com
21123,yazehra.com/mubarak.htm,benign,"[yazehra, com, mubarak, htm]","[yazehra, com, mubarak, htm]",yazehra com mubarak htm
138247,http://www.motiv8.cz/osobni-rozvoj/role-obchod...,defacement,"[http, www, motiv, cz, osobni, rozvoj, role, o...","[http, www, motiv, cz, osobni, rozvoj, role, o...",http www motiv cz osobni rozvoj role obchodnih...
215566,http://www.billh.net/camelotcraftingdaoc/phpBB...,defacement,"[http, www, billh, net, camelotcraftingdaoc, p...","[http, www, billh, net, camelotcraftingdaoc, p...",http www billh net camelotcraftingdaoc phpbb v...
169301,askville.amazon.com/SimilarQuestions.do?req=li...,benign,"[askville, amazon, com, SimilarQuestions, do, ...","[askvill, amazon, com, similarquest, do, req, ...",askvill amazon com similarquest do req list sp...


In [18]:
CountVectorize = CountVectorizer()

In [19]:
feat = CountVectorize.fit_transform(df.text_sent) #transforma todo o texto que foi stemado e tokenizado

In [20]:
feat[:5].toarray() #convertendo uma matriz sparse para um array para imprimir a variavel feat

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [21]:
trainX, testX, trainY, testY = train_test_split(feat, df.type)

In [22]:
#LOGISTIC REGRESSION
lr = LogisticRegression()

In [23]:
lr.fit(trainX,trainY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
lr.score(testX,testY)

0.9505276477598005