### Importando bibliotecas

In [12]:
from scipy.io.arff import loadarff
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np

### Carregando o dataset em um DataFrame

In [13]:
# Read the training data from its CSV file into a DataFrame.
dataset_path = '../datasets/training_dataset.csv'
df = pd.read_csv(dataset_path)

### Tratando o dataset

In [14]:
# Keep every column whose name does not contain 'Unnamed'.
columns = [name for name in df.columns if 'Unnamed' not in name]

In [15]:
# Take an explicit copy of the selected columns so that columns assigned to
# `df` in later cells are written to an independent frame rather than to a
# selection of the raw one (avoids pandas' SettingWithCopyWarning).
df = df[columns].copy()

### Selecionando aleatoriamente as linhas do dataset que serão usadas no treinamento

In [16]:
# Flag roughly 75% of the rows for training. The draw uses a dedicated, seeded
# generator so the train/test split (and every result derived from it) is the
# same on each run, matching the fixed random_state used for the classifier.
split_rng = np.random.RandomState(0)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= .75

### Separando em dois DataFrames: um de treino, com cerca de 75% do dataset, e outro de teste, com cerca de 25%

In [17]:
# Rows flagged is_train go to `train`; all remaining rows go to `test`.
train_mask = df['is_train']
train = df[train_mask]
test = df[~train_mask]

### Separando as variáveis(características) do dataset

In [18]:
# Use every column except the label ('Result') and the split flag ('is_train')
# as a model input. Selecting by name, rather than taking the first 30 columns,
# keeps the label out of the inputs even if the column count or order changes.
non_feature_columns = ['Result', 'is_train']
features = df.columns[~df.columns.isin(non_feature_columns)]

### Pegando as etiquetas(resultados) de cada linha no dataset

In [19]:
# Encode the training labels as integer codes. sort=True numbers the codes by
# sorted label value (for the -1/1 labels in this dataset: -1 -> 0, 1 -> 1), so
# the encoding no longer depends on which label happens to appear first in the
# randomly drawn training rows.
y = pd.factorize(train['Result'], sort=True)[0]

### Criando o classificador e "treinando" os dados

In [20]:
# Random forest classifier with a fixed random_state and n_jobs=2;
# all other hyperparameters are left at the library defaults.
clf = RandomForestClassifier(
    random_state=0,
    n_jobs=2,
)

In [21]:
# Fit the forest on the training rows' feature columns against the encoded labels.
X_train = train[features]
clf.fit(X_train, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Aplicando a previsão do classificador treinado nos dados separados anteriormente para testes do modelo.

In [24]:
# Predicted class code for each held-out test row.
X_test = test[features]
preds = clf.predict(X_test)



In [25]:
# Per-class probability estimates for the test rows (one row per sample).
test_inputs = test[features]
clf.predict_proba(test_inputs)

array([[0.73      , 0.27      ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       ...,
       [0.65705128, 0.34294872],
       [0.        , 1.        ],
       [1.        , 0.        ]])

### Gerando a "Confusion Matrix" com os resultados previstos

In [26]:
# Cross-tabulate the actual labels (rows) against the predicted codes (columns).
pd.crosstab(
    index=test['Result'],
    columns=preds,
    rownames=['Result'],
    colnames=['Predicted'],
)

Predicted,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,1177,44
1,35,1490


### Pegando o peso de cada característica(variável) no dataset

In [27]:
# Pair each feature column name with the importance the fitted forest gave it.
feature_names = train[features].columns
features_weight = [
    (name, importance)
    for name, importance in zip(feature_names, clf.feature_importances_)
]

### Pegando as características mais importantes

In [24]:
# Keep only the (feature, importance) pairs whose importance is above 0.01.
[pair for pair in features_weight if pair[1] > 0.01]


[('having_IP_Address', 0.013309623974683937),
 ('Prefix_Suffix', 0.043451074382336925),
 ('having_Sub_Domain', 0.065823832914477159),
 ('SSLfinal_State', 0.30969862106064566),
 ('Domain_registeration_length', 0.013276880856033069),
 ('Request_URL', 0.016514685998827387),
 ('URL_of_Anchor', 0.26476687811669192),
 ('Links_in_tags', 0.050880397441754134),
 ('SFH', 0.020367888263323274),
 ('age_of_domain', 0.015638145761864904),
 ('DNSRecord', 0.012084666018642904),
 ('web_traffic', 0.067561369594475895),
 ('Page_Rank', 0.011236541233798903),
 ('Google_Index', 0.012180437502074613),
 ('Links_pointing_to_page', 0.017604748971183453)]

In [26]:
# Show the last five rows of the DataFrame.
df.tail(n=5)

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result,is_train
11050,1,-1,1,-1,1,1,1,1,-1,-1,...,-1,1,1,-1,-1,1,1,1,1,True
11051,-1,1,1,-1,-1,-1,1,-1,-1,-1,...,1,1,1,1,1,1,-1,1,-1,True
11052,1,-1,1,1,1,-1,1,-1,-1,1,...,1,1,1,1,-1,1,0,1,-1,True
11053,-1,-1,1,1,1,-1,-1,-1,1,-1,...,1,1,1,1,-1,1,1,1,-1,False
11054,-1,-1,1,1,1,-1,-1,-1,1,1,...,1,-1,1,-1,-1,-1,1,-1,-1,True
