### Importando bibliotecas

In [30]:
from scipy.io.arff import loadarff
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np

### Carregando o dataset em um DataFrame

In [31]:
# Read the training data from its CSV file into a DataFrame.
dataset_path = '../datasets/training_dataset.csv'
df = pd.read_csv(dataset_path)

### Tratando o dataset

In [32]:
# Keep every column whose name does not contain 'Unnamed'.
columns = [name for name in df.columns if 'Unnamed' not in name]

In [33]:
# Restrict the DataFrame to the columns selected above.
# .copy() makes the result an independent DataFrame, so adding the 'is_train'
# column to it later does not trigger pandas' SettingWithCopyWarning.
df = df[columns].copy()

### Selecionando aleatoriamente as linhas do dataset que serão usadas no treinamento

In [34]:
# Flag roughly 75% of the rows for training (True) and the rest for testing (False).
# A dedicated, seeded generator keeps the split reproducible across kernel
# restarts; the unseeded global RNG produced a different partition on every run.
split_rng = np.random.RandomState(0)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= .75

### Separando em dois DataFrames: um de treino, com cerca de 75% das linhas do dataset, e outro de teste, com cerca de 25%

In [35]:
# Split the rows by the boolean 'is_train' flag: True -> train, False -> test.
train_mask = df['is_train']
train = df[train_mask]
test = df[~train_mask]

### Separando as variáveis(características) do dataset

In [36]:
# Feature columns are every column except the target ('Result') and the
# train/test flag ('is_train'). Selecting by name instead of a hard-coded
# count (`[:30]`) prevents the target or the flag from silently becoming a
# feature if the number of columns in the dataset changes.
features = df.columns.drop(['Result', 'is_train'])

### Pegando as etiquetas(resultados) de cada linha no dataset

In [37]:
# Encode the training labels as integer codes.
# sort=True ties the codes to the sorted label values (-1 -> 0, 1 -> 1).
# Without it the codes follow order of first appearance in `train`, so the
# mapping could flip depending on which row the random split places first.
y = pd.factorize(train['Result'], sort=True)[0]

### Criando o classificador e "treinando" os dados

In [38]:
# Random forest with a fixed seed (random_state=0), trained using 2 parallel jobs.
clf = RandomForestClassifier(
    random_state=0,
    n_jobs=2,
)

In [39]:
# Fit the classifier on the feature columns of the training rows against the
# encoded labels; the fitted estimator is the cell's displayed value.
X_train = train[features]
clf.fit(X_train, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Aplicando a previsão do classificador treinado nos dados separados anteriormente para testes do modelo.

In [40]:
# Predict the encoded class of every row in the held-out test set.
X_test = test[features]
preds = clf.predict(X_test)



In [41]:
# Show the per-class probabilities the forest assigns to each test row.
test_features = test[features]
clf.predict_proba(test_features)

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

### Gerando a "Confusion Matrix" com os resultados previstos

In [42]:
# Cross-tabulate actual labels (rows, original 'Result' values) against the
# predicted classes (columns, the integer codes the classifier was trained on).
pd.crosstab(
    index=test['Result'],
    columns=preds,
    rownames=['Result'],
    colnames=['Predicted'],
)

Predicted,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,1152,46
1,27,1552


### Pegando o peso de cada característica(variável) no dataset

In [43]:
# Pair each feature name with the importance the fitted forest assigned to it.
features_weight = list(zip(features, clf.feature_importances_))

### Pegando as características mais importantes calculadas pelo modelo

In [44]:
# Keep only the (feature, importance) pairs whose importance exceeds 0.01.
[pair for pair in features_weight if pair[1] > 0.01]


[('having_IP_Address', 0.012673829135057471),
 ('Prefix_Suffix', 0.043617667912837374),
 ('having_Sub_Domain', 0.05816936456220641),
 ('SSLfinal_State', 0.3175100971594438),
 ('Domain_registeration_length', 0.013005935330562084),
 ('Request_URL', 0.015142672905389102),
 ('URL_of_Anchor', 0.27782648691703316),
 ('Links_in_tags', 0.03777530416632548),
 ('SFH', 0.02076374793978199),
 ('age_of_domain', 0.016188119170397284),
 ('DNSRecord', 0.011401984776528058),
 ('web_traffic', 0.0651281159964309),
 ('Page_Rank', 0.013780341862069046),
 ('Google_Index', 0.011887303361420142),
 ('Links_pointing_to_page', 0.02107217933873587)]

In [45]:
# Display the last five rows of the DataFrame, including the 'is_train' flag.
df.tail(n=5)

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result,is_train
11050,1,-1,1,-1,1,1,1,1,-1,-1,...,-1,1,1,-1,-1,1,1,1,1,True
11051,-1,1,1,-1,-1,-1,1,-1,-1,-1,...,1,1,1,1,1,1,-1,1,-1,True
11052,1,-1,1,1,1,-1,1,-1,-1,1,...,1,1,1,1,-1,1,0,1,-1,True
11053,-1,-1,1,1,1,-1,-1,-1,1,-1,...,1,1,1,1,-1,1,1,1,-1,False
11054,-1,-1,1,1,1,-1,-1,-1,1,1,...,1,-1,1,-1,-1,-1,1,-1,-1,False
