In [4]:
from scipy.io.arff import loadarff
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [5]:
dataset = loadarff(open('../datasets/training_dataset.arff', 'r'))

In [6]:
df = pd.DataFrame(data=dataset[0])

In [7]:
def convert_columns_to_int(df):
    for column in df.columns:
        df[column] = df[column].apply(int)
    return df

In [8]:
df = convert_columns_to_int(df)

In [9]:
df.to_csv('../datasets/training_dataset.csv')

In [10]:
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

In [14]:
train, test = df[df['is_train']==True], df[df['is_train']==False]

In [15]:
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 8329
Number of observations in the test data: 2726


In [21]:
features = df.columns[:30]

In [24]:
y = pd.factorize(train['Result'])[0]

In [26]:
clf = RandomForestClassifier(n_jobs=2, random_state=0)

In [27]:
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [32]:
preds = clf.predict(test[features])



In [33]:
clf.predict_proba(test[features])

array([[ 0.8875,  0.1125],
       [ 1.    ,  0.    ],
       [ 0.    ,  1.    ],
       ..., 
       [ 0.    ,  1.    ],
       [ 1.    ,  0.    ],
       [ 1.    ,  0.    ]])

In [34]:
pd.crosstab(test['Result'], preds, rownames=['Result'], colnames=['Predicted'])

Predicted,0,1
Result,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,1178,53
1,24,1471


In [35]:
list(zip(train[features], clf.feature_importances_))

[('having_IP_Address', 0.012262954561116748),
 ('URL_Length', 0.010888269044474565),
 ('Shortining_Service', 0.0075180702238634173),
 ('having_At_Symbol', 0.0056220149610107795),
 ('double_slash_redirecting', 0.0028961607878335359),
 ('Prefix_Suffix', 0.04734443651119534),
 ('having_Sub_Domain', 0.063707722544143794),
 ('SSLfinal_State', 0.29965294872251186),
 ('Domain_registeration_length', 0.013873630900239121),
 ('Favicon', 0.0041701156789959205),
 ('port', 0.0029167876090423583),
 ('HTTPS_token', 0.0058147530665955587),
 ('Request_URL', 0.021264942805631369),
 ('URL_of_Anchor', 0.26450650539357956),
 ('Links_in_tags', 0.043952239553589345),
 ('SFH', 0.018711593358801386),
 ('Submitting_to_email', 0.0058041451605082518),
 ('Abnormal_URL', 0.0042000236946454859),
 ('Redirect', 0.005498655245836548),
 ('on_mouseover', 0.0033856780386199757),
 ('RightClick', 0.001107090549134456),
 ('popUpWidnow', 0.0039831112983051368),
 ('Iframe', 0.0023733147480981759),
 ('age_of_domain', 0.01760420

In [59]:
newPd = pd.read_csv('../datasets/training_dataset.csv')

In [60]:
columns = newPd.columns

In [61]:
columns = list(filter(lambda x: 'Unnamed' not in x, columns))

In [62]:
bPd = newPd[columns]

In [64]:
bPd

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
5,-1,0,-1,1,-1,-1,1,1,-1,1,...,1,1,1,1,1,-1,1,-1,-1,1
6,1,0,-1,1,1,-1,-1,-1,1,1,...,1,1,1,-1,-1,-1,1,0,-1,-1
7,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,0,-1,1,0,1,-1
8,1,0,-1,1,1,-1,1,1,-1,1,...,1,1,1,-1,1,1,1,0,1,1
9,1,1,-1,1,1,-1,-1,1,-1,1,...,1,1,1,-1,0,-1,1,0,1,-1


In [75]:
test[0:1][features]

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,1,-1,-1,0,-1,1,1,1


In [77]:
result = clf.predict(test[0:1][features])

In [79]:
print(result[0])

0


In [80]:
features

Index(['having_IP_Address', 'URL_Length', 'Shortining_Service',
       'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix',
       'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length',
       'Favicon', 'port', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor',
       'Links_in_tags', 'SFH', 'Submitting_to_email', 'Abnormal_URL',
       'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe',
       'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank',
       'Google_Index', 'Links_pointing_to_page', 'Statistical_report'],
      dtype='object')