In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

In [4]:
# Read the raw table, then separate the label column from the feature columns.
df = pd.read_csv("dataset.csv")
# Target vector: the "is_malicious" column.
y = df["is_malicious"]
# Feature matrix: every column except the "id" column and the target itself.
X = df.drop(columns=["id", "is_malicious"])

# RandomForest

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Create classifier.
# The estimator is seeded so that the forest's own randomness is fixed;
# otherwise the cross-validated scores change from one kernel run to the
# next even though the randomized search itself uses random_state=42.
rf = RandomForestClassifier(random_state=42)

In [6]:
# Candidate hyper-parameter values explored by the random-forest searches.
param_grid_rf = dict(
    n_estimators=[100, 300, 500, 800, 1200],
    max_depth=[5, 8, 15, 25, 30],
    min_samples_split=[2, 5, 10, 15, 100],
    min_samples_leaf=[1, 2, 5, 10],
)

In [7]:
# Randomized search over param_grid_rf: 10-fold CV, F1 scoring, all cores,
# seeded so the sampled candidates are the same on every run.
rscv_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid_rf,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    random_state=42,
)

# Exhaustive grid search over the same candidates, with the same CV and scoring.
gscv_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    scoring="f1",
    cv=10,
    n_jobs=-1,
)

In [None]:
# Fit dataset
# Runs the randomized random-forest search on the full X / y. rscv_rf was
# built with cv=10 and scoring="f1", so every sampled candidate is scored
# by 10-fold cross-validated F1.
rscv_rf.fit(X, y)

In [None]:
# Exhaustive grid search on the full X / y. param_grid_rf spans
# 5 * 5 * 5 * 4 = 500 combinations and gscv_rf uses cv=10, so this cell is
# expensive to run.
gscv_rf.fit(X, y)

In [70]:
# Report the best score and the parameter set found by the randomized search.
print(f"F1 Score: {rscv_rf.best_score_!s}")
print(f"Best Params: {rscv_rf.best_params_!s}")

F1 Score: 0.9114339584105234
Best Params: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 25}


# DecisionTree

In [58]:
from sklearn.tree import DecisionTreeClassifier

# Create classifier.
# The tree is seeded so that its internal randomness is fixed and the
# cross-validated scores are reproducible across kernel restarts.
dt = DecisionTreeClassifier(random_state=42)

In [59]:
# Candidate hyper-parameter values explored by the decision-tree searches.
param_grid_dt = dict(
    criterion=["gini", "entropy"],
    max_depth=[5, 8, 15, 25, 30, 40, 50, 70, 90, 120, 150],
    min_samples_split=[2, 5, 10, 15, 100],
    min_samples_leaf=[1, 2, 5, 10],
)

In [60]:
# Randomized search over param_grid_dt: 10-fold CV, F1 scoring, all cores,
# seeded so the sampled candidates are the same on every run.
rscv_dt = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_grid_dt,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    random_state=42,
)

# Exhaustive grid search over the same candidates, with the same CV and scoring.
gscv_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_dt,
    scoring="f1",
    cv=10,
    n_jobs=-1,
)

In [None]:
# Fit dataset
# Runs the randomized decision-tree search on the full X / y. rscv_dt was
# built with cv=10 and scoring="f1", so every sampled candidate is scored
# by 10-fold cross-validated F1.
rscv_dt.fit(X, y)

In [None]:
# Exhaustive grid search on the full X / y. param_grid_dt spans
# 2 * 11 * 5 * 4 = 440 combinations and gscv_dt uses cv=10.
gscv_dt.fit(X, y)

In [69]:
# Report the best score and the parameter set found by the randomized search.
print(f"F1 Score: {rscv_dt.best_score_!s}")
print(f"Best Params: {rscv_dt.best_params_!s}")

F1 Score: 0.8794412490338199
Best Params: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 30, 'criterion': 'gini'}


# KNN

In [63]:
from sklearn.neighbors import KNeighborsClassifier

# Create classifier
# No constructor arguments are passed; n_neighbors is the only setting the
# searches below vary (see param_grid_knn).
# NOTE(review): KNN is distance-based and X is used as loaded, with no scaling
# step visible in this notebook — confirm the features are on comparable scales.
knn = KNeighborsClassifier()

In [64]:
# Candidate neighbourhood sizes: the odd values 3, 5, ..., 99.
param_grid_knn = dict(n_neighbors=range(3, 100, 2))

In [65]:
# Randomized search over param_grid_knn: 10-fold CV, F1 scoring, all cores,
# seeded so the sampled candidates are the same on every run.
rscv_knn = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_grid_knn,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    random_state=42,
)

# Exhaustive grid search over the same candidates, with the same CV and scoring.
gscv_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid_knn,
    scoring="f1",
    cv=10,
    n_jobs=-1,
)

In [None]:
# Runs the randomized KNN search on the full X / y. rscv_knn was built with
# cv=10 and scoring="f1", so every sampled candidate is scored by 10-fold
# cross-validated F1.
rscv_knn.fit(X, y)

In [None]:
# Exhaustive grid search on the full X / y. param_grid_knn holds the 49 odd
# values from 3 to 99 and gscv_knn uses cv=10.
gscv_knn.fit(X, y)

In [68]:
# Report the best score and the parameter set found by the randomized search.
print(f"F1 Score: {rscv_knn.best_score_!s}")
print(f"Best Params: {rscv_knn.best_params_!s}")

F1 Score: 0.8294048673630139
Best Params: {'n_neighbors': 29}


# MLP

In [72]:
from sklearn.neural_network import MLPClassifier

# Create classifier.
# The network is seeded so that its randomness (e.g. weight initialisation)
# is fixed and the cross-validated scores are reproducible across kernel
# restarts.
mlp = MLPClassifier(random_state=42)

In [73]:
# Candidate hyper-parameter values explored by the MLP searches.
param_grid_mlp = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100,)],
    activation=["tanh", "relu"],
    solver=["sgd", "adam"],
    alpha=[0.0001, 0.05],
    learning_rate=["constant", "adaptive"],
)

In [74]:
# Randomized search over param_grid_mlp: 10-fold CV, F1 scoring, all cores,
# seeded so the sampled candidates are the same on every run.
rscv_mlp = RandomizedSearchCV(mlp, param_grid_mlp, cv=10, scoring="f1", n_jobs=-1, random_state=42)

# Exhaustive grid search over the same candidates.
# The estimator must be `mlp`: pairing the MLP grid with the KNN estimator
# fails at fit time because KNeighborsClassifier has none of these parameters.
gscv_mlp = GridSearchCV(mlp, param_grid_mlp, cv=10, scoring="f1", n_jobs=-1)

In [None]:
# Runs the randomized MLP search on the full X / y. rscv_mlp was built with
# cv=10 and scoring="f1", so every sampled candidate is scored by 10-fold
# cross-validated F1.
rscv_mlp.fit(X, y)

In [None]:
# Fits the exhaustive grid-search object gscv_mlp on the full X / y.
# param_grid_mlp spans 3 * 2 * 2 * 2 * 2 = 48 combinations and the search
# uses cv=10.
gscv_mlp.fit(X, y)

In [76]:
# Report the best score and the parameter set found by the randomized search.
print(f"F1 Score: {rscv_mlp.best_score_!s}")
print(f"Best Params: {rscv_mlp.best_params_!s}")

F1 Score: 0.8316050441639973
Best Params: {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (50, 50, 50), 'alpha': 0.0001, 'activation': 'relu'}


# SVC

In [102]:
from sklearn.svm import SVC

# Create classifier
# No constructor arguments are passed; kernel, gamma, C and degree are
# supplied by the hyper-parameter searches below (see param_grid_svc).
# NOTE(review): X is used as loaded, with no scaling step visible in this
# notebook — confirm the features are on comparable scales for the SVM.
svc = SVC()

In [115]:
# Candidate hyper-parameter values explored by the SVC searches.
param_grid_svc = dict(
    kernel=["linear", "rbf", "poly"],
    gamma=[0.1, 1, 10, 100, 500],
    C=[0.1, 1, 10, 100, 1000],
    degree=[0, 1, 2, 3, 4, 5, 6],
)

In [116]:
# Randomized search over param_grid_svc: 10-fold CV, F1 scoring, all cores,
# seeded so the sampled candidates are the same on every run.
# (The comma between `cv=10` and `scoring` was missing, which made this cell
# a syntax error.)
rscv_svc = RandomizedSearchCV(svc, param_grid_svc, cv=10, scoring="f1", n_jobs=-1, random_state=42)

# Exhaustive grid search over the same candidates.
# The grid must be param_grid_svc: the MLP grid names parameters such as
# hidden_layer_sizes that SVC does not have, so fitting with it fails.
gscv_svc = GridSearchCV(svc, param_grid_svc, cv=10, scoring="f1", n_jobs=-1)

In [None]:
# Runs the randomized SVC search on the full X / y; the search object is
# configured for 10-fold cross-validation with F1 scoring.
rscv_svc.fit(X, y)

In [None]:
# Fits the exhaustive grid-search object gscv_svc on the full X / y
# (cv=10, scoring="f1").
gscv_svc.fit(X, y)

In [None]:
# Report the best score and the parameter set found by the randomized search.
print(f"F1 Score: {rscv_svc.best_score_!s}")
print(f"Best Params: {rscv_svc.best_params_!s}")

# Naive-Bayes

In [97]:
from sklearn.naive_bayes import GaussianNB

# Create classifier
# No constructor arguments are passed; var_smoothing is the only setting the
# searches below vary (see param_grid_nb).
nb = GaussianNB()

In [98]:
# Candidate smoothing values: 100 log-spaced points from 10**0 down to 10**-9.
param_grid_nb = dict(var_smoothing=np.logspace(0, -9, num=100))

In [99]:
# Randomized search over param_grid_nb: 10-fold CV, F1 scoring, all cores,
# seeded so the sampled candidates are the same on every run.
rscv_nb = RandomizedSearchCV(nb, param_grid_nb, cv=10, scoring="f1", n_jobs=-1, random_state=42)

# Exhaustive grid search over the same candidates.
# The grid must be param_grid_nb: the MLP grid names parameters that
# GaussianNB does not have, so fitting with it fails.
gscv_nb = GridSearchCV(nb, param_grid_nb, cv=10, scoring="f1", n_jobs=-1)

In [None]:
# Runs the randomized Naive-Bayes search on the full X / y. rscv_nb was built
# with cv=10 and scoring="f1", so every sampled candidate is scored by
# 10-fold cross-validated F1.
rscv_nb.fit(X, y)

In [None]:
# Fits the exhaustive grid-search object gscv_nb on the full X / y
# (cv=10, scoring="f1").
gscv_nb.fit(X, y)

In [101]:
# Report the best score and the parameter set found by the randomized search.
print(f"F1 Score: {rscv_nb.best_score_!s}")
print(f"Best Params: {rscv_nb.best_params_!s}")

F1 Score: 0.81696749409685
Best Params: {'var_smoothing': 5.336699231206302e-08}
