# 0. Imports

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import lightgbm

from os import cpu_count

# custom modules
from code.util import *
from code.preprocessor import *
from code.gofaster import *

# parallelizer
gf = GoFaster(n_jobs=cpu_count()-1, n_partitions=cpu_count()*3)

---

In [2]:
data = load("data/urls_preprocessed.pkl")

# 1. Train/Test Split

In [3]:
x, y = data.url, data.target
x_train0, x_test0, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# 2. Vectorization

In [4]:
vect = TfidfVectorizer(
    tokenizer=tokenize,
    max_df=0.9999,
    min_df=0.0001,
    max_features=3000
)

In [5]:
vect.fit(x_train0)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9999,
                max_features=3000, min_df=0.0001, ngram_range=(1, 1), norm='l2',
                preprocessor=None, smooth_idf=True, stop_words=None,
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function tokenize at 0x12efa0560>, use_idf=True,
                vocabulary=None)

In [6]:
x_train = vect.transform(x_train0).toarray()
x_test = vect.transform(x_test0).toarray()

# 3. Models

## 3.1 Multinomial Naive Bayes

In [7]:
multnb = MultinomialNB()

In [8]:
multnb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
y_pred = multnb.predict(x_test)

In [10]:
print("MODEL: MultinomialNB")
print(classification_report(y_test, y_pred))

MODEL: MultinomialNB
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     17271
           1       0.96      0.85      0.90      5067

    accuracy                           0.96     22338
   macro avg       0.96      0.92      0.94     22338
weighted avg       0.96      0.96      0.96     22338



In [12]:
scores = cross_val_score(
    estimator=multnb,
    X=x_train,
    y=y_train,
    scoring="accuracy",
    cv=10,
    n_jobs=cpu_count()-1
)

print_scores(scores, label="MultinomialNB")

Accuracy: 0.96 (+/- 0.00) [MultinomialNB]


## 3.2 Decision Tree

In [26]:
tree = DecisionTreeClassifier(
    criterion="gini",
    max_depth=10,
    random_state=42
)

In [27]:
tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [37]:
y_pred = tree.predict(x_test)

In [38]:
print("MODEL: DecisionTreeClassifier")
print(classification_report(y_test, y_pred))

MODEL: DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.90      0.99      0.95     17271
           1       0.97      0.63      0.77      5067

    accuracy                           0.91     22338
   macro avg       0.94      0.81      0.86     22338
weighted avg       0.92      0.91      0.91     22338



In [39]:
scores = cross_val_score(
    estimator=tree,
    X=x_train,
    y=y_train,
    scoring="accuracy",
    cv=10,
    n_jobs=cpu_count()-1
)

print_scores(scores, label="DecisionTreeClassifier")

Accuracy: 0.91 (+/- 0.00) [DecisionTreeClassifier]


In [23]:
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(1, 11)
}

gs = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params_grid,
    cv=5,
    n_jobs=cpu_count()-1
)

In [24]:
# gs.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=11,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 11)},
             pre_dispatch=

In [25]:
# gs.best_params_

{'criterion': 'gini', 'max_depth': 10}

## 3.3 Random Forest

In [32]:
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42
)

In [33]:
forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [34]:
y_pred = forest.predict(x_test)

In [35]:
print("MODEL: RandomForestClassifier")
print(classification_report(y_test, y_pred))

MODEL: RandomForestClassifier
              precision    recall  f1-score   support

           0       0.83      1.00      0.91     17271
           1       1.00      0.29      0.45      5067

    accuracy                           0.84     22338
   macro avg       0.91      0.65      0.68     22338
weighted avg       0.87      0.84      0.80     22338



In [36]:
scores = cross_val_score(
    estimator=tree,
    X=x_train,
    y=y_train,
    scoring="accuracy",
    cv=10,
    n_jobs=cpu_count()-1
)

print_scores(scores, label="RandomForestClassifier")

Accuracy: 0.91 (+/- 0.00) [RandomForestClassifier]
