# Imports

In [1]:
import numpy as np
import pandas as pd

from pprint import pprint

from os import makedirs
from os.path import join, exists

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, scale
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, classification_report, roc_auc_score

# Utils Functions  

In [2]:
scaler = StandardScaler()

In [3]:
label_encoder = LabelEncoder()

In [4]:
def down_sample(raw_df, column_name):
    """
        Down samples the dataset to balance the densities of each target. First, this method begins by
        ingesting the dataframe and name of the target column. Next, the dataset is split into groups 
        based on the target column. The groups are then reduced to the size of the smallest group. Finally,
        the groups are concated together into a single dataframe and returned.

        Args:
            raw_df: raw dataframe 
            column_name: target column name

        Returns:
            sampled_df: downsampled dataframe
    """
    dfs = raw_df.groupby([column_name])
    sampled_df = dfs.apply(lambda x: x.sample(dfs.size().min()))
    sampled_df = sampled_df.reset_index(drop=True)
    return sampled_df

In [5]:
def clean_df(df, column_name, balance=False):
    """
        Cleans dataset and preprocesses it in preparation for ML model ingestion, training testing. This method
        starts off by filling in empty values with zeros. Next, it converts all columns' datatype to numeric as
        some columns' datatype is by default set to objects during ingestion. Next, columns that are all zero are
        removed from the dataset. Finally, the dataset is sampled to balance the targets before being returned.
    """
    balanced=False
    df = df.fillna(0.0)
    df.loc[:, (df.columns != column_name)] = df.loc[:, (df.columns != column_name)].apply(pd.to_numeric)
    # remove all zero columns
    df = df.loc[:, (df != 0).any(axis=0)]
    df.drop_duplicates(subset=None, keep='first', inplace=True)
    if balance==True:
        balanced=True
        df = down_sample(df, column_name)
    df[column_name] = df[column_name].astype('category')
    df[column_name] = df[column_name].cat.codes
    return df, balanced

In [6]:
def log_reg_grid_search(balanced, hyperparameters, x_train, y_train):
    
    new_list=[]
    if balanced == False:
        hyperparameters.update({"class_weight":["balanced"]})
    else:
        hyperparameters.pop("class_weight",None)
        
    if balanced == False:
        scoring = {
            'accuracy_score': make_scorer(balanced_accuracy_score),
            'f1_score': make_scorer(f1_score, average="weighted"),
            'precision_score': make_scorer(precision_score, average="weighted"),
            'recall_score': make_scorer(recall_score, average="weighted")
        }
    else:
        scoring = {
            'accuracy_score': make_scorer(accuracy_score),
            'f1_score': make_scorer(f1_score, average="macro"),
            'precision_score': make_scorer(precision_score, average="macro"),
            'recall_recall_score': make_scorer(recall_score, average="macro")
        }
        
    
    log_reg_final_model = GridSearchCV(
        estimator = LogisticRegression(), 
        param_grid = hyperparameters, 
        scoring=scoring,
        verbose=2, 
        n_jobs=-1,
        cv=5,
        refit="accuracy_score"
    )

    training_results = log_reg_final_model.fit(x_train, y_train)
    
    return training_results.best_params_

# Load Data

In [7]:
dataset = pd.read_csv("all.csv", low_memory = False, na_values = [-1, "nan", "Infinity"])

**Explore dataset**

In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36707 entries, 0 to 36706
Data columns (total 80 columns):
Querylength                        36707 non-null int64
domain_token_count                 36707 non-null int64
path_token_count                   36707 non-null int64
avgdomaintokenlen                  36707 non-null float64
longdomaintokenlen                 36707 non-null int64
avgpathtokenlen                    36427 non-null float64
tld                                36707 non-null int64
charcompvowels                     36707 non-null int64
charcompace                        36707 non-null int64
ldl_url                            36707 non-null int64
ldl_domain                         36707 non-null int64
ldl_path                           36707 non-null int64
ldl_filename                       36707 non-null int64
ldl_getArg                         36707 non-null int64
dld_url                            36707 non-null int64
dld_domain                         36707 non-nu

# Preprocess data

In [41]:
df, balanced = clean_df(dataset, "URL_Type_obf_Type")

In [42]:
df

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
0,0,4,5,5.500000,14,4.400000,4,8,3,0,...,1.0,0.0,0.0,0.726298,0.784493,0.894886,0.850608,0.000000,0.000000,0
1,0,4,5,5.500000,14,6.000000,4,12,4,0,...,0.0,0.0,0.0,0.688635,0.784493,0.814725,0.859793,0.000000,0.000000,0
2,0,4,5,5.500000,14,5.800000,4,12,5,0,...,0.0,0.0,0.0,0.695049,0.784493,0.814725,0.801880,0.000000,0.000000,0
3,0,4,12,5.500000,14,5.500000,4,32,16,0,...,0.0,0.0,0.0,0.640130,0.784493,0.814725,0.663210,0.000000,0.000000,0
4,0,4,6,5.500000,14,7.333334,4,18,11,0,...,0.0,0.0,0.0,0.681307,0.784493,0.814725,0.804526,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36699,6,3,7,2.666667,4,4.800000,3,11,8,1,...,3.0,2.0,1.0,0.783024,0.879588,0.783307,0.918457,0.931745,0.947443,4
36701,0,3,5,4.666666,10,3.000000,3,2,2,1,...,1.0,0.0,0.0,0.799054,0.843750,0.898227,0.814038,1.000000,0.000000,4
36702,29,4,14,5.750000,12,3.666667,4,20,24,3,...,3.0,2.0,7.0,0.690555,0.791265,0.777498,0.690227,0.656684,0.796205,4
36704,58,3,27,6.666666,16,3.375000,3,41,34,20,...,8.0,7.0,9.0,0.656807,0.801139,0.684777,0.713622,0.717187,0.705245,4


#  Modelling

### Split dataset

In [43]:
x = df.iloc[:,:-1].values
y = df.iloc[:, -1].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.90, random_state=42)

**Scale data**

In [44]:
x_train = scale(x_train)

## Max Voting approach

Will be using:
- Logistic Regression
- Decision Trees
- Support Vector Machines

### Logistic Regression

**Define grid search parameters**

In [52]:
hyperparameters = {
            "penalty":["l2","l1"],
            "solver":["lbfgs","newton-cg","liblinear"],
            "multi_class":["multinomial"],
            "class_weight":None,
            "max_iter":[500],
            "tol":[1e-4]
}

In [46]:
best_params = log_reg_grid_search(balanced, hyperparameters, x_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.2s finished


In [47]:
best_params

{'class_weight': 'balanced',
 'max_iter': 500,
 'multi_class': 'multinomial',
 'penalty': 'l2',
 'solver': 'lbfgs',
 'tol': 0.005}

In [48]:
log_reg = LogisticRegression(multi_class="multinomial",max_iter=5000,n_jobs=-1,verbose=5,tol=1e-7)

In [49]:
model = log_reg.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


In [50]:
y_pred = model.predict(x_test)

In [51]:
accuracy_score(y_pred,y_test)

0.37595844669799655

In [70]:
recall_score(y_pred,y_test,average="micro")

0.37595844669799655

In [69]:
precision_score(y_pred,y_test,average="micro")

0.37595844669799655

In [68]:
f1_score(y_pred,y_test,average="micro")

0.3759584466979966