## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
plt.rcParams['figure.figsize'] = (12,6)

## Loading dataset

In [None]:
# Read the dataset CSV; the path is relative, so it is resolved against the
# notebook's current working directory.
data = pd.read_csv("Phishing_Legitimate_full.csv")

## Converting dataset

In [None]:
# Downcast every 64-bit numeric column to its 32-bit counterpart in one pass.
float_cols = data.select_dtypes('float64').columns
int_cols = data.select_dtypes('int64').columns

downcast_map = {col: 'float32' for col in float_cols}
downcast_map.update({col: 'int32' for col in int_cols})
data = data.astype(downcast_map)

In [None]:
# Give the target column a short, lowercase name.
data = data.rename(columns={'CLASS_LABEL': 'labels'})

## View the data

In [None]:
# Peek at five randomly chosen rows.
data.sample(n=5)

## Summary Statistics

In [None]:
# Per-column descriptive statistics using pandas' default settings.
data.describe()

## Balance Check

In [None]:
# Count rows per class and draw the counts as bars to check class balance.
label_counts = data['labels'].value_counts()
label_counts.plot.bar()

## Spearman Correlation

In [None]:
def corr_heatmap(data, idx_s, idx_e, method='spearman'):
  """Plot an annotated correlation heatmap for a positional slice of columns.

  The columns at positions ``idx_s`` (inclusive) to ``idx_e`` (exclusive) are
  correlated with each other and with the ``labels`` column.

  Args:
    data: DataFrame that contains a ``labels`` column.
    idx_s: Start position of the column slice (inclusive).
    idx_e: End position of the column slice (exclusive).
    method: Correlation method forwarded to ``DataFrame.corr``. Defaults to
      ``'spearman'`` to match this section of the notebook; pass
      ``'pearson'`` for the pandas default.

  Returns:
    None. The heatmap is rendered with ``plt.show()``.
  """
  # Work on a copy: editing a bare iloc slice triggers SettingWithCopy
  # warnings and leaves it ambiguous whether the parent frame is touched.
  subset = data.iloc[:, idx_s:idx_e].copy()
  # Exclude the 'id' column from the plot whenever it falls inside the slice.
  subset = subset.drop(columns='id', errors='ignore')
  # Always include the target so each heatmap shows feature-vs-label values.
  subset['labels'] = data['labels']
  sns.heatmap(subset.corr(method=method), annot=True, fmt='.2f')
  plt.show()

## Heatmap of first 50 columns

In [None]:
# Columns 1-10 (positions 0-9) against the label
corr_heatmap(data, idx_s=0, idx_e=10)

In [None]:
# Columns 11-20 (positions 10-19) against the label
corr_heatmap(data, idx_s=10, idx_e=20)

In [None]:
# Columns 21-30 (positions 20-29) against the label
corr_heatmap(data, idx_s=20, idx_e=30)

In [None]:
# Columns 31-40 (positions 30-39) against the label
corr_heatmap(data, idx_s=30, idx_e=40)

In [None]:

# Columns 41-50 (positions 40-49) against the label
corr_heatmap(data, idx_s=40, idx_e=50)

## Mutual Info Classifier

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Target vector is the label column; the feature matrix is every other
# column except the 'id' column.
y = data['labels']
X = data.drop(columns=['id', 'labels'])

In [None]:
# Boolean mask marking integer-typed columns as discrete features.
# `X.dtypes == int` only matches the platform's default integer width (int64
# on most systems), so it flags nothing once the integer columns have been
# downcast to int32 earlier in this notebook. Test the dtype family instead.
discrete_features = np.array(
    [pd.api.types.is_integer_dtype(col_dtype) for col_dtype in X.dtypes],
    dtype=bool,
)

In [None]:
# Score every feature against the label, then rank from most to least
# informative so the result can be compared with the Spearman heatmaps.
# The estimator adds random noise to continuous features, so the seed is
# fixed to keep the ranking identical across runs.
mi_scores = mutual_info_classif(
    X, y, discrete_features=discrete_features, random_state=42
)
mi_scores = pd.Series(mi_scores, name='MI Scores', index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores

In [None]:
def plot_mi_scores(scores):
  """Draw a bar chart of mutual-information scores.

  Args:
    scores: pandas Series of scores indexed by feature name. Bars are drawn
      in the Series' existing order.

  Returns:
    None. The chart is rendered with ``plt.show()``.
  """
  plt.figure(figsize=(15, 10))
  # Plot the argument, not the notebook-level `mi_scores`, so the helper
  # shows whatever Series the caller passes in.
  scores.plot.bar()
  plt.title('MI Scores')
  plt.show()

In [None]:
# plot_mi_scores opens its own figure, so creating another one here would
# only leave an empty canvas in the output.
plot_mi_scores(mi_scores)

## Prediction
We will first use logistic regression as a baseline, then try to beat that baseline using a random forest classifier.

Our evaluation metrics will be accuracy, precision, recall and f1 score

Below we import all the required modules

In [None]:
# Colab-oriented setup for RAPIDS (cuML), followed by the modelling imports.
# NOTE(review): this looks like several setup cells merged into one - confirm
# and split before relying on Restart & Run All.

# Report the GPU visible to this runtime.
!nvidia-smi
# Fetch the RAPIDS Colab helper scripts, then run the environment-check and
# gcc-update scripts they ship (purposes inferred from the script names).
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/env-check.py
!bash rapidsai-csp-utils/colab/update_gcc.sh
import os
# os._exit terminates the kernel process immediately, so nothing below runs in
# the same execution. Presumably a deliberate runtime restart - TODO confirm.
os._exit(00)
import condacolab
condacolab.install()
import condacolab
condacolab.check()
!python rapidsai-csp-utils/colab/install_rapids.py stable
import os
# Point the NVVM / libdevice lookups at the CUDA install under /usr/local/cuda
# and set the conda prefix to /usr/local.
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
os.environ['CONDA_PREFIX'] = '/usr/local'
# Modelling imports: CPU logistic regression, cuML random forest, the
# train/test splitter, and the four evaluation metrics.
from sklearn.linear_model import LogisticRegression
from cuml.ensemble import RandomForestClassifier as cuRfc
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Train logistic models
This function repeatedly trains a logistic regression model in order to find the number of features that gives the best-fitting model without much hyperparameter tuning. The idea is data-centric training: the function takes the number of top N features to use for training and returns all of the evaluation metrics for comparison.

In [None]:
def train_logistic(data, top_n, random_state=42):
    """Fit a logistic-regression baseline on the ``top_n`` highest-MI features.

    Feature names are taken from the notebook-level ``mi_scores`` Series,
    which must already exist when this function is called.

    Args:
        data: DataFrame containing the feature columns and a ``labels`` column.
        top_n: Number of top-ranked features (by ``mi_scores``) to train on.
        random_state: Seed for the train/test split. A fixed seed means every
            ``top_n`` is scored on the same held-out rows, so differences
            between runs reflect the feature set and not the split. Pass
            ``None`` for a fresh random split on every call.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)`` measured on the 20% test
        split.
    """
    top_n_features = mi_scores.sort_values(ascending=False).head(top_n).index.tolist()
    X = data[top_n_features]
    y = data['labels']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=random_state
    )

    # A high iteration cap so the solver can converge on unscaled features.
    lr = LogisticRegression(max_iter=10000)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    return precision, recall, f1, accuracy

The loop starts at 20: we train with the top 20 features first and go up to all 50 features, to find the optimal number of features for this problem.

In [None]:
# Sweep the feature budget from 20 to 50 and keep one metrics row per run.
arr = []
for top_n in range(20, 51):
    metrics = train_logistic(data, top_n)
    print("Performance for Logistic Model with Top {} features is precision : {}, recall : {}, f1 score : {}, accuracy : {}".format(top_n, *metrics))
    arr.append([top_n, *metrics])

In [None]:
# Tabulate the sweep: one row per feature count, one column per metric.
metric_columns = ['num_of_features', 'precision', 'recall', 'f1_score', 'accuracy']
df = pd.DataFrame(data=arr, columns=metric_columns)
df

## Visualize Logistic Regression Performance

In [None]:
# Overlay one line per metric against the number of features used.
metric_legend = [
    ('precision', 'Precision Score'),
    ('recall', 'Recall Score'),
    ('f1_score', 'F1 Score'),
    ('accuracy', 'Accuracy Score'),
]
ax = None
for metric, legend_label in metric_legend:
    ax = sns.lineplot(x='num_of_features', y=metric, data=df, label=legend_label)
ax