In [6]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

In [4]:
dataRaw = pd.read_csv('https://raw.githubusercontent.com/jwaldroop/phishing-url-project/main/dataset_full.csv')

In [5]:
# Noticed a discrepancy in the data, some values are recorded as -1 even though it makes not practical sense, i.e. you can't have a negative quantity of a character
# This changes all -1 to 0

def remove_negatives(df):
    df[df == -1] = 0

In [7]:
Features = dataRaw.iloc[:,:-1] # target is in last column
X = Features
y = dataRaw['phishing']

# creating pipelined model
pipe = Pipeline([('no_negatives' , remove_negatives(X)) , ('RF' , RandomForestClassifier(n_jobs = -1, verbose = False, n_estimators = 200 , random_state = 426, max_depth = 15))])

In [8]:
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X , y, random_state = 0)

In [9]:
# fitting the pipeline
pipe.fit(X_train, y_train)

Pipeline(steps=[('no_negatives', None),
                ('RF',
                 RandomForestClassifier(max_depth=15, n_estimators=200,
                                        n_jobs=-1, random_state=426,
                                        verbose=False))])

In [11]:
# Pickling the fitted pipeline
with open('model.pkl' , 'wb') as f:
    # pickle the data dictionary using the highest protocol availabe
    pickle.dump(pipe, f, pickle.HIGHEST_PROTOCOL)

In [21]:
# Grabbing a couple rows to test the pickled model
newdata = dataRaw.iloc[55:57,:]
newdata.values.tolist() # save this list as its own varaible in a seperate .py file. 

[[3.0,
  2.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  45.0,
  2.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  6.0,
  16.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  29.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  10.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  0.0,
  0.4175930000000001,
  0.0,
  8100.0,
  -1.0,
  -1.0,
  1.0,
  4.0,
  0.0,
  1792.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0],
 [2.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  22.0,
  2.0,
  1.0,
  0.0,
  0.0,
  0.0,