In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold

In [2]:
# Load the training CSV into a pandas DataFrame.
df = pd.read_csv("Phising_Training_Dataset.csv")

# Drop the "key" column; it is not used as a training feature.
df = df.drop(columns=["key"])

# Target vector: the "Result" column with -1 recoded to 0, as a 1-D array.
# The submission step converts 0 back to -1 when writing the output file.
y = np.array(df["Result"].replace(-1, 0))

# Feature matrix: every remaining column except "Result".
# Feature value -1 is recoded to 2 (original rationale: some algorithms,
# e.g. LightGBM, are sensitive to -1 values).
x = np.array(df.drop(columns=["Result"]).replace(-1, 2))

# 5 stratified, shuffled folds for out-of-fold cross validation.
# The fixed random_state keeps the split reproducible across runs.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# One fitted model and one validation accuracy per fold.
# `models` is reused later to build the ensemble prediction.
models = []
scores = []
for train_index, test_index in kf.split(x, y):
    x_train, y_train = x[train_index], y[train_index]
    x_test, y_test = x[test_index], y[test_index]

    # Each model is trained on 4 of the 5 folds (80% of the data) and scored
    # on the held-out fold (the remaining 20%).
    model = ExtraTreesClassifier(random_state=48)
    model.fit(x_train, y_train)
    scores.append(model.score(x_test, y_test))
    models.append(model)

# Mean validation accuracy across the folds; this is used to choose the algorithm.
# Dividing by len(scores) keeps the mean correct if n_splits is ever changed.
print(sum(scores) / len(scores))

0.9690675600223339


In [3]:
# Read the submission (test) dataset.
df = pd.read_csv("Phising_Testing_Dataset.csv")

# Keep the keys: unlike in training they are needed later to build the output CSV.
x_idx = list(df["key"])

# Every column other than "key" is a feature.
# Recode -1 as 2, exactly as was done for the training features.
x_sub = np.array(df.drop(columns=["key"]).replace(-1, 2))

# Fail fast if no fold models exist. Averaging over zero models would produce
# NaN probabilities, and every row would then silently be labelled -1.
if not models:
    raise RuntimeError("`models` is empty; run the cross-validation training cell first")

# Average the class probabilities predicted by the fold models.
# Stacking the per-model outputs avoids hard-coding the number of classes.
y_sub = np.mean([model.predict_proba(x_sub) for model in models], axis=0)

# Final label: 1 when the second probability column is strictly greater than
# the first, otherwise -1 (ties go to -1). The models were trained on 0/1
# labels, so column 1 is presumably class 1 — predict_proba orders its columns
# by `classes_`.
y_pred = np.where(y_sub[:, 1] > y_sub[:, 0], 1, -1).tolist()

# Build the output frame (key, predicted Result) and write the submission CSV.
df_sub = pd.DataFrame({"key": x_idx, "Result": y_pred}, columns=["key", "Result"])
df_sub.to_csv("out.csv", index=False)