In [2]:
import pandas as pd 
import numpy as np
import sklearn

In [3]:
import sklearn
print(sklearn.__version__)

1.0.1


In [4]:
import time
def timing(f):
    def wrap(*args, **kwargs):
        time1 = time.time()
        ret = f(*args, **kwargs)
        time2 = time.time()
        print('{:s} function took {:.3f} ms'.format(f.__name__, (time2-time1)*1000.0))

        return ret
    return wrap

# Load data and x/y

In [5]:
all_data = pd.read_csv("preparedData03-06_15.csv")

In [6]:
all_data.drop('Unnamed: 0', axis=1, inplace=True)

In [7]:
all_data = all_data.drop_duplicates()

In [8]:
all_data.shape

(4994457, 39)

In [9]:
X = all_data.drop("Label",axis = 1)
y =  all_data.Label

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.08, random_state=42)

In [11]:
print(str(sum(y_test)) + " / " + str(len(y_test)))

37391 / 399557


In [12]:
print(str(sum(y_train)) + " / " + str(len(y_train)))

430332 / 4594900


## train

In [13]:
@timing
def train_model(model,X_train, y_train):
    model.fit(X_train, y_train)
    return model

In [14]:
@timing
def run_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return y_pred   

In [15]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
def eval_model(y_test,y_pred):
    pres = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return pres,rec, roc, f1

In [15]:
from sklearn.ensemble import HistGradientBoostingClassifier
model = HistGradientBoostingClassifier(learning_rate = 0.08, max_depth = 50, max_iter = 300, max_leaf_nodes =  127, min_samples_leaf = 50)
model = train_model(model,X_train,y_train)

train_model function took 69880.585 ms


In [16]:
y_pred = run_model(model, X_test, y_test)

run_model function took 551.467 ms


In [17]:
eval_model(y_test,y_pred)

(0.9775679186059097,
 0.9661950736808322,
 0.9819530340433562,
 0.9718482252141982)

In [18]:
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier(n_estimators=150, random_state=0,n_jobs = -1)
model = train_model(model,X_train,y_train)

train_model function took 227561.100 ms


In [19]:
y_pred = run_model(model, X_test, y_test)

run_model function took 3671.271 ms


In [20]:
eval_model(y_test,y_pred)

(0.8340927846148326,
 0.6217271535931106,
 0.8044797638489014,
 0.7124207042383008)

In [16]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 400, random_state=0,n_jobs = -1)
model = train_model(model,X_train,y_train)

train_model function took 228551.462 ms


In [17]:
y_pred = run_model(model, X_test, y_test)

run_model function took 1624.897 ms


In [18]:
eval_model(y_test,y_pred)

(0.8896509545236119,
 0.5446497820331095,
 0.7688375399123649,
 0.6756577419461863)

In [24]:
from xgboost import XGBClassifier
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model = train_model(model,X_train,y_train)

train_model function took 323574.416 ms


In [25]:
y_pred = run_model(model, X_test, y_test)

run_model function took 91.341 ms


In [26]:
eval_model(y_test,y_pred)

(0.8084296106118956, 0.50528201973737, 0.7464601425316075, 0.621879165912345)

In [32]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(random_state=1, max_iter=200)
model = train_model(model,X_train.head(1000000),y_train.head(1000000))

train_model function took 1705627.769 ms


In [33]:
y_pred = run_model(model, X_test, y_test)

run_model function took 1115.883 ms


In [34]:
eval_model(y_test,y_pred)

(0.6844532606000496,
 0.36912626033002593,
 0.6757784844500646,
 0.47960247411216905)

In [29]:
from sklearn.svm import SVC
model = SVC(random_state=1, max_iter=200)
model = train_model(model,X_train.head(1000000),y_train.head(1000000))

train_model function took 47161.134 ms




In [30]:
y_pred = run_model(model, X_test, y_test)

run_model function took 15868.659 ms


In [31]:
eval_model(y_test,y_pred)

(0.017807370703004026,
 0.018453638576127945,
 0.4566846148845584,
 0.018124745531199516)

In [25]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
estimators=[("bag", BaggingClassifier(n_estimators=150, random_state=0,n_jobs = -1)), ("gard", HistGradientBoostingClassifier(learning_rate = 0.08, max_depth = 50, max_iter = 300, max_leaf_nodes =  127, min_samples_leaf = 50))]
model = VotingClassifier(estimators, voting="hard")
model = train_model(model,X_train,y_train)

train_model function took 280128.901 ms


In [26]:
y_pred = run_model(model, X_test, y_test)

run_model function took 6260.053 ms


In [27]:
eval_model(y_test,y_pred)

(0.9898758274564098,
 0.6118852130191758,
 0.8056195502315275,
 0.7562805764908106)