In [30]:
# Import necessary modules
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    roc_auc_score

In [18]:
# Import session dataframe
file_location = "/Users/jmeyers/Desktop/networkML/paper/data/dummy_data/dummy_session.csv.gz"
session_df = pd.read_csv(file_location)

In [42]:
# Extract Y
y = session_df.filename # identify filename column
y = y.str.split('-')
y = y.str[0] # device type is always first field

# !! DANGER !!
# CODE MUST BE REMOVED FOR REAL ANALYSIS
# Making first thousand rows a printer ensures
# that all cross validation runs have more than
# two target values
y[0:500] = "printer"
y[501:1000] = "phone"
# !! DANGER !!

# Convert y into categorical variable
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

# Extract X
# Drop filename and host_key columns
X = session_df.drop(columns=['filename', 'host_key'])

# Split into train and test (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=20200313)

# Normalize data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## FOR MONDAY
## To do: make this whole thing a function where the inputs
## are the model type and the parameters

# Do 5-fold cross validation with training data
# Combine with hyper-parameter optimization to do a simple
# but principled search
parameters = {'C':[1, 2]}
mod = LogisticRegression(max_iter=1000)
clf = GridSearchCV(mod, parameters, cv=3, n_jobs=-1, scoring='f1_weighted')
clf.fit(X_train_scaled, y_train)

# Try models from model-list
# - Random forest
# - Support vector machine (training SVM might be too slow)
# - Neural network

# Get best performing weight model for each model class
# Predict on test set
# Get accuracy, precision, recall, F1, AUC, time for 1 inference
best_mod = clf.best_estimator_
y_pred = best_mod.predict(X_test_scaled)
y_pred_prob = best_mod.predict_proba(X_test_scaled)

## Calculate metrics
acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec  = recall_score(y_test, y_pred, average='weighted')
f1   = f1_score(y_test, y_pred, average='weighted')
#auc  = roc_auc_score(y_test, y_pred_prob, average='weighted')
print([acc, prec, rec, f1])

[0.9529001308329699, 0.9237765904044453, 0.9529001308329699, 0.9316660540641136]


  _warn_prf(average, modifier, msg_start, len(result))
