In [9]:
# Import necessary modules
import os

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Import session dataframe
# The default is the original local copy of the dummy data. Set the
# NETWORKML_SESSION_CSV environment variable to read the file from a
# different location without editing the notebook.
file_location = os.environ.get(
    "NETWORKML_SESSION_CSV",
    "/Users/jmeyers/Desktop/networkML/paper/data/dummy_data/dummy_session.csv.gz",
)
session_df = pd.read_csv(file_location)

In [None]:
# Extract Y
# The device type is always the first '-'-separated field of the filename
y = session_df.filename.str.split('-').str[0]

# !! DANGER !!
# CODE MUST BE REMOVED FOR REAL ANALYSIS
# Overwriting the first thousand rows with two synthetic labels
# ("printer" for rows 0-499, "phone" for rows 500-999) is meant to
# make sure the cross validation runs see more than one target value.
# .iloc keeps the assignment explicitly positional, and the two
# half-open slices are contiguous so no row in 0-999 is skipped.
y.iloc[0:500] = "printer"
y.iloc[500:1000] = "phone"
# !! DANGER !!

# Convert y into categorical variable (integer class codes);
# `le` is kept so the codes can be mapped back to device names
le = LabelEncoder()
y = le.fit_transform(y)

# Extract X
# Drop filename and host_key columns; every remaining column is a feature
X = session_df.drop(columns=['filename', 'host_key'])

# Split into train and test (80/20), seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=20200313)

# Normalize data
# The scaler is fitted on the training split only and then applied to
# both splits, so no test-set statistics leak into training
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
def assessModel(model, parameters, cv=3):
    """Assess model performance given model type and parameters

    Runs a cross-validated grid search on the training data, refits the
    best configuration, and scores it on the held-out test data. The data
    is read from the notebook-level variables X_train_scaled, y_train,
    X_test_scaled and y_test, which must be defined before calling.

    INPUT:
    --model: type of machine learning model (sklearn models)
    --parameters: Dictionary of parameters and values to do grid search
    --cv: cross-validation setting forwarded to GridSearchCV (default 3)
          TODO: SET CV=5 WHEN DOING ACTUAL EXPERIMENT

    OUTPUT:
    --prints performance metrics as a list
      [accuracy, precision, recall, F1]; precision, recall and F1 are
      weighted averages over the classes. Nothing is returned.
    """

    # Cross validation on the training data combined with
    # hyper-parameter optimization gives a simple but principled
    # search; candidates are ranked by weighted F1
    clf = GridSearchCV(model, parameters, cv=cv, n_jobs=-1, scoring='f1_weighted')
    clf.fit(X_train_scaled, y_train)

    # Get best performing model for this model class
    # Predict on test set. Only hard predictions are needed for the
    # metrics below; predict_proba is deliberately not called so that
    # estimators without it still work.
    best_mod = clf.best_estimator_
    y_pred = best_mod.predict(X_test_scaled)

    ## Calculate metrics
    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec  = recall_score(y_test, y_pred, average='weighted')
    f1   = f1_score(y_test, y_pred, average='weighted')

    print([acc, prec, rec, f1])


In [4]:
# Simplest model first: logistic regression,
# grid-searching over two candidate values of C
model = LogisticRegression(max_iter=1000)
parameters = {
    'C': [1, 2],
}
assessModel(model, parameters)

[0.9529001308329699, 0.9237765904044453, 0.9529001308329699, 0.9316660540641136]


  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# One step up in complexity: a single decision tree (seeded),
# grid-searching over the maximum tree depth
model = tree.DecisionTreeClassifier(random_state=1865)
parameters = {
    'max_depth': [4, 5],
}
assessModel(model, parameters)

[0.9993458351504579, 0.9993517463861991, 0.9993458351504579, 0.9993477630599079]


In [7]:
# Next step up in complexity: a random forest (seeded),
# grid-searching over the maximum tree depth
model = RandomForestClassifier(max_depth=2, random_state=55)
parameters = {
    'max_depth': [2, 3],
}
assessModel(model, parameters)

[0.9550806803314436, 0.9571041616023209, 0.9550806803314436, 0.9354503259417334]


In [8]:
# Next model: k-nearest neighbours,
# grid-searching over the number of neighbours
model = KNeighborsClassifier()
parameters = {
    'n_neighbors': [3, 4],
}
assessModel(model, parameters)

[0.9529001308329699, 0.9340380562554633, 0.9529001308329699, 0.9381854118703289]


In [None]:
# Maybe consider XGBoost too

In [11]:
# Test with neural network from sklearn
# random_state is set so the scores are repeatable from run to run,
# matching the seeded tree and forest models in the other cells
parameters = {'hidden_layer_sizes':[(64,32), (32,16)]}
model = MLPClassifier(random_state=20200313)
assessModel(model, parameters)

[0.9520279110335804, 0.9312333647150078, 0.9520279110335804, 0.931981581284562]


In [None]:
# Try models from model-list
# - KNN
# - Decision tree
# - Random forest
# - Neural network (Do I need to use Keras/TensorFlow? Maybe)
