In [98]:
import numpy as np
import seaborn as sns
import os
import pandas as pd
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
import json
from collections import OrderedDict

# Preparation

In [2]:
# Root folder of the cyber-pdf-dec2022 training round on the scratch volume.
_DATASET_ROOT = '/scratch/data/TrojAI/cyber-pdf-dec2022-train'

# Folder with one sub-directory per model id (each holding a 'model.pt');
# the trailing slash is part of the original value and is kept.
MODEL_FILEDIR = _DATASET_ROOT + '/models/'
# CSV that is queried by 'model_name' to obtain the 'poisoned' label.
METADATA_FILEPATH = _DATASET_ROOT + '/METADATA.csv'
# Number of models visited by the extraction loops (ids 0 .. MODEL_NUM - 1).
MODEL_NUM = 120

# Disabled settings; MODEL_ARCH is still referenced by the unexecuted
# JSON-schema cells near the end of the notebook.
# MODEL_ARCH = ['classification:' + arch for arch in ['resnet50', 'vit_base_patch32_224', 'mobilenet_v2']]
# OUTPUT_FILEDIR = '/scratch/jialin/image-classification-sep2022/projects/weight_analysis/extracted_source/'


def num_to_model_id(num):
    """Return the model directory name for a numeric model index.

    The index is zero-padded to eight digits and prefixed with ``id-``,
    e.g. ``7 -> 'id-00000007'``.

    Args:
        num: Integer model index in ``[0, 99999999]``.

    Returns:
        str: The identifier ``'id-XXXXXXXX'``.

    Raises:
        ValueError: If ``num`` is negative or needs more than eight digits.
            The previous string-slicing implementation silently produced a
            wrong (and possibly colliding) id for such values.
    """
    if not 0 <= num < 100000000:
        raise ValueError(f'model number must be in [0, 99999999], got {num!r}')
    # int() so NumPy integer scalars format the same way as Python ints.
    return f'id-{int(num):08d}'

# Load Metadata

In [3]:
# The CSV carries its saved row index as an unnamed first column; reading that
# column as the index keeps a junk "Unnamed: 0" column out of the frame.
# Lookups by 'model_name' / 'poisoned' further down are unaffected.
METADATA = pd.read_csv(METADATA_FILEPATH, index_col=0)
METADATA.head()

Unnamed: 0,model_name,data_split,ground_truth,poisoned,poisoned_level,arch_level,nn_layers_level,nn_activation_function_level,svm_kernel_level,rf_trees_level,...,prepoison-unwatermarked-benign-support,prepoison-unwatermarked-malicious-precision,prepoison-unwatermarked-malicious-recall,prepoison-unwatermarked-malicious-f1-score,prepoison-unwatermarked-malicious-support,prepoison-watermarked-accuracy,prepoison-watermarked-malicious-precision,prepoison-watermarked-malicious-recall,prepoison-watermarked-malicious-f1-score,prepoison-watermarked-malicious-support
0,id-00000000,train,0,False,0,0,0,0,,,...,,,,,,,,,,
1,id-00000001,train,1,True,1,0,5,0,,,...,2250.0,0.996365,0.998179,0.997271,2746.0,0.0,0.0,0.0,0.0,1215.0
2,id-00000002,train,1,True,1,0,4,0,,,...,2250.0,0.996,0.997451,0.996725,2746.0,0.378422,1.0,0.378422,0.549065,1242.0
3,id-00000003,train,1,True,1,0,3,0,,,...,2250.0,0.99745,0.997087,0.997268,2746.0,1.0,1.0,1.0,1.0,1265.0
4,id-00000004,train,1,True,1,0,2,0,,,...,2250.0,0.992764,0.999272,0.996007,2746.0,0.021721,1.0,0.021721,0.042518,1197.0


# Feature Extraction

## Aggregated Weight

In [107]:
def extract_weight(model_repr : dict, layers=('fc1.weight', 'fc1.bias'), axis=0):
    """Build a flat feature vector of summary statistics for selected layers.

    For every name in ``layers`` the corresponding array is looked up in
    ``model_repr`` and the following values are appended, in this order:

    * arrays with more than one dimension (trailing dimensions are flattened
      to a matrix first, which is a no-op for 2-D input): max, mean,
      mean - median, median and sum, each reduced along ``axis``, followed by
      one scalar: squared Frobenius norm divided by squared spectral norm;
    * 1-D (or 0-d) arrays: the same six quantities as scalars, computed over
      the whole vector (the norm ratio is taken on the column-vector view).

    Args:
        model_repr: Mapping from layer name to a NumPy array.
        layers: Iterable of layer names to use, in output order.
        axis: Axis passed to the NumPy reductions for matrix-shaped layers.

    Returns:
        np.ndarray: 1-D array with the concatenated features.

    Raises:
        KeyError: If a requested layer name is missing from ``model_repr``.
    """
    params = []
    for layer in layers:
        # atleast_1d lets 0-d entries go through the vector branch
        # instead of failing on shape[0].
        param = np.atleast_1d(model_repr[layer])
        # Matrix view: identity for 2-D, flattens trailing dims for N-D,
        # column vector for 1-D.
        matrix = param.reshape(param.shape[0], -1)
        if param.ndim > 1:
            mean = np.mean(matrix, axis=axis)
            median = np.median(matrix, axis=axis)
            params += np.amax(matrix, axis=axis).tolist()
            params += mean.tolist()
            params += (mean - median).tolist()
            params += median.tolist()
            params += np.sum(matrix, axis=axis).tolist()
        else:
            mean = param.mean()
            median = np.median(param)
            params.append(param.max().tolist())
            params.append(mean.tolist())
            params.append((mean - median).tolist())
            params.append(median.tolist())
            params.append(param.sum().tolist())
        params.append(_norm_ratio(matrix))
    return np.asarray(params)


def _norm_ratio(matrix):
    """Return squared Frobenius norm over squared spectral norm of a 2-D array."""
    return float(np.linalg.norm(matrix, ord='fro') ** 2 / np.linalg.norm(matrix, ord=2) ** 2)

In [108]:
# Try the extractor on one model before looping over all of them.
model_num = 0
model_id = num_to_model_id(model_num)
model_filepath = os.path.join(MODEL_FILEDIR, model_id, 'model.pt')
# NOTE: torch.load unpickles arbitrary objects; only point it at trusted files.
# map_location='cpu' because the tensors are converted with .numpy() below,
# which only works for CPU tensors (a CUDA-saved checkpoint would fail otherwise).
model = torch.load(model_filepath, map_location='cpu')
model_repr = OrderedDict((layer, tensor.numpy()) for layer, tensor in model.state_dict().items())

p = extract_weight(model_repr)

In [109]:
# Length of the feature vector produced by extract_weight for the model loaded above.
p.shape

(682,)

In [114]:
# Build the feature matrix (one row per model, in model-number order) and the
# matching list of 'poisoned' labels from METADATA.
weight_dict_X, weight_dict_y = [], []
for model_num in tqdm(range(MODEL_NUM)):
    model_id = num_to_model_id(model_num)
    model_filepath = os.path.join(MODEL_FILEDIR, model_id, 'model.pt')
    # NOTE: torch.load unpickles arbitrary objects; only use on trusted files.
    # map_location='cpu' because .numpy() below only works for CPU tensors.
    model = torch.load(model_filepath, map_location='cpu')
    model_repr = OrderedDict((layer, tensor.numpy()) for layer, tensor in model.state_dict().items())

    p = extract_weight(model_repr)
    # .item() raises unless exactly one metadata row matches this model id.
    poisoned = METADATA[METADATA['model_name'] == model_id]['poisoned'].item()

    weight_dict_X.append(p.tolist())
    weight_dict_y.append(poisoned)
weight_dict_X = np.asarray(weight_dict_X)

100%|██████████| 120/120 [00:01<00:00, 92.11it/s]


In [99]:
# Assemble X / y from the extracted weight features.
# The original cell indexed a `weight_dict` mapping that no cell defines; the
# features live in weight_dict_X, whose rows were appended in model-number
# order, so they are addressed by position here.
X, y = [], []
for model_num in range(MODEL_NUM):
    model_id = num_to_model_id(model_num)

    X.append(weight_dict_X[model_num])

    poisoned = METADATA[METADATA['model_name'] == model_id]['poisoned'].item()
    y.append(poisoned)
X = np.asarray(X)

## Eigenvalues (Squared Singular Values)

In [21]:
def extract_eigen(model, first_layer_only=True):
    """Collect squared singular values of a model's matrix-shaped parameters.

    Each parameter with more than one dimension is reshaped to
    ``(shape[0], -1)`` and the squares of its singular values are appended to
    the output. Parameters with a single dimension (e.g. biases) are skipped.

    The original implementation had its ``return`` inside the parameter loop,
    so it always stopped after the first parameter and returned ``None`` for
    a model without parameters. The first-layer-only result is what the rest
    of the notebook stacks into a rectangular array, so it remains the
    default; it is now an explicit option and the edge cases are handled.

    Args:
        model: Object exposing ``parameters()`` that yields torch tensors.
        first_layer_only: If True (default), stop after the first
            matrix-shaped parameter. If False, include every matrix-shaped
            parameter; the output length then depends on the architecture.

    Returns:
        tuple[np.ndarray, np.ndarray]: The concatenated squared singular
        values, and the number of values contributed by each visited
        parameter. Both are empty when no matrix-shaped parameter exists.
    """
    params = []
    num_param_per_layer = []
    min_shape = 1
    for param in model.parameters():
        if len(param.shape) <= min_shape:
            continue
        # detach(): only the values are needed, not an autograd graph.
        reshaped_param = param.detach().reshape(param.shape[0], -1)
        singular_values = torch.linalg.svd(reshaped_param, full_matrices=False).S
        ssv = torch.square(singular_values).tolist()
        params += ssv
        num_param_per_layer.append(len(ssv))
        if first_layer_only:
            break
    return np.asarray(params), np.asarray(num_param_per_layer)

In [23]:
# Run extract_eigen over every model and stack the per-model vectors.
# eigen_shape_dict is initialised but never filled; it is kept so the set of
# names defined by this cell stays the same.
eigen_shape_dict = []
eigen_rows = []
for model_num in tqdm(range(MODEL_NUM)):
    model_id = num_to_model_id(model_num)
    model_filepath = os.path.join(MODEL_FILEDIR, model_id, 'model.pt')
    model = torch.load(model_filepath)
    model.eval()

    e, es = extract_eigen(model)
    eigen_rows.append(e)
eigen_dict = np.asarray(eigen_rows)

100%|██████████| 120/120 [00:00<00:00, 370.09it/s]


In [24]:
# (number of models, number of squared singular values kept per model)
eigen_dict.shape

(120, 100)

In [105]:
# Assemble X / y from the weight features plus the eigen features.
# The original cell indexed an undefined `weight_dict` mapping and indexed the
# NumPy array `eigen_dict` with a string model id; both feature sets are
# stored row-per-model in model-number order, so they are addressed by position.
X, y = [], []
for model_num in range(MODEL_NUM):
    model_id = num_to_model_id(model_num)

    x_weight = np.asarray(weight_dict_X[model_num]).tolist()
    eigen_row = eigen_dict[model_num]
    # NOTE(review): with the recorded eigen_dict shape of (120, 100) the
    # [-2:] slice repeats the last two of the first 100 values; presumably it
    # was meant to pick the final layer's values - confirm before relying on it.
    x_eigen = eigen_row[:100].tolist() + eigen_row[-2:].tolist()
    X.append(x_weight + x_eigen)

    poisoned = METADATA[METADATA['model_name'] == model_id]['poisoned'].item()
    y.append(poisoned)
X = np.asarray(X)

# Visualization

In [10]:
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

def bootstrap_performance(X, y, clf, n=10, test_size=.2, eps=.01):
    """Estimate cross-entropy and accuracy over repeated random train/test splits.

    For ``i`` in ``range(n)`` the data is split with ``random_state=i``, the
    classifier's ``random_state`` is set to ``i``, and it is fitted on the
    training part and scored on the test part. Splits in which the training
    or the test labels contain a single class are skipped.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.
        clf: Estimator supporting ``set_params(random_state=...)``, ``fit``,
            ``predict_proba`` and ``score``. It is modified in place (its
            ``random_state`` is overwritten and it is refitted on each split).
        n: Number of splits to attempt.
        test_size: Test fraction forwarded to ``train_test_split``.
        eps: Predicted probabilities are clipped to ``[eps, 1 - eps]`` before
            the log loss is computed.

    Returns:
        tuple[list, list]: Per-split cross-entropy values and accuracies.
        Both lists are shorter than ``n`` (possibly empty) when splits were
        skipped.
    """
    all_cross_entropy, all_accuracy = [], []
    for i in range(n):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=i)

        if np.unique(y_train).shape[0] == 1 or np.unique(y_test).shape[0] == 1:
            continue

        clf.set_params(random_state=i)
        clf.fit(X_train, y_train)

        # log_loss(..., eps=...) was deprecated in scikit-learn 1.3 and removed
        # in 1.5. Clip and renormalise explicitly, which is what the eps
        # argument used to do internally.
        proba = np.clip(clf.predict_proba(X_test), eps, 1 - eps)
        proba /= proba.sum(axis=1, keepdims=True)
        # labels= ties the probability columns to the classifier's class order.
        all_cross_entropy.append(log_loss(y_test, proba, labels=clf.classes_))
        all_accuracy.append(clf.score(X_test, y_test))
    return all_cross_entropy, all_accuracy

In [115]:
# Shapes of the two feature sets before they are combined.
weight_dict_X.shape, eigen_dict.shape

((120, 682), (120, 100))

In [31]:
# Append the eigen features as extra columns to the weight features.
# NOTE(review): this rebinds weight_dict_X to itself plus eigen_dict, so the
# cell is not idempotent - every re-run appends the eigen columns again.
# The recorded output below, (120, 1277), does not match the (120, 682) and
# (120, 100) shapes shown in the previous cell, so it stems from a different
# kernel state; confirm which feature set the later cells are meant to use.
weight_dict_X = np.concatenate([weight_dict_X, eigen_dict], axis=-1)
weight_dict_X.shape

(120, 1277)

In [116]:
# Score the tuned gradient-boosting configuration on the extracted features
# using 50 repeated 80/20 splits; prints mean cross-entropy, then mean accuracy.
gbc_settings = dict(
    learning_rate=.018,
    n_estimators=560,
    min_samples_leaf=44,
    max_depth=3,
    max_features=656,
    min_samples_split=56,
)
clf = GradientBoostingClassifier(**gbc_settings)
# Alternative baseline tried earlier:
# clf = GradientBoostingClassifier(learning_rate=.01, n_estimators=500)
cen, acc = bootstrap_performance(weight_dict_X, weight_dict_y, clf, n=50, test_size=.2)
print(np.mean(cen), np.mean(acc))

0.5980388544491879 0.685


first layer weight only: cen - 0.6722151649906911; acc - 0.7125; (agg on axis=0, only weight)
first layer weight + bias: cen - 0.6710706124944856; acc - 0.7167; (agg on axis=0, weight + bias)
first layer weight only: cen - 0.8338462676300622; acc - 0.5942; (agg on axis=-1, only weight)
first + last layer weight: cen - 0.7767968836885063; acc - 0.675; (agg on axis=0, only weight)
first + last layer weight: cen - 0.8614197942179671; acc - 0.5858; (agg on axis=-1, only weight)
first layer weight with eigen: cen - 0.6719570250977263; acc - 0.7025;
first + last layer weight with eigen: cen - 0.8070599903699808 ; acc - 0.6683;

# Tune/Train Models

In [49]:
# Directory that the cells below write to: grid-search results, the fitted
# detector and the saved feature / label arrays.
OUTPUT_FILEDIR = '/scratch/jialin/cyber-pdf-dec2022/projects/weight_analysis/extracted_source'

In [73]:
# Grid search over learning rate and number of estimators, starting from the
# current best configuration. refit=False: only the CV scores are collected.
clf = GradientBoostingClassifier(
    learning_rate=.018,
    n_estimators=560,
    min_samples_leaf=44,
    max_depth=3,
    max_features=656,
    min_samples_split=56,
)

# Grids used in other tuning rounds:
# param={'min_samples_split': range(40, 101, 2), 'max_features': range(630, 677, 2)}
# param = {'learning_rate':[.01, .005, .015, .03, .0075], 'n_estimators':[650, 1300, 450, 225, 900]}
param = {
    'learning_rate': np.arange(.001, .0251, .001),
    'n_estimators': range(400, 1201, 40),
}
gsearch = GridSearchCV(
    estimator=clf,
    param_grid=param,
    scoring=['neg_log_loss', 'accuracy'],
    n_jobs=10,
    cv=5,
    refit=False,
)
gsearch.fit(weight_dict_X, weight_dict_y);

In [74]:
# Rank the grid-search runs (log-loss rank first, accuracy rank as tie-breaker)
# and store the full table next to the other outputs.
gsearch_result = pd.DataFrame(gsearch.cv_results_)
gsearch_result = gsearch_result.sort_values(by=['rank_test_neg_log_loss', 'rank_test_accuracy'])
gsearch_result_path = os.path.join(OUTPUT_FILEDIR, 'gsearch_result.csv')
gsearch_result.to_csv(gsearch_result_path)

In [93]:
# Shape of the feature matrix the final detector is fitted on.
weight_dict_X.shape

(120, 682)

In [117]:
import joblib

# Fit the chosen configuration on all models and persist it as the detector.
clf = GradientBoostingClassifier(
    learning_rate=.018,
    n_estimators=560,
    min_samples_leaf=44,
    max_depth=3,
    max_features=656,
    min_samples_split=56,
)
clf.fit(weight_dict_X, weight_dict_y)
detector_path = os.path.join(OUTPUT_FILEDIR, 'detector.joblib')
joblib.dump(clf, detector_path)

['/scratch/jialin/cyber-pdf-dec2022/projects/weight_analysis/extracted_source/detector.joblib']

In [118]:
# Persist the feature matrix and the labels used to fit the detector.
for array_filename, array_values in (('X.npy', weight_dict_X), ('y.npy', weight_dict_y)):
    np.save(os.path.join(OUTPUT_FILEDIR, array_filename), array_values)

In [120]:
# Load stored features / labels from the output directory.
# NOTE(review): this reads 'fe_X.npy' / 'fe_y.npy', whereas the preceding cell
# writes 'X.npy' / 'y.npy' - confirm these files are the intended inputs.
X = np.load(os.path.join(OUTPUT_FILEDIR, 'fe_X.npy'))
y = np.load(os.path.join(OUTPUT_FILEDIR, 'fe_y.npy'))
X.shape, y.shape

((120, 682), (120,))

In [121]:
# Score the same gradient-boosting configuration on the loaded X / y using
# 50 repeated 80/20 splits; prints mean cross-entropy, then mean accuracy.
gbc_settings = dict(
    learning_rate=.018,
    n_estimators=560,
    min_samples_leaf=44,
    max_depth=3,
    max_features=656,
    min_samples_split=56,
)
clf = GradientBoostingClassifier(**gbc_settings)
# Alternative baseline tried earlier:
# clf = GradientBoostingClassifier(learning_rate=.01, n_estimators=500)
cen, acc = bootstrap_performance(X, y, clf, n=50, test_size=.2)
print(np.mean(cen), np.mean(acc))

0.6261368925796619 0.6691666666666667


### Generate JSON schema (without automatic_training param)

In [None]:
# Collect the tunable hyper-parameter values of each per-architecture classifier
# into one flat dict keyed '<arch>_<param>'.
# NOTE(review): this cell has no execution count and relies on MODEL_ARCH (only
# present as a commented-out line in the configuration cell) and clf_dict (not
# defined in any visible cell), so it fails on a fresh kernel - confirm whether
# it still applies to this notebook.
TUNABLE_PARAMS = ['learning_rate', 'n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features']
param_dict = {}
for ma in MODEL_ARCH:
    # ma[15:] drops the first 15 characters of the architecture string
    # (presumably the 'classification:' prefix - TODO confirm).
    p_dict = {f"{ma[15:]}_{k}": v for k, v in clf_dict[ma].get_params().items() if k in TUNABLE_PARAMS}
    # param_dict is unpacked last, so already-collected keys win on collisions.
    param_dict = {**p_dict, **param_dict}
param_dict

In [None]:
# Build a JSON description (bounds and suggested bounds) for every tunable
# hyper-parameter of every architecture and write it to 'json_vals.json'.
# NOTE(review): relies on TUNABLE_PARAMS from the previous cell and on
# MODEL_ARCH, clf_dict and EXTRACTED_FILEDIR, none of which is defined in a
# visible executed cell - confirm before running.

# Each list is ordered like TUNABLE_PARAMS (they are zipped together below).
MIN_VAL, MAX_VAL = [0.001, 1, 1, 2, 1, 1], [1, 3000, 10, 1000, 1000, 1100]
SMIN_VAL, SMAX_VAL = [0.005, 100, 2, 10, 2, 20], [0.05, 1200, 5, 50, 25, 220]
keys = ['minimum', 'maximum', 'suggested_minimum', 'suggested_maximum']
# DESC_VAL[bound_name][param_name] -> bound value
DESC_VAL = {}
for key, val in zip(keys, [MIN_VAL, MAX_VAL, SMIN_VAL, SMAX_VAL]):
    val_dict = {k:v for k, v in zip(TUNABLE_PARAMS, val)}
    DESC_VAL[key] = val_dict
desc_dict = {}
for ma in MODEL_ARCH:
    for k, v in clf_dict[ma].get_params().items():
        baseline_desc = {}
        # Only tunable parameters get an entry; all others are skipped.
        if k in TUNABLE_PARAMS:
            baseline_desc['description'] = f'Tunable parameter {k} in sklearn Gradient Boosting Classifier for model architecture {ma}'
            # learning_rate is the only non-integer parameter in TUNABLE_PARAMS.
            baseline_desc['type'] = 'number' if k == 'learning_rate' else 'integer'
            for key in keys:
                baseline_desc[key] = DESC_VAL[key][k]
            # Key is '<ma without its first 15 characters>_<param>'.
            desc_dict[f'{ma[15:]}_{k}'] = baseline_desc

with open(os.path.join(EXTRACTED_FILEDIR, 'json_vals.json'), 'w') as outfile:
    json.dump(desc_dict, outfile)

In [6]:
import joblib

# Write a serialized None to disk; the next cell loads it back.
none_joblib_path = '/scratch/jialin/image-classification-sep2022/projects/weight_analysis/src/none.joblib'
joblib.dump(None, none_joblib_path)

['/scratch/jialin/image-classification-sep2022/projects/weight_analysis/src/none.joblib']

In [7]:
# Load the stored None back and show what happens when it is used as a
# classifier. The AttributeError is the point of this cell, so it is caught
# and displayed instead of leaving the notebook ending in an uncaught exception.
model = joblib.load('/scratch/jialin/image-classification-sep2022/projects/weight_analysis/src/none.joblib')
try:
    model.predict_proba([[1, 2, 3]])
except AttributeError as err:
    print(f'{type(err).__name__}: {err}')

AttributeError: 'NoneType' object has no attribute 'predict_proba'