In [None]:
import os
import json
import datetime
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.feature_extraction import DictVectorizer
from tesseract import evaluation, temporal 
from sklearn.model_selection import train_test_split

# Path prefix of the dataset files; load_dataset() appends the per-file suffixes to it.
# Set the MLSEC_DATASET_PATH environment variable to point at your own copy of the
# data; the original author's location is kept as the fallback.
FILEPATH = os.environ.get(
    'MLSEC_DATASET_PATH',
    '/Users/fabio/work/mlsec-malware-class/labs/extended-features/extended-features',
)

# Make the TeX binaries reachable from this process
# (presumably needed for LaTeX-rendered plots - TODO confirm).
# The membership check keeps the cell idempotent: re-running it does not append
# the same directory to PATH a second time.
TEX_BIN_DIR = '/Library/TeX/texbin'
if TEX_BIN_DIR not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + TEX_BIN_DIR

## Loading features
def load_dataset(dataset_path,
                 features_suffix='-X-updated-reduced-10k.json',
                 labels_suffix='-y-updated.json',
                 meta_suffix='-meta-updated.json'):
    """
    Load the features, labels and metadata of the Tesseract dataset.

    Each of the three JSON files is located by appending a suffix to
    ``dataset_path``. The suffixes are parameters so that a different variant
    of the dataset can be loaded without editing this function; the defaults
    are the file names this notebook was written for.

    Parameters
    ----------
    dataset_path : str or path-like
        Common path prefix of the dataset files (it is converted to text
        before the suffix is appended).
    features_suffix : str, optional
        Suffix of the JSON file holding the per-sample feature dictionaries.
    labels_suffix : str, optional
        Suffix of the JSON file holding the labels.
    meta_suffix : str, optional
        Suffix of the JSON file holding the per-sample metadata.

    Returns
    -------
    X
        Feature matrix returned by ``DictVectorizer.fit_transform`` on the
        loaded features, cast to ``float32``.
    y : numpy.ndarray
        The loaded labels converted with ``np.asarray``.
    meta
        The metadata exactly as parsed from JSON.
    feature_names
        Feature names reported by the fitted vectorizer
        (``get_feature_names_out``).

    Raises
    ------
    OSError
        If one of the files cannot be opened.
    json.JSONDecodeError
        If one of the files does not contain valid JSON.
    """
    print(f'Loading dataset from {dataset_path}')

    # The encoding is given explicitly so that parsing does not depend on the
    # locale's default encoding.
    with open(f'{dataset_path}{features_suffix}', 'r', encoding='utf-8') as f:
        X = json.load(f)

    print('Loading labels...')
    with open(f'{dataset_path}{labels_suffix}', 'r', encoding='utf-8') as f:
        y = json.load(f)

    print('Loading timestamps...')
    with open(f'{dataset_path}{meta_suffix}', 'r', encoding='utf-8') as f:
        meta = json.load(f)

    # Vectorize the feature dictionaries and keep the resulting feature names.
    vec = DictVectorizer()
    X = vec.fit_transform(X).astype("float32")
    y = np.asarray(y)
    feature_names = vec.get_feature_names_out()

    return X, y, meta, feature_names

# Load the feature matrix, labels, metadata and feature names from the files
# found under the FILEPATH prefix.
X, y, meta, feature_names = load_dataset(FILEPATH)

## Baseline kept commented out: random train/test split with a LinearSVC (uncomment to run)

# # Following DREBIN feature-space
# clf = LinearSVC(C=1)

# # Random split of train-test
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.33, random_state=42)

# # Training the classifier
# clf.fit(X_train, y_train)

# y_pred = clf.predict(X_test)

# Exercise

Check the impacts of 'markets' (sampling bias) and 'labels' (label inaccuracy) on malware detection.

Experiment to quantify impact: 
* Dataset $D_1$: 10,000 goodware from Play, 1,000 malware from Chinese markets
* Dataset $D_2$: 10,000 goodware from Play, 1,000 malware from Play
* Labels: an app is labeled as malware if it is detected by at least 10 VirusTotal (VT) antivirus scanners
* Classifier: DREBIN's linear SVM (`LinearSVC`) with C=1

Compare Recall, Precision, and F1-Score on $D_1$ and $D_2$.

Does anything change if you vary the 'label threshold', i.e., the minimum number of VT detections (10 above) required to label an app as malware?

In [None]:
## Hint to filter apps from different markets
# Gather the 'markets' entry of every metadata record, in dataset order.
markets = [record['markets'] for record in meta]

# Array view of the same values, used for the element-wise comparisons below.
markets_arr = np.array(markets)

# True where the 'markets' value is exactly the Google Play identifier.
mask_google = markets_arr == 'play.google.com'
# True where the 'markets' value does not contain 'play' at all.
mask_else = np.array(['play' not in market for market in markets_arr])

n_google = np.sum(mask_google)
n_else = np.sum(mask_else)
print('Apps in Google Play Store: {:,}'.format(n_google))
print('Apps in Other Markets: {:,}'.format(n_else))

In [None]:
# Hint to 'count' apps from different markets
# Counter maps each distinct value found in `markets` to the number of times it occurs.
from collections import Counter
Counter(markets)