In [1]:
# Make the parent directory importable so that the `libs` package resolves.
# The membership guard keeps this cell idempotent: re-running it no longer
# stacks duplicate '../' entries on sys.path.
import sys
if '../' not in sys.path:
    sys.path.insert(0, '../')
# Import the helper module that provides the dataset loaders.
# There are four loaders: one for X, one for y, one for the metadata associated with the samples, and one for the time index.
import libs.utils

# Important: set this to the folder where you placed the dataset (see README.md in lab 02).
dataset_path = '../datasets/drebin/'

In [2]:
import json
import pickle
import datetime
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

In [3]:
# Single seed shared by the data split and the classifier so that a
# Restart & Run All reproduces the same score.
SEED = 42

# reduced=True loads only the top 10k features; reduced=False loads the entire feature set.
X, feature_names = libs.utils.load_X(dataset_path, reduced=True)
y = libs.utils.load_y(dataset_path)
metadata = libs.utils.load_metadata(dataset_path)

# Random hold-out split: 80% training, 20% testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Machine learning model: Linear SVM (liblinear implementation).
# With dual=True the solver uses a pseudo-random number generator, so
# random_state is fixed here; previously only the split was seeded.
model = LinearSVC(C=1, dual=True, max_iter=10000, random_state=SEED)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the F1 score with f1_score's default arguments
# (presumably y holds binary labels -- confirm against load_y).
f1 = f1_score(y_test, y_pred)

# Print the F1 score
print("F1 score:", f1)

Loading data (feature representation X, and feature names)...
Loading labels...
Loading metadata...
F1 score: 0.8608780487804878


# Exercise: Time-aware Evaluation

Use the Tesseract library to visualize the time-aware performance of the LinearSVC classifier. 

Tip: create a virtual environment with Python 3.10 and install the Tesseract library from https://github.com/s2labres/tesseract-ml-release

To register the virtual environment as a kernel that the notebook can use, run the following with the environment activated:
```bash
python -m ipykernel install --user --name <env-name>
```
where `<env-name>` is the name of your virtual environment.
