# Coursework
```
Authors: Laurens Leusink, Martin Popper, and Tommaso Bergonzoni
Date: 13-03-2024
```

## Task 1
### Code

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

decimal_count = 3 # number of decimal places to round the evaluation metrics to
num_runs = 5 # number of runs

# Paths to the file containing the samples with the processed features
feature_of_counts = "../processed_data/feature_vectors_counts.csv"

# Importing the dataset
dataset = pd.read_csv(feature_of_counts, index_col=0)
X = dataset.iloc[:,1:9].values
y = dataset.iloc[:, 9].values

# Initialize lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
confusion_matrices = []

for run in range(num_runs):
    # Splitting the dataset into the Training set and the (unseen) Test set
    X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size=0.2)
    
    # Feature Scaling
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    # Fitting the model to the Training set
    classifier = RandomForestClassifier()
    classifier.fit(X_train, y_train)

    # Using the model to predict the Test set
    y_pred = classifier.predict(X_test)

    # Computing evaluation metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='binary', zero_division=1))
    recall_scores.append(recall_score(y_test, y_pred, average='binary', zero_division=1))
    f1_scores.append(f1_score(y_test, y_pred))
    confusion_matrices.append(confusion_matrix(y_test, y_pred))

# Calculate the mean of evaluation metrics
mean_accuracy = round(sum(accuracy_scores) / len(accuracy_scores), decimal_count)
mean_precision = round(sum(precision_scores) / len(precision_scores), decimal_count)
mean_recall = round(sum(recall_scores) / len(recall_scores), decimal_count)
mean_f1 = round(sum(f1_scores) / len(f1_scores), decimal_count)

# Printing the mean evaluation metrics and the mean confusion matrix
print(f"Evaluation metrics averaged over {num_runs} tests:")
print("Confusion Matrix:")
print(sum(confusion_matrices) // len(confusion_matrices))
print("Accuracy:", mean_accuracy)
print("Precision:", mean_precision)
print("Recall:", mean_recall)
print("F1 Score:", mean_f1)

Evaluation metrics averaged over 5 tests:
Confusion Matrix:
[[24646    64]
 [  216   875]]
Accuracy: 0.989
Precision: 0.932
Recall: 0.802
F1 Score: 0.862


### Output
```
Evaluation metrics averaged over 5 tests:
Confusion Matrix:
[[24646    64]
 [  216   875]]
Accuracy: 0.989
Precision: 0.932
Recall: 0.802
F1 Score: 0.862
```

## Task 2
### Code

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, make_scorer
from sklearn.utils import resample

decimal_count = 4 # number of decimal places to round the evaluation metrics to
cv_folds = 5 # number of folds for cross-validation

# Parameters to be tested for each classifier
classifier_params = {
  "Random Forest": [
      {"n_estimators": 100, "max_depth": 10, "random_state": 42},
      {"n_estimators": 150, "max_depth": 15, "random_state": 42},
      {"n_estimators": 200, "max_depth": 20, "random_state": 42}
  ],
  "Gradient Boosting": [
      {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3, "random_state": 42},
      {"n_estimators": 150, "learning_rate": 0.1, "max_depth": 5, "random_state": 42},
      {"n_estimators": 200, "learning_rate": 0.1, "max_depth": 7, "random_state": 42}
  ],
  "Logistic Regression": [
      {"C": 1.0, "penalty": 'l2', "solver": 'liblinear', "random_state": 42},
      {"C": 0.5, "penalty": 'l1', "solver": 'liblinear', "random_state": 42},
      {"C": 0.1, "penalty": 'l2', "solver": 'liblinear', "random_state": 42}
  ],
  "k-Nearest Neighbors": [
      {"n_neighbors": 5, "algorithm": 'ball_tree'},
      {"n_neighbors": 7, "algorithm": 'ball_tree'},
      {"n_neighbors": 10, "algorithm": 'kd_tree'}
  ],
  "Support Vector Machine": [  
      {"C": 1.0, "kernel": "linear", "random_state": 42}, 
      {"C": 0.5, "kernel": "linear", "random_state": 42},
      {"C": 0.1, "kernel": "linear", "random_state": 42}
  ]
}

classifiers = {} # Store the trained classifiers
best_classifiers = {} # Store the best classifiers

# Load the dataset
feature_of_counts = "../processed_data/feature_vectors_counts.csv"
dataset = pd.read_csv(feature_of_counts, index_col=0)
X = dataset.iloc[:, 1:9].values  # Selecting features
y = dataset.iloc[:, 9].values  # Target variable

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size=0.3, random_state=4)

# Apply Random Undersampling to the training data
X_train_majority = X_train[y_train == 0]
X_train_minority = X_train[y_train == 1]
X_train_majority_downsampled = resample(X_train_majority, 
                                        replace=False,    # sample without replacement
                                        n_samples=X_train_minority.shape[0],    # to match minority class
                                        random_state=42) # reproducible results

X_train_balanced = np.vstack((X_train_majority_downsampled, X_train_minority))
y_train_balanced = np.hstack((np.zeros(X_train_minority.shape[0]), np.ones(X_train_minority.shape[0])))

# Feature scaling
sc = StandardScaler()
X_train_balanced = sc.fit_transform(X_train_balanced)
X_test = sc.transform(X_test)

# Mapping classifier names to classifier classes
classifier_classes = {
  "Random Forest": RandomForestClassifier,
  "Gradient Boosting": GradientBoostingClassifier,
  "Logistic Regression": LogisticRegression,
  "k-Nearest Neighbors": KNeighborsClassifier,
  "Support Vector Machine": SVC
}

# Store the best classifier details
best_classifier_details = {
    'name': None,
    'params': None,
    'score': -1
}

# Function to train and evaluate classifier with cv
def train_and_evaluate_classifier_with_cv(classifier, classifier_name, params_list, X, y, cv_folds=5):
    scorer = make_scorer(roc_auc_score)
    classifiers[classifier_name] = {}  # Initialize dictionary for storing classifiers

    for params in params_list:
        print(f"\tParams: {params}")

        if classifier_name == "Support Vector Machine":
            clf = classifier(probability=True, **params)  # Enable probability estimation
        else:
            clf = classifier(**params)

        cv_results = cross_validate(clf, X, y, cv=cv_folds, scoring=scorer, return_train_score=False)
        auc_scores = cv_results['test_score']
        average_auc = round(np.mean(auc_scores), decimal_count)
        classifiers[classifier_name][str(params)] = average_auc  # Store average AUC score
        print(f"\t\tAverage AUC: {average_auc}")
        
    best_params = max(classifiers[classifier_name], key=classifiers[classifier_name].get)
    print(f"\tBest params for {classifier_name}: {best_params} with AUC: {classifiers[classifier_name][best_params]}")

# Train and evaluate each classifier with different sets of parameters
for clf_name, clf_class in classifier_classes.items():
    params_list = classifier_params.get(clf_name, [])
    print(f"-----\nEvaluating {clf_name} using {cv_folds}-fold cross-validation...")
    train_and_evaluate_classifier_with_cv(clf_class, clf_name, params_list, X_train_balanced, y_train_balanced, cv_folds=cv_folds)

# Find the best classifier
for clf_name, clf_params_scores in classifiers.items():
    for params, score in clf_params_scores.items():
        if score > best_classifier_details['score']:
            best_classifier_details['name'] = clf_name
            best_classifier_details['params'] = eval(params)
            best_classifier_details['score'] = score
            
# Store the best classifier
best_clf_name = best_classifier_details['name']
best_params = best_classifier_details['params']

clf_class = classifier_classes[best_clf_name]  # Get the correct classifier class

# Train the best classifier with the best parameters
if best_clf_name == "Support Vector Machine":
    best_clf = clf_class(probability=True, **best_params)
else:
    best_clf = clf_class(**best_params)

# Fit the best classifier to the balanced training data
best_clf.fit(X_train_balanced, y_train_balanced)
best_classifiers[best_clf_name] = best_clf

# Compute evaluation metrics for the best classifier
best_clf_name, best_clf = max(best_classifiers.items(), key=lambda x: roc_auc_score(y_test, x[1].predict_proba(X_test)[:, 1]))
y_pred = best_clf.predict(X_test)

# Compute and round evaluation metrics
confusion = confusion_matrix(y_test, y_pred)
accuracy = round(accuracy_score(y_test, y_pred), decimal_count)
precision = round(precision_score(y_test, y_pred), decimal_count)
recall = round(recall_score(y_test, y_pred), decimal_count)
f1 = round(f1_score(y_test, y_pred), decimal_count)

# Print the evaluation metrics
print(f"\n-----\nBest classifier: {best_clf_name} with params: {best_params}")
print(f"Confusion matrix:\n{confusion}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")

-----
Evaluating Random Forest using 5-fold cross-validation...
	Params: {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
		Average AUC: 0.9117
	Params: {'n_estimators': 150, 'max_depth': 15, 'random_state': 42}
		Average AUC: 0.9308
	Params: {'n_estimators': 200, 'max_depth': 20, 'random_state': 42}
		Average AUC: 0.9324
	Best params for Random Forest: {'n_estimators': 200, 'max_depth': 20, 'random_state': 42} with AUC: 0.9324
-----
Evaluating Gradient Boosting using 5-fold cross-validation...
	Params: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3, 'random_state': 42}
		Average AUC: 0.878
	Params: {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 5, 'random_state': 42}
		Average AUC: 0.9131
	Params: {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 7, 'random_state': 42}
		Average AUC: 0.9294
	Best params for Gradient Boosting: {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 7, 'random_state': 42} with AUC: 0.9294
-----
Evaluating Logistic

### Output
```
Best classifier: Random Forest with params: {'n_estimators': 200, 'max_depth': 20, 'random_state': 42}
Confusion matrix:
[[34716  2317]
 [  119  1552]]
Accuracy: 0.9371
Precision: 0.4011
Recall: 0.9288
F1 score: 0.5603
```

## Task 3
### Code

In [6]:
from sklearn.metrics import roc_curve

# Predict probabilities using the best classifier
y_scores = best_clf.predict_proba(X_test)[:, 1]

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores)

# Compute Youden's index
J = tpr - fpr

# Locate optimal threshold
optimal_idx = np.argmax(J)
optimal_threshold = thresholds[optimal_idx]
print("Optimal threshold value:", optimal_threshold)

# Use optimal threshold to classify items
y_pred_optimal = np.where(y_scores >= optimal_threshold, 1, 0)

# Compute evaluation metrics using the optimal threshold
confusion = confusion_matrix(y_test, y_pred_optimal)
accuracy = round(accuracy_score(y_test, y_pred_optimal), decimal_count)
precision = round(precision_score(y_test, y_pred_optimal), decimal_count)
recall = round(recall_score(y_test, y_pred_optimal), decimal_count)
f1 = round(f1_score(y_test, y_pred_optimal), decimal_count)

# Print the evaluation metrics
print(f"Confusion matrix:\n{confusion}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")

Optimal threshold value: 0.5433571428571429
Confusion matrix:
[[35126  1907]
 [  127  1544]]
Accuracy: 0.9474
Precision: 0.4474
Recall: 0.924
F1 score: 0.6029


### Output
```
Optimal threshold value: 0.5433571428571429
Confusion matrix:
[[35126  1907]
 [  127  1544]]
Accuracy: 0.9474
Precision: 0.4474
Recall: 0.924
F1 score: 0.6029
``` 

## Task 4
### Code
#### Create the vectors
This script starts with extracting the amount of permissions from the dataset. It then creates a vector for each app.

In [6]:
import os
import csv

def getPermissionTypes(dir_of_files):

    permissionTypes = set()  # Use set for faster lookup

    # Prepare to write to the CSV file once, reducing file operation overhead
    with open("AllPermissions.csv", "w", encoding="utf-8", newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Permission_Type"])  # Write the header once

        files = os.listdir(dir_of_files)  # Read the directory list once
        totalFiles = len(files)

        # Iterate over files
        for fileNumber, filename in enumerate(files, start=1):
            with open(os.path.join(dir_of_files, filename), 'r', encoding='utf-8') as f:
                line = f.readline().strip()

                # Check if the line is a permission line
                if line.startswith("permission::"):
                    permission = line.split("::")[1]

                    # Add the permission if it's new and write it to the file
                    if permission not in permissionTypes:
                        permissionTypes.add(permission)
                        writer.writerow([permission])
                        print(f"New permission: {permission}")

            print(f"File {fileNumber} of {totalFiles} processed")
            
# Define the directory you want to list files from
directory = "../raw_data/feature_vectors/"

# List to keep track of already present permissions
permissions_present = []

# Check if AllPermissions.csv already exists
permissions_csv_file = "../processed_data/AllPermissions.csv"
if os.path.isfile(permissions_csv_file):
    # If the file exists, read permissions from it
    with open(permissions_csv_file, "r") as csvfile:
        reader = csv.reader(csvfile)
        permissions_present = [row[0] for row in reader]
    print(f"Permissions loaded from '{permissions_csv_file}'.")
else:
    getPermissionTypes(directory)
    # Sort the list of permissions
    permissions_present.sort()
    
    print(f"The CSV file '{permissions_csv_file}' containing all possible permissions has been successfully created.")

# Define the output CSV file path
output_csv_file = "../processed_data/feature_vectors_permissions_counts.csv"

# List to keep track of malware file names
malware_files = set()

# Read malware file names from the sha256_family.csv file
with open("../raw_data/sha256_family.csv", "r") as file:
    reader = csv.reader(file)
    for row in reader:
        malware_files.add(row[0])

# List to keep track of rows to write in the CSV file
rows_to_write = []

# Loop through files in the directory again and get data for each file
total_files = len(os.listdir(directory))
for i, filename in enumerate(os.listdir(directory), start=1):
    # Print progress
    print(f"Processing file number {i} of {total_files}...")
    # Check if the element is a file (not a directory)
    if os.path.isfile(os.path.join(directory, filename)):
        # Count permissions present in the current file
        current_file_permissions = {permission: 0 for permission in permissions_present}
        with open(os.path.join(directory, filename), 'r') as file:
            # Iterate through each line of the file
            for line in file:
                # Check if the line starts with "permission::"
                if line.startswith("permission::"):
                    # Extract permission from the line
                    permission = line.strip().split("::")[1]
                    # If the permission is present in the file, set the value to 1
                    if permission in current_file_permissions:
                        current_file_permissions[permission] = 1
        # Check if the file is among malware files
        malware = "true" if filename in malware_files else "false"
        # Add data of the current file to the list of rows to write in the CSV file
        rows_to_write.append([i, filename] + list(current_file_permissions.values()) + [malware])

# Write data to the CSV file
with open(output_csv_file, "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write the CSV file header
    header = [None, "sha256"] + [f"s{j}" for j in range(1, len(permissions_present) + 1)] + ["malware"]
    writer.writerow(header)
    # Write data for each row in the CSV file
    for row in rows_to_write:
        writer.writerow(row)

print(f"The CSV file '{output_csv_file}' has been successfully created.")

File 1 of 129013 processed
File 2 of 129013 processed
File 3 of 129013 processed
File 4 of 129013 processed
File 5 of 129013 processed
File 6 of 129013 processed
File 7 of 129013 processed
File 8 of 129013 processed
File 9 of 129013 processed
File 10 of 129013 processed
File 11 of 129013 processed
File 12 of 129013 processed
File 13 of 129013 processed
File 14 of 129013 processed
File 15 of 129013 processed
File 16 of 129013 processed
File 17 of 129013 processed
File 18 of 129013 processed
File 19 of 129013 processed
New permission: android.permission.VIBRATE
File 20 of 129013 processed
File 21 of 129013 processed
New permission: android.permission.CALL_PHONE
File 22 of 129013 processed
File 23 of 129013 processed
File 24 of 129013 processed
File 25 of 129013 processed
File 26 of 129013 processed
File 27 of 129013 processed
File 28 of 129013 processed
File 29 of 129013 processed
File 30 of 129013 processed
File 31 of 129013 processed
File 32 of 129013 processed
File 33 of 129013 proces

### Applying the classifier

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.utils import resample

decimal_count = 3 # number of decimal places to round the evaluation metrics to
featureCount = 3261 # number of features in the dataset

feature_permissions_counts = "../processed_data/feature_vectors_permissions_counts.csv" # path to the file containing the samples with the processed features

newDataset = pd.read_csv(feature_permissions_counts, index_col=0)

X = newDataset.iloc[:, 1:featureCount].values
y = newDataset.iloc[:, featureCount+1].values

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size=0.3, random_state=4)

# Apply Random Undersampling to the training data
X_train_majority = X_train[y_train == 0]
X_train_minority = X_train[y_train == 1]
X_train_majority_downsampled = resample(X_train_majority, 
                                        replace=False,    # sample without replacement
                                        n_samples=X_train_minority.shape[0],    # to match minority class
                                        random_state=42) # reproducible results

X_train_balanced = np.vstack((X_train_majority_downsampled, X_train_minority))
y_train_balanced = np.hstack((np.zeros(X_train_minority.shape[0]), np.ones(X_train_minority.shape[0])))

clf = RandomForestClassifier(n_estimators=150, max_depth=15, random_state=42)
clf.fit(X_train_balanced, y_train_balanced)

# Predict probabilities using the best classifier
y_scores = clf.predict_proba(X_test)[:, 1]

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores)

# Compute Youden's index
J = tpr - fpr

# Locate optimal threshold
optimal_idx = np.argmax(J)
optimal_threshold = thresholds[optimal_idx]
print("Optimal threshold value:", optimal_threshold)

# Use optimal threshold to classify items
y_pred_optimal = np.where(y_scores >= optimal_threshold, 1, 0)

# Compute evaluation metrics using the optimal threshold
confusion = confusion_matrix(y_test, y_pred_optimal)
accuracy = round(accuracy_score(y_test, y_pred_optimal), decimal_count)
precision = round(precision_score(y_test, y_pred_optimal), decimal_count)
recall = round(recall_score(y_test, y_pred_optimal), decimal_count)
f1 = round(f1_score(y_test, y_pred_optimal), decimal_count)

# Print the evaluation metrics
print(f"Confusion matrix:\n{confusion}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")

Optimal threshold value: 0.391252011566174
Confusion matrix:
[[26618  3100]
 [  104  1245]]
Accuracy: 0.897
Precision: 0.287
Recall: 0.923
F1 score: 0.437


### Output
```
Optimal threshold value: 0.391252011566174
Confusion matrix:
[[26618  3100]
 [  104  1245]]
Accuracy: 0.897
Precision: 0.287
Recall: 0.923
F1 score: 0.437
```