<a href="https://colab.research.google.com/github/karu-rress/SDAGroupProj/blob/main/0aj_TreeForestLogistic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install shap

Collecting shap
  Downloading shap-0.43.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (532 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/532.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.1/532.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.9/532.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.43.0 slicer-0.0.7


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier  # Import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
import shap
from sklearn.pipeline import Pipeline

In [4]:
df_origin = pd.read_csv('/content/Android_Malware.csv', low_memory=False)

In [5]:
df = df_origin.copy() # Just in case ;)

In [6]:
df.columns = df.columns.str.strip()

In [7]:
# Define the pairs of columns for differences
packet_pairs = [('Total Fwd Packets', 'Total Backward Packets'),
                ('Total Length of Fwd Packets', 'Total Length of Bwd Packets'),
                ('Fwd IAT Total', 'Bwd IAT Total'),
                ('Fwd PSH Flags', 'Bwd PSH Flags'),
                ('Fwd URG Flags', 'Bwd URG Flags'),
                ('Fwd Header Length', 'Bwd Header Length'),
                ('Fwd Packets/s', 'Bwd Packets/s'),
                ('Avg Fwd Segment Size', 'Avg Bwd Segment Size'),
                ('Fwd Avg Bytes/Bulk', 'Bwd Avg Bytes/Bulk'),
                ('Fwd Avg Packets/Bulk', 'Bwd Avg Packets/Bulk'),
                ('Fwd Avg Bulk Rate', 'Bwd Avg Bulk Rate'),
                ('Subflow Fwd Packets', 'Subflow Bwd Packets'),
                ('Subflow Fwd Bytes', 'Subflow Bwd Bytes'),
                ('Init_Win_bytes_forward', 'Init_Win_bytes_backward')]

# Convert columns to numeric before calculating differences
df = df.apply(pd.to_numeric, errors='coerce')

# Calculate differences for each pair and create separate columns
for pair in packet_pairs:
    col_name_diff = f'{pair[0]} - {pair[1]}'
    df[col_name_diff] = df[pair[0]] - df[pair[1]]

In [8]:
# List of all columns to drop
columns_to_drop = [
    'Unnamed: 0', 'Flow ID', 'Source IP', 'Source Port', 'Destination IP', 'Destination Port',
    'Protocol', 'Timestamp', 'Total Fwd Packets', 'Total Backward Packets',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
    'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Std',
    'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Std',
    'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
    'Bwd IAT Total', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
    'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
    'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
    'Min Packet Length', 'Max Packet Length', 'Packet Length Std', 'Packet Length Variance',
    'ECE Flag Count', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size',
    'Fwd Header Length.1', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate',
    'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate',
    'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes',
    'Init_Win_bytes_forward', 'Init_Win_bytes_backward',
    'Active Std', 'Active Max', 'Active Min', 'Idle Std', 'Idle Max', 'Idle Min'
]

# Drop the specified columns
df = df.drop(columns=columns_to_drop, errors='ignore')

In [9]:
df['Label'] = df_origin['Label']

In [10]:
columns_with_null = df.columns[df.isnull().any()]
df_null_counts = df[columns_with_null].isnull().sum()
print(df_null_counts[df_null_counts > 0])

Bwd Packet Length Mean                              1
Flow Bytes/s                                        1
Flow Packets/s                                      1
Flow IAT Mean                                       1
Fwd IAT Mean                                        1
Bwd IAT Mean                                        1
Packet Length Mean                                  1
FIN Flag Count                                      1
SYN Flag Count                                      1
RST Flag Count                                      1
PSH Flag Count                                      1
ACK Flag Count                                      1
URG Flag Count                                      1
CWE Flag Count                                      1
Down/Up Ratio                                       1
Average Packet Size                                 1
act_data_pkt_fwd                                    1
min_seg_size_forward                                1
Active Mean                 

In [11]:
df_null_counts.shape
#22 columns with null values. Drop them because it is very few

(33,)

In [12]:
df = df.dropna()

In [13]:
features = df.drop(columns='Label')
target = df['Label']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=0)


## Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the random forest classifier
forest = RandomForestClassifier(random_state=0)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(forest, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_forest = grid_search.best_estimator_

# Evaluate the best random forest model on the test set
print("Best Random Forest Model:")
print("Training set accuracy: {:.3f}".format(best_forest.score(X_train, y_train)))
print("Test set accuracy: {:.3f}".format(best_forest.score(X_test, y_test)))
print("Best parameters: ", grid_search.best_params_)

In [15]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=20, random_state=0)
forest.fit(X_train, y_train)

print("Training set accuracy: {:.3f}".format(forest.score(X_train, y_train)))
print("Test set accuracy: {:.3f}".format(forest.score(X_test, y_test)))

Training set accuracy: 0.980
Test set accuracy: 0.697


In [19]:
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

# Specify the correct positive label for precision, recall, and f1 scores
positive_label = 'Android_Adware'

# Convert y_test to binary form for multi-class classification
y_test_binary = label_binarize(y_test, classes=['Android_Adware', 'Android_Scareware'])

# Define custom scorers for precision, recall, and f1
precision_scorer = make_scorer(precision_score, average='macro')
recall_scorer = make_scorer(recall_score, average='macro')
f1_scorer = make_scorer(f1_score, average='macro')
roc_auc_scorer = make_scorer(roc_auc_score, multi_class='ovr')

# Use the custom scorers in cross_val_score with OneVsRestClassifier for multi-class
forest_multi = OneVsRestClassifier(RandomForestClassifier())

print("Default Cross-validation score:", cross_val_score(forest_multi, X_test, y_test, scoring="accuracy", cv=10))
print("Accuracy score:", cross_val_score(forest_multi, X_test, y_test, scoring="accuracy", cv=10))
print("Roc_Auc score:", cross_val_score(forest_multi, X_test, y_test_binary, scoring=roc_auc_scorer, cv=10))
print("Precision score:", cross_val_score(forest_multi, X_test, y_test, scoring=precision_scorer, cv=10))
print("Recall score:", cross_val_score(forest_multi, X_test, y_test, scoring=recall_scorer, cv=10))
print("f1 score:", cross_val_score(forest_multi, X_test, y_test, scoring=f1_scorer, cv=10))

Default Cross-validation score: [0.67580323 0.67616627 0.66799782 0.66763478 0.6712652  0.66684822
 0.66975309 0.67338417 0.67937545 0.67429194]
Accuracy score: [0.67652932 0.67870757 0.66817934 0.66600109 0.67072064 0.66848221
 0.67138707 0.67356572 0.67864924 0.67374728]
Roc_Auc score: [0.60911727 0.6113965  0.59386683 0.59130737 0.59521736 0.59631945
 0.60248304 0.6045334  0.60932487 0.59752152]


Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class

Precision score: [0.657991   0.66033791 0.64928041 0.64806451 0.64913289 0.64853907
 0.65533011 0.65893771 0.66367531 0.65302716]


Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class

Recall score: [0.67562171 0.6772554  0.66618261 0.665275   0.66817934 0.67029775
 0.67120552 0.67592593 0.67465505 0.67156863]


Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class

f1 score: [0.66378801 0.65929083 0.65230559 0.64881526 0.66062196 0.65732533
 0.65921571 0.66768846 0.66812426 0.65859743]


Note that pos_label (set to 'Android_Adware') is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.


## Decision Tree

In [22]:
from sklearn.metrics import accuracy_score

# Decision tree with pruning
tree = DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_split=10, min_samples_leaf=5)
tree.fit(X_train, y_train)

# Training set accuracy
train_accuracy = accuracy_score(y_train, tree.predict(X_train))
print("Training set accuracy: {:.3f}".format(train_accuracy))

# Test set accuracy
test_accuracy = accuracy_score(y_test, tree.predict(X_test))
print("Test set accuracy: {:.3f}".format(test_accuracy))

Training set accuracy: 0.670
Test set accuracy: 0.670


In [23]:
# Predict on the test set
y_pred_tree = tree.predict(X_test)

print(classification_report(y_test, y_pred_tree))

                   precision    recall  f1-score   support

   Android_Adware       0.67      0.99      0.80     36860
Android_Scareware       0.56      0.02      0.04     18225

         accuracy                           0.67     55085
        macro avg       0.62      0.51      0.42     55085
     weighted avg       0.63      0.67      0.55     55085



## Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler to scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a Logistic Regression model with increased max_iter
log_reg = LogisticRegression(max_iter=1000, random_state=0)

# Fit the model to the scaled training data
log_reg.fit(X_train_scaled, y_train)

# Evaluate the model on the scaled data
training_accuracy = log_reg.score(X_train_scaled, y_train)
test_accuracy = log_reg.score(X_test_scaled, y_test)

print("Training set accuracy: {:.3f}".format(training_accuracy))
print("Test set accuracy: {:.3f}".format(test_accuracy))


Training set accuracy: 0.668
Test set accuracy: 0.668
