# **Malware Detection**

## Dataset: Benign & malicious PE Files for malware detection
##### * Context
This dataset is the result of my research on Machine Learning & Malware Detection.

It was built using a Python Library and contains benign and malicious data from PE Files.

##### * Inspiration
Machine Learning Antimalware

In [None]:
import numpy as np
import pandas as pd
import os

**Creating a DataFrame**

In [None]:
# Load the training set from a CSV; the path is relative to the current working directory.
df_train = pd.read_csv("./dataset_malwares.csv")

In [None]:
df_train.head()

In [None]:
df_train.columns

In [None]:
df_train.info()

In [None]:
df_train.Malware

**Malware is our target, 0 = Benign, 1 = Malware**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# List every column of the training frame that holds exactly one distinct value.
print("Columns having only 1 unique Value")
constant_columns = [
    name for name in df_train.columns
    if len(df_train[name].unique()) == 1
]
for name in constant_columns:
    print(name)

In [None]:
# Feature matrix for training: the full frame minus the file name, the target
# label, and the columns listed below.
excluded_train_columns = [
    'Name',
    'Malware',
    'e_magic',
    'SectionMaxEntropy',
    'SectionMaxRawsize',
    'SectionMaxVirtualsize',
    'SectionMinPhysical',
    'SectionMinVirtual',
    'SectionMinPointerData',
    'SectionMainChar',
]
X = df_train.drop(columns=excluded_train_columns)

In [None]:
# Load the separate test file and remove the same non-feature columns that are
# removed from the training frame (this file is not asked for a 'Malware' column).
df_test = pd.read_csv("./dataset_test.csv")
excluded_test_columns = [
    'Name',
    'e_magic',
    'SectionMaxEntropy',
    'SectionMaxRawsize',
    'SectionMaxVirtualsize',
    'SectionMinPhysical',
    'SectionMinVirtual',
    'SectionMinPointerData',
    'SectionMainChar',
]
X_t = df_test.drop(columns=excluded_test_columns)

In [None]:
# Target vector: the 'Malware' column of the training frame.
y=df_train['Malware']

In [None]:
y

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

**Train / Validation Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state=0 keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = split_parts

# **KNN Classifier**

In [None]:
# k-nearest-neighbours classifier configured with 2 neighbours, fitted on the training split.
knn_params = {'n_neighbors': 2}
neigh = KNeighborsClassifier(**knn_params)
neigh.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out validation rows with the fitted KNN model.
y_pred=neigh.predict(X_valid)

In [None]:
def c_r(y_valid,y_pred):
    """Plot a confusion-matrix heatmap and print a classification report.

    Parameters
    ----------
    y_valid : array-like
        Ground-truth labels of the validation rows.
    y_pred : array-like
        Labels predicted by the model for the same rows, in the same order.

    Returns
    -------
    None
        The heatmap is drawn on the current matplotlib axes and the report is
        written to stdout.
    """
    # confusion_matrix expects (y_true, y_pred): rows index the true class and
    # columns the predicted class, which is what the axis labels below state.
    matrix = confusion_matrix(y_valid, y_pred)
    ax = sns.heatmap(matrix, annot=True, fmt="d", cmap=plt.cm.Blues, cbar=False)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    # target_names are applied in label order, so 0 -> 'Benign' and 1 -> 'Malware'.
    print(classification_report(y_valid, y_pred, target_names=['Benign', 'Malware']))

In [None]:
c_r(y_valid,y_pred)

**Predictions on the Real Test Dataset**

In [None]:
# Predict labels for the separate test file's feature matrix (X_t) with the fitted KNN model.
real_preds = neigh.predict(X_t)

In [None]:
real_preds

In [None]:
df_test['Name']

In [None]:
# Print one "name:prediction" line per test file, pairing rows and predictions in order.
for file_name, predicted_label in zip(df_test['Name'], real_preds):
    print(f"{file_name}:{predicted_label}")

# **Random Forest**

In [None]:
# Re-read the training CSV, rebinding df_train to a fresh copy of the file contents.
df_train = pd.read_csv('./dataset_malwares.csv')

In [None]:
# Feature frame for the Random Forest section: everything except the file name,
# two header fields, and the target label.
rf_excluded_columns = ['Name', 'Machine', 'TimeDateStamp', 'Malware']
dropped_df = df_train.drop(columns=rf_excluded_columns)

In [None]:
features = ['MajorSubsystemVersion', 'MajorLinkerVersion', 'SizeOfCode', 'SizeOfImage', 'SizeOfHeaders', 'SizeOfInitializedData',
            'SizeOfUninitializedData', 'SizeOfStackReserve', 'SizeOfHeapReserve', 'NumberOfSymbols', 'SectionMaxChar']

# Build ONE figure with a row per feature (malware on the left, benign on the
# right). Creating the figure inside the loop produced a separate, mostly empty
# figure for every feature.
fig, axes = plt.subplots(len(features), 2, figsize=(10, 3 * len(features)), squeeze=False)

malware_rows = df_train[df_train['Malware']==1]
benign_rows = df_train[df_train['Malware']==0]

for (ax1, ax2), feature in zip(axes, features):
    # NOTE(review): sns.distplot and the 'bw' KDE keyword are deprecated in
    # recent seaborn releases; kept so the cell still runs on the seaborn
    # version this notebook was written against.
    sns.distplot(malware_rows[feature], ax=ax1, kde_kws={'bw': 0.1})
    ax1.set_title('Malware', fontsize=10)
    sns.distplot(benign_rows[feature], ax=ax2, kde_kws={'bw': 0.1})
    ax2.set_title('Benign', fontsize=10)

# Keep titles and x-labels of neighbouring rows from overlapping.
fig.tight_layout()

In [None]:
# Target labels for the Random Forest section.
y = df_train['Malware']
# Provisional binding only: X_train is reassigned by the train/validation split
# in the following cell.
X_train = dropped_df

In [None]:
# Split the Random Forest feature frame (dropped_df). The name `X` still refers
# to the KNN feature matrix, which was built with a different column selection,
# so splitting `X` here would fit the forest on columns that do not match
# dropped_df.
X_train, X_valid, y_train, y_valid = train_test_split(dropped_df, y, test_size=0.2, random_state=0)

In [None]:
print("Number of used features:", X_train.shape[1])

In [None]:
# Random Forest configuration:
#   n_estimators=100 -> number of trees
#   max_depth=16     -> depth cap per tree
#   oob_score=True   -> also compute the out-of-bag score
#   random_state=0   -> fixed seed for reproducibility
rf_params = dict(n_estimators=100, random_state=0, oob_score=True, max_depth=16)
clf = RandomForestClassifier(**rf_params)

# Train on the training split; the fitted estimator is the cell's displayed value.
clf.fit(X_train, y_train)

In [None]:
# Predict validation labels with the fitted Random Forest; this rebinds y_pred from the KNN section.
y_pred = clf.predict(X_valid)

In [None]:
c_r(y_valid,y_pred)

In [None]:
importance = clf.feature_importances_

# Take the names from the frame the forest was actually fitted on, so every
# importance value is paired with its own column. Using another frame's columns
# would mislabel the bars, and zip() would hide any length mismatch.
feature_names = X_train.columns.values
assert len(feature_names) == len(importance), "feature names do not match the fitted model"

importance_dict = dict(zip(feature_names, importance))
# Most important feature first, so it appears at the top of the horizontal bar chart.
sorted_importance = dict(sorted(importance_dict.items(), key=lambda x: x[1], reverse=True))

plt.figure(figsize=(10, 20))
sns.barplot(x=list(sorted_importance.values()), y=list(sorted_importance.keys()), palette='mako')
plt.xlabel('Importance Value')
plt.ylabel('Feature Name')
plt.title('Feature Importance in Random Forest Classifier')
plt.show()