In [14]:
import pandas as pd
import numpy as np
import altair as alt

In [4]:
data = pd.read_csv('Android_Malware.csv', index=False)

  data = pd.read_csv('Android_Malware.csv')


## Initial EDA
### Data Integrity Inspection

In [5]:
data

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,172.217.6.202-10.42.0.211-443-50004-6,10.42.0.211,50004,172.217.6.202,443.0,6.0,13/06/2017 11:52:39,37027,1,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Android_Adware
1,1,172.217.6.202-10.42.0.211-443-35455-6,10.42.0.211,35455,172.217.6.202,443.0,6.0,13/06/2017 11:52:39,36653,1,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Android_Adware
2,2,131.253.61.68-10.42.0.211-443-51775-6,10.42.0.211,51775,131.253.61.68,443.0,6.0,13/06/2017 11:52:42,534099,8,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Android_Adware
3,3,131.253.61.68-10.42.0.211-443-51775-6,10.42.0.211,51775,131.253.61.68,443.0,6.0,13/06/2017 11:52:43,9309,3,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Android_Adware
4,4,131.253.61.68-10.42.0.211-443-51776-6,10.42.0.211,51776,131.253.61.68,443.0,6.0,13/06/2017 11:52:42,19890496,8,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Android_Adware
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355625,405,172.217.7.14-10.42.0.211-80-38405-6,172.217.7.14,80,10.42.0.211,38405.0,6.0,17/06/2017 01:29:11,126711,1,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
355626,406,10.42.0.211-10.42.0.1-7632-53-17,10.42.0.211,7632,10.42.0.1,53.0,17.0,17/06/2017 01:30:33,48012,1,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
355627,407,10.42.0.211-104.192.110.245-45970-443-6,10.42.0.211,45970,104.192.110.245,443.0,6.0,17/06/2017 01:29:45,20028018,11,...,20.0,367528.0,0.0,367528.0,367528.0,19660490.0,0.0,19660490.0,19660490.0,Benign
355628,408,10.42.0.211-10.42.0.1-51982-53-17,10.42.0.211,51982,10.42.0.1,53.0,17.0,17/06/2017 01:29:45,347926,1,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [6]:
d = data.sample(10)

### Data Sparcity checks

In [10]:
# If a column has only one unique value, it essentially does nothing from an ML standpoint and should be removed
data.columns[data.nunique()==1]

Index([' ECE Flag Count', ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate',
       ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate'],
      dtype='object')

In [12]:
# We drop the above columns from the data here
data = data.drop(columns=data.columns[data.nunique()==1])

In [8]:
# Percentage of each row that is unique
# Difficult to use this as a method of filtering based on the size of our data, as we see a lot of our categorical features have very low values
data.nunique()/len(data)*100

Unnamed: 0          3.981104
Flow ID            49.758176
 Source IP          0.713101
 Source Port       14.437196
 Destination IP     1.334533
                     ...    
Idle Mean          11.197874
 Idle Std           2.498946
 Idle Max          11.188314
 Idle Min          11.186908
Label               0.001125
Length: 86, dtype: float64

There are many rows and a few columns that are considered categoricals, so we don't want to just use the percentage of unique values per column as a deciding factor. Instead, we have to take into account the percent contribution of each unique value to each column. For example, if a column has 4 unique labels but one label takes up 99.9% of the rows, it likely does not add much modeling capability.

We can use a variance threshold to take care of numerical features that are highly sparse. This can be visualized by plotting change in selected features vs variance threshold, as we do below

In [33]:
from sklearn.feature_selection import VarianceThreshold

data_num = data.select_dtypes(exclude='object')

x_eda = data_num.iloc[:, :-1] # Take until the last column
y_eda = data_num.iloc[:, -1] # Take the last column

transform = VarianceThreshold()

variance_thresholds = np.arange(start=0.0, stop=0.5, step=0.05)

results = []

for t in variance_thresholds:
    
    transform = VarianceThreshold(threshold=t)
    
    X_sel = transform.fit_transform(x_eda)
    print()
    rows, cols = X_sel.shape
    n_features = cols
    print('Threshold=%.2f, Features=%d' % (t, n_features))
    
    results.append(n_features)
    
d2 = pd.DataFrame({'threshold': variance_thresholds, 'n_features': results})
alt.Chart(d2).mark_line().encode(
    x='threshold',
    y='n_features')


Threshold=0.00, Features=71

Threshold=0.05, Features=69

Threshold=0.10, Features=69

Threshold=0.15, Features=68

Threshold=0.20, Features=67

Threshold=0.25, Features=65

Threshold=0.30, Features=65

Threshold=0.35, Features=65

Threshold=0.40, Features=65

Threshold=0.45, Features=65


Using a variance threshold to filter features does not do much to narrow things down in this case, focusing on only our numerical features. As stated above, we need to use a different approach to deal with sparse categorical features

In [4]:
# Checking the balance of the output variable
data['Label'].value_counts()

Android_Adware         147443
Android_Scareware      117082
Android_SMS_Malware     67397
Benign                  23708
Name: Label, dtype: int64

In [9]:
# Finding null values per column
data.isna().sum()[data.isna().sum() > 0]

Flow ID                     1
 Bwd Packets/s              1
 Min Packet Length          1
 Max Packet Length          1
 Packet Length Mean         1
 Packet Length Std          1
 Packet Length Variance     1
FIN Flag Count              1
 SYN Flag Count             1
 RST Flag Count             1
 PSH Flag Count             1
 ACK Flag Count             1
 URG Flag Count             1
 CWE Flag Count             1
 ECE Flag Count             2
 Down/Up Ratio              2
 Average Packet Size        3
 Avg Fwd Segment Size       3
 Avg Bwd Segment Size       3
 Fwd Header Length.1        3
Fwd Avg Bytes/Bulk          3
 Fwd Avg Packets/Bulk       4
 Fwd Avg Bulk Rate          4
 Bwd Avg Bytes/Bulk         4
 Bwd Avg Packets/Bulk       4
Bwd Avg Bulk Rate           4
Subflow Fwd Packets         4
 Subflow Fwd Bytes          4
 Subflow Bwd Packets        4
 Subflow Bwd Bytes          4
Init_Win_bytes_forward      4
 Init_Win_bytes_backward    4
 act_data_pkt_fwd           4
 min_seg_s

In [25]:
type(data.dtypes[1])

numpy.dtype[object_]

In [27]:
type(np.dtype('object'))

numpy.dtype[object_]