# Feature Variation

The purpose of this notebook is to see how the features (imports in this case) vary through time.

The first thing is to check in small dataset how the features vary and then do the same for all the imports. If in both cases the features vary in the same way, the LR made for the small dataset can behave similarly when used on the bigger dataset.

In [2]:
import lib.data_loading as jcfg_data_loading
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np

dataset_name = "dataset"
numb_datasets = 6

datasets = []

# Load all samples with an array, where each ith position are samples
# up to the ith+1 submission of the target sample
for i in range(numb_datasets):
    datasets.append(pd.read_csv(dataset_name + str(i) + '.csv'))
    datasets[i] = datasets[i].set_index('md5')
    datasets[i].dropna(inplace=True, subset=['imports'])
    print('Final {0}th dataset size: {1}'.format(i, len(datasets[i])))

Final 0th dataset size: 2798
Final 1th dataset size: 31
Final 2th dataset size: 22
Final 3th dataset size: 121
Final 4th dataset size: 286
Final 5th dataset size: 465


In [41]:
# Parameters for the bag of words
# Split by the semicolon
count_vec_pattern = u'[^;]+'
# A call must be present at least in x samples
min_df = 2
max_df = 1.0

train_test = [datasets[0], pd.concat(datasets[1:])]

# Create the bag of words baseline
cv = CountVectorizer(token_pattern=count_vec_pattern, max_df=max_df, min_df=min_df)
cv.fit(train_test[0].imports)
cv2 = CountVectorizer(token_pattern=count_vec_pattern, max_df=max_df, min_df=min_df)
cv2.fit(train_test[1].imports)

train_features = list(cv.vocabulary_.keys())
test_features = list(cv2.vocabulary_.keys())
common_features = [i for i in train_features if i in test_features]

print('Number of train imports: {0}'.format(len(train_features)))
print('Number of test imports: {0}'.format(len(test_features)))
print('Common imports: {0} ({1:.3} of train)'.format(len(common_features), len(common_features)/len(train_features)))

Number of train imports: 10519
Number of test imports: 3976
Common imports: 3191 (0.303 of train)


In [58]:
all_imports = pd.read_csv('data/imports.csv')
all_imports['date'] = pd.to_datetime(all_imports['date'], format='%Y/%m/%d')
# Set date as index
all_imports = all_imports.set_index('date')
# Drop useless columns
all_imports.drop(labels=['md5', 'link', 'malware'], axis=1, inplace=True)
all_imports.dropna(inplace=True)

In [61]:
all_imports.imports.resample('M').apply(lambda x: print(x))

2013-04-16           getprocaddress;loadlibrary;getmodulehandle
2013-04-16    setenvironmentvariable;_unlink;copyfile;strnca...
2013-04-16                                          _corexemain
2013-04-16    setenvironmentvariable;isdbcsleadbyte;movefile...
2013-04-16                                                sleep
2013-04-16    getprocessheap;getprocaddress;createthread;hea...
2013-04-16    isprocessorfeaturepresent;setenvironmentvariab...
2013-04-16    exitprocess;dragacceptfiles;createfile;seteven...
2013-04-16    vfprintf;wcslen;calloc;exitprocess;tlsgetvalue...
2013-04-16                                          _corexemain
2013-04-17    sendmessagetimeout;getwindowsdirectory;getcapt...
2013-04-17    getwindowsdirectory;virtualfree;createdirector...
2013-04-17    isprocessorfeaturepresent;sendmessagetimeout;r...
2013-04-17    _initterm;_acmdln;__p__commode;??2@yapaxi@z;cr...
2013-04-17    _initterm;isdbcsleadbyte;modf;_memicmp;getproc...
2013-04-17    isprocessorfeaturepresent;

date
2013-04-30    None
2013-05-31    None
2013-06-30    None
2013-07-31    None
2013-08-31    None
2013-09-30    None
2013-10-31    None
2013-11-30    None
2013-12-31    None
2014-01-31    None
2014-02-28    None
2014-03-31    None
2014-04-30    None
2014-05-31    None
2014-06-30    None
2014-07-31    None
2014-08-31    None
2014-09-30    None
2014-10-31    None
2014-11-30    None
2014-12-31    None
2015-01-31    None
2015-02-28    None
2015-03-31    None
2015-04-30    None
2015-05-31    None
2015-06-30    None
2015-07-31    None
2015-08-31    None
2015-09-30    None
2015-10-31    None
2015-11-30    None
2015-12-31    None
2016-01-31    None
2016-02-29    None
2016-03-31    None
2016-04-30    None
2016-05-31    None
2016-06-30    None
2016-07-31    None
2016-08-31    None
2016-09-30    None
2016-10-31    None
Freq: M, Name: imports, dtype: object