In [1]:
from IPython.display import display
import pandas as pd
import numpy as np
import re
import itertools
import matplotlib.pyplot as plt
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout

# Locations of the input files; every path is built relative to data_folder.
data_folder = "data/"
# Vendor label table (one column per vendor, read in the loading cell).
vendors_selected_file = data_folder + "vendors_selected.csv.gz"
# Saved edge list of the name graph (columns 'a', 'b', 'weight').
name_graph_file = data_folder + "malware_names_graph.csv.gz"

In [2]:
# Load the names given by the selected vendors: every column is read as a
# string, the 'link' column becomes the index, and the literal value 'Clean'
# is treated as a missing name.
vendors = (
    pd.read_csv(vendors_selected_file, dtype=str)
    .set_index('link')
    .replace('Clean', np.nan)
)

In [3]:
# Splits a name into words on every non-word character
regex = re.compile(r'\W')
# Matches words whose second or second-to-last character is a digit
regex2 = re.compile(r'^\S\d|\d\S$')
# Matches words made only of hex digits (3 or more of them)
regex3 = re.compile(r'^[A-Fa-f0-9]{3,}$')


def filter_func(name):
    """Normalise one malware name into a dot-joined string of its kept words.

    The name is lower-cased and split on non-word characters. A word is kept
    only if it is longer than one character, is not matched by ``regex2``
    (digit in second or second-to-last position) and is not matched by
    ``regex3`` (a run of 3+ hex digits). The surviving words are joined
    with '.'.

    Parameters
    ----------
    name : scalar
        A single cell value; anything that is not missing is converted
        with ``str``.

    Returns
    -------
    str or float
        The normalised name, or ``np.nan`` when the input is missing or no
        word survives the filters.
    """
    # pd.isna instead of `name is np.nan`: an identity test only recognises
    # the np.nan singleton, so float('nan'), None or pd.NA would otherwise be
    # stringified into bogus names such as 'nan' or 'none'.
    if pd.isna(name):
        return np.nan

    words = regex.split(str(name).lower())
    kept = [
        word for word in words
        if len(word) > 1
        and not regex2.search(word)
        and not regex3.search(word)
    ]
    # Every kept word has at least two characters, so an empty list is the
    # only way to end up with an empty name.
    return '.'.join(kept) if kept else np.nan

In [5]:
%%time
# Apply the filter func to every cell of `vendors` (element-wise). The result
# goes into a new frame, so the unfiltered names stay available in `vendors`.
# The recorded run of this cell took about 30 s.
vendors_filtered = vendors.applymap(filter_func)

CPU times: user 30.5 s, sys: 157 ms, total: 30.6 s
Wall time: 30.6 s


Generate the graph for all names and save it for future use.

In [12]:
%%time
def generate_graph(df, chunk_size=10000, path_prefix=None):
    """Build the weighted co-occurrence edge list of the values in ``df``.

    For every row, each unordered pair of non-missing values adds 1.0 to the
    weight of the edge between them. A pair is sorted first, so (x, y) and
    (y, x) count as the same edge. Rows are processed ``chunk_size`` at a
    time; each chunk's aggregated edges are written to a gzip CSV named
    ``path_prefix + '_<row offset>'``, and the chunk files are then read
    back and merged.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample; every non-missing cell is a name.
    chunk_size : int, default 10000
        Number of rows aggregated per intermediate file.
    path_prefix : str, optional
        Prefix of the intermediate files. Defaults to the notebook-level
        ``name_graph_file``.

    Returns
    -------
    pandas.DataFrame
        Columns ``a``, ``b``, ``weight`` with one row per distinct edge,
        ordered by (a, b). Empty (with those columns) when ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')
    if path_prefix is None:
        path_prefix = name_graph_file
    columns = ['a', 'b', 'weight']

    chunk_paths = []
    for start in range(0, len(df), chunk_size):
        weights = {}
        for _, row in df.iloc[start:start + chunk_size].iterrows():
            for pair in itertools.combinations(row.dropna(), 2):
                edge = tuple(sorted(pair))
                weights[edge] = weights.get(edge, 0.0) + 1.0
        # Passing `columns` keeps the frame well-formed even when the chunk
        # produced no edge at all (e.g. rows with fewer than two names).
        chunk = pd.DataFrame(
            [(a, b, weight) for (a, b), weight in sorted(weights.items())],
            columns=columns,
        )
        path = path_prefix + '_{}'.format(start)
        chunk.to_csv(path, compression='gzip', index=False)
        chunk_paths.append(path)

    if not chunk_paths:
        return pd.DataFrame(columns=columns)

    # keep_default_na=False: a name spelled like one of pandas' NA markers
    # ('nan', 'null', ...) must survive the round trip as a string, otherwise
    # the groupby below silently drops its edges.
    chunks = [
        pd.read_csv(
            path,
            compression='gzip',
            keep_default_na=False,
            dtype={'a': str, 'b': str, 'weight': float},
        )
        for path in chunk_paths
    ]
    merged = pd.concat(chunks, ignore_index=True)
    return merged.groupby(['a', 'b'])['weight'].sum().reset_index()[columns]

# The graph is generated once and saved for reuse. To rebuild the saved file,
# uncomment the two lines below:
# frame_graph = generate_graph(vendors_filtered)
# frame_graph.to_csv(name_graph_file, compression='gzip')

# keep_default_na=False: a name that happens to spell one of pandas' default
# NA markers (e.g. 'nan' or 'null') must be read back as a string, not NaN.
frame_graph = pd.read_csv(
    name_graph_file,
    dtype={'a': str, 'b': str, 'weight': float},
    keep_default_na=False,
)
# Drop the index column that to_csv stored alongside the edge list.
frame_graph = frame_graph[['a', 'b', 'weight']]

In [27]:
# For every distinct name, count the edges it takes part in, whether it shows
# up as the 'a' or as the 'b' endpoint.
name_count = pd.Series(
    np.concatenate([frame_graph[column].values for column in ('a', 'b')])
).value_counts()

In [7]:
# Filter names by percentage of occurrence
# filtered_names = frozenset(name_count[name_count > len(name_count) * 0.001].index)
# display(len(filtered_names))

In [10]:
# Remove edges that only appear once
frame_graph2 = frame_graph[frame_graph['weight'] != 1]

# Names to keep: the 8000 names with the fewest edges as endpoint 'a', plus
# the 8000 names with the fewest edges as endpoint 'b'.
low_conn = set(frame_graph2['a'].value_counts().sort_values(ascending=True)[:8000].index)
low_conn.update(frame_graph2['b'].value_counts().sort_values(ascending=True)[:8000].index)

# Keep an edge only when both endpoints are in low_conn, and drop self-loops.
frame_graph2 = frame_graph2[frame_graph2['a'].isin(low_conn) & frame_graph2['b'].isin(low_conn)]
frame_graph2 = frame_graph2[frame_graph2['a'] != frame_graph2['b']]

# The drawing cell reads a 'len' edge attribute and the saved output of this
# cell shows a 'len' column, but no visible cell creates it, so a fresh kernel
# fails. Reconstructed from that output, where len == 10000 / weight on every
# displayed row (heavier edge -> smaller len).
# NOTE(review): confirm the 10000 scale against the original analysis.
frame_graph2 = frame_graph2.assign(len=10000 / frame_graph2['weight'])

display(frame_graph2['weight'].max())
# frame_graph2 = frame_graph2[frame_graph2['weight'] > frame_graph2['weight'].max() * 0.3]
display(frame_graph2['weight'].describe())
# frame_graph2 = frame_graph2[frame_graph2['weight'] > 4]
display(frame_graph2)

88.0

count    218.000000
mean       3.224771
std        6.130676
min        2.000000
25%        2.000000
50%        2.000000
75%        3.000000
max       88.000000
Name: weight, dtype: float64

Unnamed: 0_level_0,a,b,weight,len
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15496,adw_adserve,riskware.gator.hsri,2.0,5000.000000
19311,adw_mypcbackup,applicunwnt.mypcbackup,4.0,2500.000000
19431,adw_nextlive,trojan.nextlive.csjhvj,12.0,833.333333
19688,adw_outbrowse,downloader.chb,2.0,5000.000000
20446,adw_searchsuite,pup.optional.musictoolbar,2.0,5000.000000
20462,adw_searchsuite,trojan.toolbar.dfneuz,2.0,5000.000000
22230,adw_zwangi,trojan.adload.cpnsj,2.0,5000.000000
41065,adware.agent.odr,riskware.mutabaha.djzlsp,5.0,2000.000000
53809,adware.commad,not.virus.monitor.netmon,3.0,3333.333333
58149,adware.doubled,heuristic.lookslike.ad.spyware.agent,2.0,5000.000000


In [11]:
%%time
# networkx 2.x renamed from_pandas_dataframe to from_pandas_edgelist and no
# longer ships the old name; use whichever one the installed version provides
# (both take the same positional arguments).
graph_from_frame = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe

# One edge per row of frame_graph2, carrying its 'len' value as edge attribute.
# NOTE(review): presumably Graphviz uses 'len' as the preferred edge length -
# confirm for the 'sfdp' program.
G = graph_from_frame(frame_graph2, 'a', 'b', ['len'])

# One number per edge, in decreasing order, mapped to a colour by edge_cmap.
# A real list is passed because a Python 3 range object is not an array.
colors = list(range(G.number_of_edges()))[::-1]

# pos = graphviz_layout(G, prog='twopi', args='-scale=1000')
pos = graphviz_layout(G, prog='sfdp',) #args='-scale=1000')

# plt.figure(figsize=(30, 50))
# A4 print size (landscape, in inches)
plt.figure(figsize=(11.69, 8.27))
nx.draw_networkx(G, pos, font_size=5, node_size=1, edge_cmap=plt.cm.Blues, edge_color=colors, width=0.1)
# The figure is only written to disk; closing it keeps it out of the cell output.
plt.savefig('test.pdf', format='pdf', dpi=500)
plt.close()

CPU times: user 1.25 s, sys: 1.02 s, total: 2.26 s
Wall time: 1.57 s
