In [None]:
import pandas as pd
import numpy as np

In [62]:
import tables

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import networkx as nx

In [66]:
def add_missing_data_git(df, entity='Vendor'):
    """Expand per-entity daily PushEvent counts onto a gap-free daily calendar.

    For each distinct value of ``entity`` the ``PushEvent`` series is
    re-indexed onto every calendar day between the earliest and the latest
    ``dailyTime`` found anywhere in ``df``; days without a record get 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``dailyTime`` (datetime-like), ``PushEvent`` and the
        column named by ``entity``. Each (entity, dailyTime) pair may occur at
        most once, because ``reindex`` rejects duplicate index labels.
        The frame is left untouched, so the calling cell can be re-run.
    entity : str, default 'Vendor'
        Name of the grouping column.

    Returns
    -------
    pandas.DataFrame
        Columns ``PushEvent`` and ``entity``. The index is the (unnamed) daily
        date range, repeated once per entity in sorted entity order.
        An empty input yields an empty frame with those two columns.
    """
    empty_result = pd.DataFrame(columns=['PushEvent', entity])
    if len(df) == 0:
        return empty_result

    # One shared calendar for all entities, spanning the whole observation window.
    full_range = pd.date_range(df['dailyTime'].min(), df['dailyTime'].max())

    frames = []
    for name, group in df.groupby(entity):
        # Index a per-group copy by date instead of mutating the caller's frame.
        counts = group.set_index('dailyTime')['PushEvent']
        counts = counts.reindex(full_range, fill_value=0)

        frame = counts.to_frame()
        frame[entity] = name
        frames.append(frame)

    if not frames:
        # groupby drops missing keys, so every row may have been discarded.
        return empty_result

    return pd.concat(frames)
    

In [67]:
def generate_input_files(G, nodes, file_nodes, file_edges):
    """Write the encoded node-id list and edge list of ``G`` to disk.

    Every edge of ``G`` is given distance 1 (an existing ``weight`` attribute
    is overwritten). Each node in ``nodes`` that has no self-loop in ``G``
    gets an additional self-loop row with distance 0. Node labels are then
    replaced by their position in ``nodes``, rendered as a string.

    Parameters
    ----------
    G : networkx graph
        Graph whose edges are exported via ``nx.to_pandas_edgelist``.
    nodes : sequence of hashable
        Node labels; the position of a label in this sequence becomes its id.
    file_nodes : str or path-like
        Destination of the comma-separated id list.
    file_edges : str or path-like
        Destination of the edge list CSV (columns ``from``, ``to``, ``distance``).

    Returns
    -------
    tuple
        ``(df, lc_encoding)``: the encoded edge frame and the label -> id dict.

    Raises
    ------
    KeyError
        If an edge endpoint of ``G`` is not contained in ``nodes``.
    """
    df = nx.to_pandas_edgelist(G)
    df['weight'] = 1

    # Labels that already have a self-loop in the graph.
    has_self_loop = set(df.loc[df['source'] == df['target'], 'source'])

    # Insert weight-0 self-loops for the remaining nodes. Walking `nodes` in
    # order (rather than a set difference) keeps the row order reproducible.
    entries = [
        {'source': node, 'target': node, 'weight': 0}
        for node in dict.fromkeys(nodes)
        if node not in has_self_loop
    ]

    if entries:
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        df = pd.concat([df, pd.DataFrame(entries)], sort=False)

    # Encode each label as its position in `nodes`; ids are strings, not ints.
    lc_encoding = {node: str(position) for position, node in enumerate(nodes)}
    df['source'] = df['source'].apply(lambda label: lc_encoding[label])
    df['target'] = df['target'].apply(lambda label: lc_encoding[label])

    # Rename columns to the names used in the output file.
    df.rename(columns={'source': 'from', 'target': 'to', 'weight': 'distance'}, inplace=True)
    df['distance'] = df['distance'].astype(float)

    # Write the id list, then the edge list.
    with open(file_nodes, 'w') as f:
        f.write(",".join(lc_encoding.values()))
    df.to_csv(file_edges, index=False)
    print('Files written succesfully')

    return df, lc_encoding

In [4]:
# Location of the raw dataset; left blank here and must be set before running.
path_to_file = ''

#### Load Dataset

In [None]:
# Load the full event dataset.
# NOTE(review): `ss` is never imported in the visible cells — confirm which
# module provides `load_data`, otherwise this fails on a fresh kernel.
cve_df = ss.load_data(path_to_file)

In [6]:
# Keep only the GitHub events and renumber the rows from zero.
github_df = cve_df[cve_df['platform'].eq('github')].reset_index(drop=True)

In [7]:
github_df.head()

Unnamed: 0,actionType,communityID,domain_linked,has_URL,informationID,links_to_external,nodeID,nodeTime,nodeUserID,parentID,platform,rootID
0,PullRequestReviewCommentEvent,,[],0,CVE-2016-1643,0,3QzVzsrO-Goc3xy_JRo58A/wwZ4LQv58D0ltk7KUib37Q,2017-04-01 00:03:00,tZWqlNYHz9xWRrmK8-QUKg,,github,
1,PullRequestReviewCommentEvent,,[],0,CVE-2011-0779,0,3QzVzsrO-Goc3xy_JRo58A/wwZ4LQv58D0ltk7KUib37Q,2017-04-01 00:03:00,tZWqlNYHz9xWRrmK8-QUKg,,github,
2,PullRequestReviewCommentEvent,,[],0,CVE-2016-1643,0,3QzVzsrO-Goc3xy_JRo58A/wwZ4LQv58D0ltk7KUib37Q,2017-04-01 00:03:05,tZWqlNYHz9xWRrmK8-QUKg,,github,
3,PullRequestReviewCommentEvent,,[],0,CVE-2011-0779,0,3QzVzsrO-Goc3xy_JRo58A/wwZ4LQv58D0ltk7KUib37Q,2017-04-01 00:03:05,tZWqlNYHz9xWRrmK8-QUKg,,github,
4,IssueCommentEvent,,[],0,CVE-2016-4658,0,rGi7-SHZU9HPq5FsthIIZA/AsV4DYI_UZeHihXm9e9GxA,2017-04-01 00:10:03,J3DaDjCzXqgSSJXbtBkaKg,,github,


In [8]:
# Show the earliest and latest event timestamps in the GitHub subset.
print(min(github_df['nodeTime']), max(github_df['nodeTime']))

2015-01-01 00:05:41 2018-03-31 23:41:48


## Vendor Level

In [168]:
# Per vendor, collect the products listed in the mapping table (repeats are kept).
# NOTE(review): `mapping_df` is only assigned in a later cell (pd.read_csv), so
# this cell fails on a fresh top-to-bottom run — presumably it was executed out of order.
df = mapping_df.groupby('Vendor')['Product'].apply(list).reset_index(name='list')

In [173]:
mapping_df.loc[mapping_df['Vendor'] == 'microsoft']

Unnamed: 0,CVEID,Vendor,Product,AccessVector,Severity,DOD Usage
19,CVE-1999-0015,microsoft,windows_95,NETWORK,MEDIUM,1.0
20,CVE-1999-0015,microsoft,windows_nt,NETWORK,MEDIUM,1.0
26,CVE-1999-0016,microsoft,winsock,NETWORK,MEDIUM,1.0
27,CVE-1999-0016,microsoft,windows_95,NETWORK,MEDIUM,1.0
28,CVE-1999-0016,microsoft,windows_nt,NETWORK,MEDIUM,1.0
42,CVE-1999-0278,microsoft,internet_information_server,NETWORK,MEDIUM,1.0
43,CVE-1999-0278,microsoft,windows_nt,NETWORK,MEDIUM,1.0
55,CVE-1999-0519,microsoft,outlook,NETWORK,HIGH,1.0
56,CVE-1999-0519,microsoft,windows_2000,NETWORK,HIGH,1.0
57,CVE-1999-0519,microsoft,windows_95,NETWORK,HIGH,1.0


In [172]:
df.loc[df['Vendor'] == 'microsoft']

Unnamed: 0,Vendor,list
1524,microsoft,"[windows_95, windows_nt, winsock, windows_95, ..."


In [76]:
# Location of the CVE -> vendor mapping CSV; left blank here and must be set before running.
path_to_ven = ''

In [77]:
# Raw mapping table, kept under its original column names.
# Note that the Product Level section rebinds `mapping_df` to a different file.
mapping_df = pd.read_csv(path_to_ven)

In [78]:
# Load the mapping again and align its key column name with the event data.
vendor_df = pd.read_csv(path_to_ven).rename(columns={'CVEID': 'informationID'})

In [79]:
# Attach vendor information to every GitHub event that mentions a mapped CVE.
github_ven = github_df.merge(vendor_df, on='informationID', how='inner')

In [80]:
# The mapping can list several products for one (CVE, vendor) pair, so the
# merge repeats events; keep a single row per event and vendor.
github_ven = github_ven.drop_duplicates(
    subset=['actionType', 'informationID', 'nodeID', 'nodeTime', 'nodeUserID', 'Vendor']
).reset_index(drop=True)

In [81]:
# Distinct (CVE, vendor) pairs that actually occur in the GitHub events.
vendor_df = github_ven[['informationID', 'Vendor']].drop_duplicates()

In [82]:
# Same pairs with the vendor column renamed, used as the right side of the self-join.
vendor_df1 = vendor_df.rename(columns={'Vendor': 'Target'})

In [83]:
# Pair up vendors that share a CVE (each vendor is also paired with itself).
vendors_df = vendor_df.merge(vendor_df1, on='informationID', how='left')

In [84]:
# Edge weight = number of CVEs shared by the (Vendor, Target) pair.
vendors_df = (
    vendors_df
    .groupby(['Vendor', 'Target'])
    .size()
    .reset_index(name='weight')
)

#### Building Graph

In [70]:
# Build an undirected, weighted vendor co-occurrence graph.
G_ven = nx.from_pandas_edgelist(
    vendors_df,
    source='Vendor',
    target='Target',
    edge_attr=['weight'],
    create_using=nx.Graph(),
)

In [71]:
G_ven.number_of_nodes(), G_ven.number_of_edges()

(651, 2148)

In [72]:
# G_ven is undirected, so these are plain connected components, largest first.
lc_ven = list(nx.connected_components(G_ven))
lc_ven.sort(key=len, reverse=True)

In [73]:
# Get the subgraph induced by the largest connected component.
# This rebinds `lc_ven` from the component list to a graph, so the cell
# is not meant to be re-run without re-running the previous one first.
lc_ven = G_ven.subgraph(lc_ven[0])

In [74]:
#Number of nodes and edges in largest connected component
lc_ven.number_of_nodes(), lc_ven.number_of_edges()

(569, 2047)

In [75]:
# Vendors that belong to the largest connected component.
ven_nodes = list(lc_ven)

#### Getting activities by Vendor

In [85]:
# Events of vendors inside the largest connected component only.
ven_acts = github_ven[github_ven['Vendor'].isin(ven_nodes)].reset_index(drop=True)

In [86]:
# Create a dailyTime field: the event timestamp truncated to a 'YYYY-MM-DD' string.
# The .dt accessor requires `nodeTime` to already have a datetime dtype.
ven_acts['dailyTime'] = ven_acts['nodeTime'].dt.strftime('%Y-%m-%d')

In [87]:
# Count events per vendor and day, one column per action type (0 where absent).
ven_dayacts = (
    ven_acts
    .groupby(['Vendor', 'dailyTime'])['actionType']
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
)

In [88]:
# Drop the 'actionType' label that unstack() leaves on the column index.
# Assign None rather than `del`: Index.name is a property without a deleter
# in current pandas, so `del ...columns.name` raises AttributeError.
ven_dayacts.columns.name = None

In [89]:
# Convert the 'YYYY-MM-DD' strings back to datetimes so a daily date range can be built from them.
ven_dayacts['dailyTime'] = pd.to_datetime(ven_dayacts['dailyTime'])

In [90]:
# Only the PushEvent counts are used from here on.
ven_dayacts = ven_dayacts.loc[:, ['Vendor', 'dailyTime', 'PushEvent']].copy()

#### Generate missing inactive days

In [92]:
# Add zero-count rows so every vendor has one row per calendar day in the observed range.
github_new_ven = add_missing_data_git(ven_dayacts, entity='Vendor')

In [94]:
# Average number of pushes per day for each vendor (inactive days count as 0).
ven_avg_df = (
    github_new_ven
    .groupby('Vendor')['PushEvent']
    .mean()
    .reset_index(name='avg_daily')
)

In [95]:
ven_avg_df.describe()

Unnamed: 0,avg_daily
count,569.0
mean,0.618939
std,3.114962
min,0.0
25%,0.004216
50%,0.028668
75%,0.132378
max,41.571669


### Filter out vendors whose average daily PushEvent count is not above the global average

In [97]:
# Keep vendors whose average daily activity exceeds the mean over all vendors.
# The threshold is computed from the data instead of being copied from a
# previous describe() output, so it stays correct when the input changes.
avg_nodes_ven = ven_avg_df.loc[ven_avg_df['avg_daily'] > ven_avg_df['avg_daily'].mean()]

In [98]:
# Reduce the filtered frame to the plain list of vendor names.
avg_nodes_ven = avg_nodes_ven['Vendor'].tolist()

In [99]:
# Keep edges whose source vendor is above-average; targets are not filtered here.
df_filter_ven = vendors_df[vendors_df['Vendor'].isin(avg_nodes_ven)].reset_index(drop=True)

In [101]:
# Directed graph over the filtered edge list.
G_filter_ven = nx.from_pandas_edgelist(
    df_filter_ven,
    source='Vendor',
    target='Target',
    edge_attr=['weight'],
    create_using=nx.DiGraph(),
)

In [102]:
# Strongly connected components of the directed graph, largest first.
lc_filter_ven = list(nx.strongly_connected_components(G_filter_ven))
lc_filter_ven.sort(key=len, reverse=True)

In [103]:
# Get the subgraph induced by the largest strongly connected component.
# This rebinds `lc_filter_ven` from the component list to a graph.
lc_filter_ven = G_filter_ven.subgraph(lc_filter_ven[0])

In [104]:
#Number of nodes and edges in largest connected component
lc_filter_ven.number_of_nodes(), lc_filter_ven.number_of_edges()

(66, 1032)

In [105]:
# Vendors in the largest strongly connected component; list order defines the node ids.
lc_filter_nodes_ven = list(lc_filter_ven)

### Construct edge list and node list

In [106]:
# Output locations for the edge list CSV and the node-id list; blank here and must be set before running.
edgelist_path_ven = ''
nodelist_path_ven = ''
# Note the argument order: the node-list path comes before the edge-list path.
edgelist_ven, encoding_ven = generate_input_files(lc_filter_ven, lc_filter_nodes_ven, nodelist_path_ven, edgelist_path_ven)

Files written succesfully


### Get activities for active vendors only

In [108]:
# Daily push counts of the vendors in the final component.
# Take an explicit copy: later cells modify this frame, and modifying a slice
# of github_new_ven would trigger pandas' SettingWithCopy warnings.
github_new_ven_filter = github_new_ven.loc[github_new_ven['Vendor'].isin(lc_filter_nodes_ven)].copy()

In [None]:
# Move the (unnamed) date index into a regular column called 'nodeTime'.
# The result is reassigned rather than modified with inplace=True, so a frame
# that was sliced out of github_new_ven is never mutated.
github_new_ven_filter = (
    github_new_ven_filter
    .reset_index(level=0)
    .rename(columns={'index': 'nodeTime'})
)

In [None]:
# Translate each vendor name to the string id written to the node-list file.
github_new_ven_filter['label'] = github_new_ven_filter['Vendor'].map(encoding_ven)

In [113]:
# Wide layout: one row per day, one column per encoded vendor id.
ven_day_acts_formatted = github_new_ven_filter.pivot_table(
    index='nodeTime',
    columns='label',
    values='PushEvent',
    aggfunc='first',
)

In [175]:
# Average daily pushes of the vendors that survived the filtering.
avg_ven_filter = (
    github_new_ven_filter
    .groupby('Vendor')['PushEvent']
    .mean()
    .reset_index(name='avg_acts')
)

In [176]:
avg_ven_filter.describe()

Unnamed: 0,avg_acts
count,66.0
mean,4.838776
std,8.015425
min,0.62226
25%,0.895447
50%,1.730185
75%,4.145025
max,41.571669


#### Generate h5 user features file

In [115]:
# Destination of the HDF5 feature file; blank here and must be set before running.
output_file = ''
# mode='w' overwrites any existing file; the table is stored under key 'df'.
ven_day_acts_formatted.to_hdf(output_file, key='df', mode='w')

## Product Level

In [116]:
# Location of the CVE -> product mapping CSV; left blank here and must be set before running.
path_to_prod = ''

In [117]:
# Raw mapping table for the product-level analysis.
# This rebinds `mapping_df`, replacing the table loaded in the Vendor Level section.
mapping_df = pd.read_csv(path_to_prod)

In [118]:
# Load the mapping again and align its key column name with the event data.
product_df = pd.read_csv(path_to_prod).rename(columns={'CVEID': 'informationID'})

In [119]:
# Attach product information to every GitHub event that mentions a mapped CVE.
github_prod = github_df.merge(product_df, on='informationID', how='inner')

In [120]:
# Keep a single row per event and product.
github_prod = github_prod.drop_duplicates(
    subset=['actionType', 'informationID', 'nodeID', 'nodeTime', 'nodeUserID', 'Product']
).reset_index(drop=True)

In [121]:
# Distinct (CVE, product) pairs that actually occur in the GitHub events.
product_df = github_prod[['informationID', 'Product']].drop_duplicates()

In [122]:
# Same pairs with the product column renamed, used as the right side of the self-join.
product_df1 = product_df.rename(columns={'Product': 'Target'})

In [123]:
# Pair up products that share a CVE (each product is also paired with itself).
products_df = product_df.merge(product_df1, on='informationID', how='left')

In [124]:
# Edge weight = number of CVEs shared by the (Product, Target) pair.
products_df = (
    products_df
    .groupby(['Product', 'Target'])
    .size()
    .reset_index(name='weight')
)

#### Building Graph

In [125]:
# Build an undirected, weighted product co-occurrence graph.
G_prod = nx.from_pandas_edgelist(
    products_df,
    source='Product',
    target='Target',
    edge_attr=['weight'],
    create_using=nx.Graph(),
)

In [126]:
G_prod.number_of_nodes(), G_prod.number_of_edges()

(5057, 250401)

In [127]:
# G_prod is undirected, so these are plain connected components, largest first.
lc_prod = list(nx.connected_components(G_prod))
lc_prod.sort(key=len, reverse=True)

In [128]:
# Get the subgraph induced by the largest connected component.
# This rebinds `lc_prod` from the component list to a graph, so the cell
# is not meant to be re-run without re-running the previous one first.
lc_prod = G_prod.subgraph(lc_prod[0])

In [129]:
#Number of nodes and edges in largest connected component
lc_prod.number_of_nodes(), lc_prod.number_of_edges()

(1632, 16198)

In [130]:
# Products that belong to the largest connected component.
prod_nodes = list(lc_prod)

#### Getting activities by Product

In [131]:
# Events of products inside the largest connected component only.
prod_acts = github_prod[github_prod['Product'].isin(prod_nodes)].reset_index(drop=True)

In [132]:
# Create a dailyTime field: the event timestamp truncated to a 'YYYY-MM-DD' string.
# The .dt accessor requires `nodeTime` to already have a datetime dtype.
prod_acts['dailyTime'] = prod_acts['nodeTime'].dt.strftime('%Y-%m-%d')

In [133]:
# Count events per product and day, one column per action type (0 where absent).
prod_dayacts = (
    prod_acts
    .groupby(['Product', 'dailyTime'])['actionType']
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
)

In [134]:
# Drop the 'actionType' label that unstack() leaves on the column index.
# Assign None rather than `del`: Index.name is a property without a deleter
# in current pandas, so `del ...columns.name` raises AttributeError.
prod_dayacts.columns.name = None

In [135]:
# Convert the 'YYYY-MM-DD' strings back to datetimes so a daily date range can be built from them.
prod_dayacts['dailyTime'] = pd.to_datetime(prod_dayacts['dailyTime'])

In [136]:
# Only the PushEvent counts are used from here on.
prod_dayacts = prod_dayacts.loc[:, ['Product', 'dailyTime', 'PushEvent']].copy()

#### Generate missing inactive days

In [137]:
# Add zero-count rows so every product has one row per calendar day in the observed range.
github_new_prod = add_missing_data_git(prod_dayacts, entity='Product')

In [138]:
# Average number of pushes per day for each product (inactive days count as 0).
prod_avg_df = (
    github_new_prod
    .groupby('Product')['PushEvent']
    .mean()
    .reset_index(name='avg_daily')
)

In [139]:
prod_avg_df.describe()

Unnamed: 0,avg_daily
count,1632.0
mean,0.308702
std,1.8463
min,0.0
25%,0.000843
50%,0.01855
75%,0.090219
max,41.312816


### Filter out products whose average daily PushEvent count is not above the global average

In [140]:
# Keep products whose average daily activity exceeds the mean over all products.
# The threshold is computed from the data instead of being copied from a
# previous describe() output, so it stays correct when the input changes.
avg_nodes_prod = prod_avg_df.loc[prod_avg_df['avg_daily'] > prod_avg_df['avg_daily'].mean()]

In [141]:
# Reduce the filtered frame to the plain list of product names.
avg_nodes_prod = avg_nodes_prod['Product'].tolist()

In [142]:
# Keep edges whose source product is above-average; targets are not filtered here.
df_filter_prod = products_df[products_df['Product'].isin(avg_nodes_prod)].reset_index(drop=True)

In [144]:
# Directed graph over the filtered edge list.
G_filter_prod = nx.from_pandas_edgelist(
    df_filter_prod,
    source='Product',
    target='Target',
    edge_attr=['weight'],
    create_using=nx.DiGraph(),
)

In [145]:
# Strongly connected components of the directed graph, largest first.
lc_filter_prod = list(nx.strongly_connected_components(G_filter_prod))
lc_filter_prod.sort(key=len, reverse=True)

In [146]:
# Get the subgraph induced by the largest strongly connected component.
# This rebinds `lc_filter_prod` from the component list to a graph.
lc_filter_prod = G_filter_prod.subgraph(lc_filter_prod[0])

In [147]:
#Number of nodes and edges in largest connected component
lc_filter_prod.number_of_nodes(), lc_filter_prod.number_of_edges()

(199, 5343)

In [148]:
# Products in the largest strongly connected component; list order defines the node ids.
lc_filter_nodes_prod = list(lc_filter_prod)

### Construct edge list and node list

In [149]:
# Output locations for the edge list CSV and the node-id list; blank here and must be set before running.
edgelist_path_prod = ''
nodelist_path_prod = ''
# Note the argument order: the node-list path comes before the edge-list path.
edgelist_prod, encoding_prod = generate_input_files(lc_filter_prod, lc_filter_nodes_prod, nodelist_path_prod, edgelist_path_prod)

Files written succesfully


### Get activities for active products only

In [150]:
# Daily push counts of the products in the final component.
# Take an explicit copy: later cells modify this frame, and modifying a slice
# of github_new_prod would trigger pandas' SettingWithCopy warnings.
github_new_prod_filter = github_new_prod.loc[github_new_prod['Product'].isin(lc_filter_nodes_prod)].copy()

In [None]:
# Move the (unnamed) date index into a regular column called 'nodeTime'.
# The result is reassigned rather than modified with inplace=True, so a frame
# that was sliced out of github_new_prod is never mutated.
github_new_prod_filter = (
    github_new_prod_filter
    .reset_index(level=0)
    .rename(columns={'index': 'nodeTime'})
)

In [None]:
# Translate each product name to the string id written to the node-list file.
github_new_prod_filter['label'] = github_new_prod_filter['Product'].map(encoding_prod)

In [153]:
# Wide layout: one row per day, one column per encoded product id.
prod_day_acts_formatted = github_new_prod_filter.pivot_table(
    index='nodeTime',
    columns='label',
    values='PushEvent',
    aggfunc='first',
)

In [177]:
# Average daily pushes of the products that survived the filtering.
avg_prod_filter = (
    github_new_prod_filter
    .groupby('Product')['PushEvent']
    .mean()
    .reset_index(name='avg_acts')
)

In [178]:
avg_prod_filter.describe()

Unnamed: 0,avg_acts
count,199.0
mean,2.250528
std,4.872136
min,0.312816
25%,0.493255
50%,0.777403
75%,1.87226
max,41.312816


#### Generate h5 user features file

In [155]:
# Destination of the HDF5 feature file; blank here and must be set before running.
output_file = ''
# mode='w' overwrites any existing file; the table is stored under key 'df'.
prod_day_acts_formatted.to_hdf(output_file, key='df', mode='w')