In [249]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

In [252]:
# Location of the input CSV read by the load cell below; blank here, so it
# must be filled in before that cell can run.
path_to_file = ''

#### Load Dataset

In [None]:
# Read the raw activity records from the CSV at `path_to_file`.
cve_df = pd.read_csv(path_to_file)

# `read_csv` leaves timestamps as plain strings, but the daily-activity cell
# uses the `.dt` accessor on `nodeTime`, which only exists on datetime
# columns. Convert once here so every later cell sees real timestamps.
# NOTE(review): assumes `nodeTime` holds parseable date strings -- confirm.
cve_df['nodeTime'] = pd.to_datetime(cve_df['nodeTime'])

### Get only github activities
github_df = cve_df.loc[cve_df['platform'] == 'github'].reset_index(drop=True)

# Time span covered by the GitHub records (chronological, not lexicographic).
print(github_df['nodeTime'].min(), github_df['nodeTime'].max())

#### Connecting repos by common CVEs

In [257]:
# Distinct (repo, CVE) pairs seen on GitHub. The same table is used as both
# sides of the self-join below, differing only in the repo column name.
repo_cve_pairs = (
    github_df[['nodeID', 'informationID']]
    .drop_duplicates()
    .reset_index(drop=True)
)
repos_df1 = repo_cve_pairs.rename(columns={'nodeID': 'repoID1'})
repos_df2 = repo_cve_pairs.rename(columns={'nodeID': 'repoID2'})

# Self-join on the CVE id: one row per (repoID1, repoID2) combination for
# every CVE the two repos have in common. Each repo is also paired with itself.
repos_df = pd.merge(repos_df1, repos_df2, how='left', on='informationID')

# Edge weight = number of joined rows per repo pair, i.e. shared-CVE count.
pair_sizes = repos_df.groupby(['repoID1', 'repoID2']).size()
repos_df = pair_sizes.reset_index(name='weight')

#### Build graph for pairs of Repos connected by CVE

In [267]:
# Undirected repo graph: two repos are linked when they share at least one
# CVE; the shared-CVE count is stored in the edge attribute `weight`.
g_nx = nx.from_pandas_edgelist(
    repos_df,
    source='repoID1',
    target='repoID2',
    edge_attr=['weight'],
    create_using=nx.Graph(),
)
print('Nodes:', g_nx.number_of_nodes(), 'Edges:', g_nx.number_of_edges())

# Connected components ordered from largest to smallest. The graph is
# undirected, so these are ordinary (not "strongly") connected components.
lc_strong = sorted(nx.connected_components(g_nx), reverse=True, key=len)

# Subgraph view restricted to the largest component.
largest_component = lc_strong[0]
repos_lc = g_nx.subgraph(largest_component)

# Node list of that component; later cells use it to filter activity records
# and to assign integer ids.
repos_nodes = list(repos_lc.nodes())

#### Get daily activities

In [273]:
# Day bucket (YYYY-MM-DD string) for every GitHub record. The vendor and
# product sections group on this same column later on.
github_df['dailyTime'] = github_df['nodeTime'].dt.strftime('%Y-%m-%d')

# Restrict to repos that belong to the largest connected component.
in_component = github_df['nodeID'].isin(repos_nodes)
github_acts = github_df.loc[in_component].reset_index(drop=True)

# Count records per (repo, day, action type), then spread the action types
# into columns; combinations that never occur become 0.
action_counts = github_acts.groupby(['nodeID', 'dailyTime'])['actionType'].value_counts()
github_dayacts_df = action_counts.unstack(fill_value=0).reset_index()

# Turn the day strings back into datetimes so a date range can be built.
github_dayacts_df['dailyTime'] = pd.to_datetime(github_dayacts_df['dailyTime'])

# Only the PushEvent counts are carried forward.
github_dayacts_df = github_dayacts_df[['nodeID', 'dailyTime', 'PushEvent']].copy()

#### Generate missing inactive days

In [342]:
# Full calendar of days between the first and last observed activity date.
min_date = github_dayacts_df['dailyTime'].min()
max_date = github_dayacts_df['dailyTime'].max()
idx = pd.date_range(min_date, max_date)

# Index by day so each repo's series can be reindexed onto the calendar.
github_dayacts_df.set_index('dailyTime', inplace=True)

# One frame per repo: its PushEvent counts on every calendar day (0 on days
# with no recorded activity), tagged with the repo id.
df_concat = [
    daily['PushEvent'].reindex(idx, fill_value=0).to_frame().assign(nodeID=repo_id)
    for repo_id, daily in github_dayacts_df.groupby('nodeID')
]

# Stack all repos into one long frame indexed by day.
github_new_df = pd.concat(df_concat)

In [479]:
# Mean PushEvent count per repo over the whole calendar; the zero-filled
# inactive days are part of the average.
mean_push_by_repo = github_new_df.groupby('nodeID')['PushEvent'].mean()
github_avg_daily = mean_push_by_repo.reset_index(name='avg_daily')

#### Formatting to match DCRNN Input

In [None]:
# Move the date index into an ordinary `nodeTime` column.
# The previous in-place reset/rename was not re-runnable: executing the cell a
# second time inserted a fresh `index` column and renamed it too, leaving two
# `nodeTime` columns. Rebinding under a guard keeps the cell idempotent.
if 'nodeTime' not in github_new_df.columns:
    github_new_df = github_new_df.rename_axis('nodeTime').reset_index()

In [390]:
#### Build output files for DCRNN

# Output locations for the node-id list and the edge list. Blank here; set
# them before running this cell.
path_to_nodes = ''
path_to_edges = ''

# Edge list (source, target, weight) of the largest connected component.
rt_lc_df = nx.to_pandas_edgelist(repos_lc)
# Every existing edge gets weight 1, replacing the shared-CVE counts.
# NOTE(review): confirm that discarding the counts is intended.
rt_lc_df['weight'] = 1

# Nodes that already have a self-loop in the edge list.
is_self_loop = rt_lc_df['source'] == rt_lc_df['target']
self_nodes = rt_lc_df.loc[is_self_loop, 'source'].tolist()

nodes = list(set(repos_nodes) - set(self_nodes))

# Add a weight-0 self-loop for every node that lacks one.
# NOTE(review): the pair table joins each repo with itself, so self-loops are
# normally already present (with weight 1 after the line above) and this list
# may be empty -- confirm that is the intended self-distance.
entries = [{'source': node, 'target': node, 'weight': 0} for node in nodes]
# `DataFrame.append` was removed in pandas 2.0; concatenate instead. Skip the
# concat when there is nothing to add so column dtypes stay untouched.
if entries:
    rt_lc_df = pd.concat([rt_lc_df, pd.DataFrame(entries)], ignore_index=True)

# Encode nodes as consecutive integers (stored as strings), in node-list order.
rt_lc_encoding = {node: str(_i) for _i, node in enumerate(repos_nodes)}
rt_lc_df['source'] = rt_lc_df['source'].map(rt_lc_encoding)
rt_lc_df['target'] = rt_lc_df['target'].map(rt_lc_encoding)

# Rename to the from/to/distance column names used for the DCRNN edge file.
rt_lc_df = rt_lc_df.rename(columns={'source': 'from', 'target': 'to', 'weight': 'distance'})
rt_lc_df['distance'] = rt_lc_df['distance'].astype(float)

# Write the comma-separated id list, then the edge list.
with open(path_to_nodes, 'w') as f:
    f.write(",".join(rt_lc_encoding.values()))
rt_lc_df.to_csv(path_to_edges, index=False)

#### Generating timestep samples

In [397]:
# Attach each repo's integer-string id from `rt_lc_encoding`.
github_new_df['label'] = github_new_df['nodeID'].map(rt_lc_encoding)

# Wide layout: one row per day, one column per node label, cells hold the
# PushEvent count for that node on that day.
# NOTE(review): labels are strings, so the pivoted columns are expected to
# sort as '0', '1', '10', ... while the id file lists ids in numeric order --
# confirm the downstream loader does not rely on matching column order.
rt_day_acts_formatted = github_new_df.pivot_table(
    index='nodeTime',
    columns='label',
    values='PushEvent',
    aggfunc='first',
)
rt_day_acts_formatted.to_pickle('')

## Vendor Level

In [284]:
# Location of the CVE-to-vendor mapping CSV; blank here, set before running.
path_to_ven = ''

# Read the mapping once. The file used to be parsed twice, with `mapping_df`
# holding an otherwise unused second copy; the name is kept for compatibility.
mapping_df = pd.read_csv(path_to_ven)

# Align the key column name with the activity table.
vendor_df = mapping_df.rename(columns={'CVEID': 'informationID'})

# Attach vendors to every GitHub record whose CVE appears in the mapping.
# NOTE: the later per-day grouping of `github_ven` relies on `github_df`
# already carrying the `dailyTime` column created in the daily-activities cell.
github_ven = pd.merge(github_df, vendor_df, on='informationID', how='inner')

# Drop records that are identical on all of these fields.
github_ven = github_ven.drop_duplicates(
    ['actionType', 'informationID', 'nodeID', 'nodeTime', 'nodeUserID', 'Vendor']
).reset_index(drop=True)

# Distinct (CVE, vendor) pairs that actually occur in the GitHub activity.
vendor_df = github_ven[['informationID', 'Vendor']].drop_duplicates()
vendor_df1 = vendor_df.rename(columns={'Vendor': 'Target'})

# Self-join on the CVE id: one row per (Vendor, Target) combination for every
# CVE the two vendors share. Each vendor is also paired with itself.
vendors_df = pd.merge(vendor_df, vendor_df1, on='informationID', how='left')

# Edge weight = number of joined rows per vendor pair, i.e. shared-CVE count.
vendors_df = vendors_df.groupby(['Vendor', 'Target']).size().reset_index(name='weight')

#### Building Graph

In [295]:
# Undirected vendor graph: two vendors are linked when they share at least
# one CVE; the shared-CVE count is stored in the edge attribute `weight`.
g_nx_ven = nx.from_pandas_edgelist(
    vendors_df,
    source='Vendor',
    target='Target',
    edge_attr=['weight'],
    create_using=nx.Graph(),
)
print('Nodes:', g_nx_ven.number_of_nodes(), 'Edges:', g_nx_ven.number_of_edges())

# Connected components ordered from largest to smallest. The graph is
# undirected, so these are ordinary (not "strongly") connected components.
lc_strong_ven = sorted(nx.connected_components(g_nx_ven), reverse=True, key=len)

# Subgraph view restricted to the largest component.
largest_ven_component = lc_strong_ven[0]
ven_lc = g_nx_ven.subgraph(largest_ven_component)

# Node list of that component.
ven_nodes = list(ven_lc.nodes())

#### Getting activities by Vendor

In [471]:
# Keep only activity whose vendor lies in the largest connected component.
ven_in_component = github_ven['Vendor'].isin(ven_nodes)
github_ven = github_ven.loc[ven_in_component].reset_index(drop=True)

# Count records per (vendor, day, action type), then spread the action types
# into columns; combinations that never occur become 0.
ven_action_counts = github_ven.groupby(['Vendor', 'dailyTime'])['actionType'].value_counts()
github_dayacts_ven = ven_action_counts.unstack(fill_value=0).reset_index()

# Turn the day strings into datetimes so a date range can be built.
github_dayacts_ven['dailyTime'] = pd.to_datetime(github_dayacts_ven['dailyTime'])

# Only the PushEvent counts are carried forward.
github_dayacts_ven = github_dayacts_ven[['Vendor', 'dailyTime', 'PushEvent']].copy()

#### Generate missing inactive days

In [349]:
# Full calendar of days between the first and last observed activity date.
min_date = github_dayacts_ven['dailyTime'].min()
max_date = github_dayacts_ven['dailyTime'].max()
idx = pd.date_range(min_date, max_date)

# Index by day so each vendor's series can be reindexed onto the calendar.
github_dayacts_ven.set_index('dailyTime', inplace=True)

# One frame per vendor: its PushEvent counts on every calendar day (0 on days
# with no recorded activity), tagged with the vendor name.
df_concat = [
    daily['PushEvent'].reindex(idx, fill_value=0).to_frame().assign(Vendor=vendor)
    for vendor, daily in github_dayacts_ven.groupby('Vendor')
]

# Stack all vendors into one long frame indexed by day.
github_new_ven = pd.concat(df_concat)

#### Formatting to match DCRNN Input

In [416]:
# Move the date index into an ordinary `nodeTime` column.
# The previous in-place reset/rename was not re-runnable: executing the cell a
# second time inserted a fresh `index` column and renamed it too, leaving two
# `nodeTime` columns. Rebinding under a guard keeps the cell idempotent.
if 'nodeTime' not in github_new_ven.columns:
    github_new_ven = github_new_ven.rename_axis('nodeTime').reset_index()

In [419]:
#### Build output files for DCRNN

# Output locations for the node-id list and the edge list. Blank here; set
# them before running this cell.
path_to_nodes = ''
path_to_edges = ''

# Edge list (source, target, weight) of the largest connected component.
rt_lc_df = nx.to_pandas_edgelist(ven_lc)
# Every existing edge gets weight 1, replacing the shared-CVE counts.
# NOTE(review): confirm that discarding the counts is intended.
rt_lc_df['weight'] = 1

# Nodes that already have a self-loop in the edge list.
is_self_loop = rt_lc_df['source'] == rt_lc_df['target']
self_nodes = rt_lc_df.loc[is_self_loop, 'source'].tolist()

nodes = list(set(ven_nodes) - set(self_nodes))

# Add a weight-0 self-loop for every node that lacks one.
# NOTE(review): the pair table joins each vendor with itself, so self-loops
# are normally already present (with weight 1 after the line above) and this
# list may be empty -- confirm that is the intended self-distance.
entries = [{'source': node, 'target': node, 'weight': 0} for node in nodes]
# `DataFrame.append` was removed in pandas 2.0; concatenate instead. Skip the
# concat when there is nothing to add so column dtypes stay untouched.
if entries:
    rt_lc_df = pd.concat([rt_lc_df, pd.DataFrame(entries)], ignore_index=True)

# Encode nodes as consecutive integers (stored as strings), in node-list order.
rt_lc_encoding = {node: str(_i) for _i, node in enumerate(ven_nodes)}
rt_lc_df['source'] = rt_lc_df['source'].map(rt_lc_encoding)
rt_lc_df['target'] = rt_lc_df['target'].map(rt_lc_encoding)

# Rename to the from/to/distance column names used for the DCRNN edge file.
rt_lc_df = rt_lc_df.rename(columns={'source': 'from', 'target': 'to', 'weight': 'distance'})
rt_lc_df['distance'] = rt_lc_df['distance'].astype(float)

# Write the comma-separated id list, then the edge list.
with open(path_to_nodes, 'w') as f:
    f.write(",".join(rt_lc_encoding.values()))
rt_lc_df.to_csv(path_to_edges, index=False)

#### Generating timestep samples

In [422]:
# Attach each vendor's integer-string id from `rt_lc_encoding`.
github_new_ven['label'] = github_new_ven['Vendor'].map(rt_lc_encoding)

# Wide layout: one row per day, one column per node label, cells hold the
# PushEvent count for that node on that day.
# NOTE(review): labels are strings, so the pivoted columns are expected to
# sort as '0', '1', '10', ... while the id file lists ids in numeric order --
# confirm the downstream loader does not rely on matching column order.
rt_day_acts_formatted = github_new_ven.pivot_table(
    index='nodeTime',
    columns='label',
    values='PushEvent',
    aggfunc='first',
)
rt_day_acts_formatted.to_pickle('')

## Product Level

In [307]:
# Location of the CVE-to-product mapping CSV; blank here, set before running.
path_to_prod = ''

# Read the mapping once. The file used to be parsed twice, with `mapping_df`
# holding an otherwise unused second copy; the name is kept for compatibility.
mapping_df = pd.read_csv(path_to_prod)

# Align the key column name with the activity table.
product_df = mapping_df.rename(columns={'CVEID': 'informationID'})

# Attach products to every GitHub record whose CVE appears in the mapping.
# NOTE: the later per-day grouping of `github_prod` relies on `github_df`
# already carrying the `dailyTime` column created in the daily-activities cell.
github_prod = pd.merge(github_df, product_df, on='informationID', how='inner')

# Drop records that are identical on all of these fields.
github_prod = github_prod.drop_duplicates(
    ['actionType', 'informationID', 'nodeID', 'nodeTime', 'nodeUserID', 'Product']
).reset_index(drop=True)

# Distinct (CVE, product) pairs that actually occur in the GitHub activity.
product_df = github_prod[['informationID', 'Product']].drop_duplicates()
product_df1 = product_df.rename(columns={'Product': 'Target'})

# Self-join on the CVE id: one row per (Product, Target) combination for every
# CVE the two products share. Each product is also paired with itself.
products_df = pd.merge(product_df, product_df1, on='informationID', how='left')

# Edge weight = number of joined rows per product pair, i.e. shared-CVE count.
products_df = products_df.groupby(['Product', 'Target']).size().reset_index(name='weight')

#### Building Graph

In [318]:
# Undirected product graph: two products are linked when they share at least
# one CVE; the shared-CVE count is stored in the edge attribute `weight`.
g_nx_prod = nx.from_pandas_edgelist(
    products_df,
    source='Product',
    target='Target',
    edge_attr=['weight'],
    create_using=nx.Graph(),
)
print('Nodes: ', g_nx_prod.number_of_nodes(), 'Edges: ', g_nx_prod.number_of_edges())

# Connected components ordered from largest to smallest. The graph is
# undirected, so these are ordinary (not "strongly") connected components.
lc_strong_prod = sorted(nx.connected_components(g_nx_prod), reverse=True, key=len)

# Subgraph view restricted to the largest component.
largest_prod_component = lc_strong_prod[0]
prod_lc = g_nx_prod.subgraph(largest_prod_component)

# Node list of that component.
prod_nodes = list(prod_lc.nodes())

#### Getting activities by product

In [324]:
# Keep only activity whose product lies in the largest connected component.
prod_in_component = github_prod['Product'].isin(prod_nodes)
github_prod = github_prod.loc[prod_in_component].reset_index(drop=True)

# Count records per (product, day, action type), then spread the action types
# into columns; combinations that never occur become 0.
prod_action_counts = github_prod.groupby(['Product', 'dailyTime'])['actionType'].value_counts()
github_dayacts_prod = prod_action_counts.unstack(fill_value=0).reset_index()

# Turn the day strings into datetimes so a date range can be built.
github_dayacts_prod['dailyTime'] = pd.to_datetime(github_dayacts_prod['dailyTime'])

# Only the PushEvent counts are carried forward.
github_dayacts_prod = github_dayacts_prod[['Product', 'dailyTime', 'PushEvent']].copy()

#### Generate missing inactive days

In [364]:
# Full calendar of days between the first and last observed activity date.
min_date = github_dayacts_prod['dailyTime'].min()
max_date = github_dayacts_prod['dailyTime'].max()
idx = pd.date_range(min_date, max_date)

# Index by day so each product's series can be reindexed onto the calendar.
github_dayacts_prod.set_index('dailyTime', inplace=True)

# One frame per product: its PushEvent counts on every calendar day (0 on days
# with no recorded activity), tagged with the product name.
df_concat = [
    daily['PushEvent'].reindex(idx, fill_value=0).to_frame().assign(Product=product)
    for product, daily in github_dayacts_prod.groupby('Product')
]

# Stack all products into one long frame indexed by day.
github_new_prod = pd.concat(df_concat)

#### Formatting to match DCRNN Input

In [401]:
# Move the date index into an ordinary `nodeTime` column.
# The previous in-place reset/rename was not re-runnable: executing the cell a
# second time inserted a fresh `index` column and renamed it too, leaving two
# `nodeTime` columns. Rebinding under a guard keeps the cell idempotent.
if 'nodeTime' not in github_new_prod.columns:
    github_new_prod = github_new_prod.rename_axis('nodeTime').reset_index()

In [404]:
#### Build output files for DCRNN

# Output locations for the node-id list and the edge list.
# NOTE(review): these are absolute paths into one user's workspace; point them
# at a configurable data directory before sharing the notebook.
path_to_nodes = '/data/kinwaing_workspace/DCRNN/DCRNN/data/Github_Daily_Product/git_lc_nodes.txt'
path_to_edges = '/data/kinwaing_workspace/DCRNN/DCRNN/data/Github_Daily_Product/git_lc_df.csv'

# Edge list (source, target, weight) of the largest connected component.
rt_lc_df = nx.to_pandas_edgelist(prod_lc)
# Every existing edge gets weight 1, replacing the shared-CVE counts.
# NOTE(review): confirm that discarding the counts is intended.
rt_lc_df['weight'] = 1

# Nodes that already have a self-loop in the edge list.
is_self_loop = rt_lc_df['source'] == rt_lc_df['target']
self_nodes = rt_lc_df.loc[is_self_loop, 'source'].tolist()

nodes = list(set(prod_nodes) - set(self_nodes))

# Add a weight-0 self-loop for every node that lacks one.
# NOTE(review): the pair table joins each product with itself, so self-loops
# are normally already present (with weight 1 after the line above) and this
# list may be empty -- confirm that is the intended self-distance.
entries = [{'source': node, 'target': node, 'weight': 0} for node in nodes]
# `DataFrame.append` was removed in pandas 2.0; concatenate instead. Skip the
# concat when there is nothing to add so column dtypes stay untouched.
if entries:
    rt_lc_df = pd.concat([rt_lc_df, pd.DataFrame(entries)], ignore_index=True)

# Encode nodes as consecutive integers (stored as strings), in node-list order.
rt_lc_encoding = {node: str(_i) for _i, node in enumerate(prod_nodes)}
rt_lc_df['source'] = rt_lc_df['source'].map(rt_lc_encoding)
rt_lc_df['target'] = rt_lc_df['target'].map(rt_lc_encoding)

# Rename to the from/to/distance column names used for the DCRNN edge file.
rt_lc_df = rt_lc_df.rename(columns={'source': 'from', 'target': 'to', 'weight': 'distance'})
rt_lc_df['distance'] = rt_lc_df['distance'].astype(float)

# Write the comma-separated id list, then the edge list.
with open(path_to_nodes, 'w') as f:
    f.write(",".join(rt_lc_encoding.values()))
rt_lc_df.to_csv(path_to_edges, index=False)

#### Generating timestep samples

In [410]:
# Attach each product's integer-string id from `rt_lc_encoding`.
github_new_prod['label'] = github_new_prod['Product'].map(rt_lc_encoding)

# Wide layout: one row per day, one column per node label, cells hold the
# PushEvent count for that node on that day.
# NOTE(review): labels are strings, so the pivoted columns are expected to
# sort as '0', '1', '10', ... while the id file lists ids in numeric order --
# confirm the downstream loader does not rely on matching column order.
rt_day_acts_formatted = github_new_prod.pivot_table(
    index='nodeTime',
    columns='label',
    values='PushEvent',
    aggfunc='first',
)
rt_day_acts_formatted.to_pickle('')