# Data Preprocessing

##### 18BCE1302, 18BCE1309

In [1]:
%matplotlib inline

import pandas as pd
import networkx as nx

In [2]:
# Load the raw node table (one row per tag) and preview the first rows.
nodes = pd.read_csv(filepath_or_buffer="datasets/stack_network_nodes.csv")
nodes.head(n=5)

Unnamed: 0,name,group,nodesize
0,html,6,272.45
1,css,6,341.17
2,hibernate,8,29.83
3,spring,8,52.84
4,ruby,3,70.14


In [3]:
# Null check for the nodes dataset: a True cell marks a missing value.
nodes.isnull()

Unnamed: 0,name,group,nodesize
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
110,False,False,False
111,False,False,False
112,False,False,False
113,False,False,False


In [4]:
# Load the raw edge list (Source tag, Target tag, Weight) and preview it.
edges = pd.read_csv(filepath_or_buffer="datasets/stack_network_edges.csv")
edges.head(n=5)

Unnamed: 0,Source,Target,Weight
0,azure,.net,20.933192
1,sql-server,.net,32.322524
2,asp.net,.net,48.40703
3,entity-framework,.net,24.370903
4,wpf,.net,32.350925


In [5]:
# Null check for the edges dataset. Note the polarity: this is the
# "not null" mask, so True means the value is present and False means missing.
edges.notnull()

Unnamed: 0,Source,Target,Weight
0,True,True,True
1,True,True,True
2,True,True,True
3,True,True,True
4,True,True,True
...,...,...,...
485,True,True,True
486,True,True,True
487,True,True,True
488,True,True,True


In [6]:
# Single-column frame of every edge's Source tag, relabelled to 'Label'
# so it can be stacked with the Target tags.
temp1 = edges[['Source']].rename(columns={'Source': 'Label'})

In [7]:
# Single-column frame of every edge's Target tag, relabelled to 'Label'
# so it can be stacked with the Source tags.
temp2 = edges[['Target']].rename(columns={'Target': 'Label'})

In [8]:
# Count how many edge rows each tag appears in, as either Source or Target.
# temp1/temp2 each hold one 'Label' column, so stacking them gives one row per
# edge endpoint; grouping by the label and counting yields the per-tag total.
tags = pd.concat([temp1, temp2])
tags = (
    tags['Label'].groupby(tags['Label']).agg(['count'])
    .reset_index()                          # 'Label' moves from index to column
    .rename(columns={'count': 'matches'})   # only 'count' exists to rename here
)

In [9]:
# Copy the current row index into a regular column named 'index'
# (it is renamed to 'Id' in a later cell). Run once: a second run would add
# another index column.
tags = tags.reset_index(drop=False)

In [10]:
# The column produced by reset_index becomes the numeric tag identifier.
tags = tags.rename(columns={'index': 'Id'})

In [11]:
# Plain Python list of every tag label, in the same order as the tags frame.
tag_list = tags['Label'].tolist()

In [12]:
# Preview the tag table: Id, Label and the number of edge rows it appears in.
tags.head(n=5)

Unnamed: 0,Id,Label,matches
0,0,.net,16
1,1,agile,2
2,2,ajax,14
3,3,amazon-web-services,8
4,4,android,6


In [13]:
# Keep only the edges whose Source and Target are both known tags.
edges = edges.loc[edges['Source'].isin(tag_list) & edges['Target'].isin(tag_list)]


In [14]:
# Resolve both edge endpoints to their numeric tag ids with two left joins
# against the tags table. The 'Id' column is renamed after each join so the
# second join does not collide with the first; the remaining shared columns
# ('Label', 'matches') receive pandas' default _x / _y suffixes.
tempdf = (
    edges
    # Source tag -> Sourceid
    .merge(tags, how='left', left_on='Source', right_on='Label')
    .rename(columns={'Id': 'Sourceid'})
    # Target tag -> Targetid
    .merge(tags, how='left', left_on='Target', right_on='Label')
    .rename(columns={'Id': 'Targetid'})
)

tempdf

Unnamed: 0,Source,Target,Weight,Sourceid,Label_x,matches_x,Targetid,Label_y,matches_y
0,azure,.net,20.933192,14,azure,10,0,.net,16
1,sql-server,.net,32.322524,93,sql-server,18,0,.net,16
2,asp.net,.net,48.407030,12,asp.net,26,0,.net,16
3,entity-framework,.net,24.370903,30,entity-framework,16,0,.net,16
4,wpf,.net,32.350925,111,wpf,12,0,.net,16
...,...,...,...,...,...,...,...,...,...
485,objective-c,xcode,43.418825,65,objective-c,10,113,xcode,8
486,swift,xcode,48.620335,94,swift,8,113,xcode,8
487,iphone,xcode,34.712865,45,iphone,8,113,xcode,8
488,ios,xcode,46.365091,44,ios,12,113,xcode,8


In [15]:
# Build the id-based edge list: keep the two numeric endpoint ids and the
# weight, then give the id columns the usual 'Source' / 'Target' names.
edges = (
    tempdf[['Sourceid', 'Targetid', 'Weight']]
    .rename(columns={'Sourceid': 'Source', 'Targetid': 'Target'})
)
edges

Unnamed: 0,Source,Target,Weight
0,14,0,20.933192
1,93,0,32.322524
2,12,0,48.407030
3,30,0,24.370903
4,111,0,32.350925
...,...,...,...
485,65,113,43.418825
486,94,113,48.620335
487,45,113,34.712865
488,44,113,46.365091


In [16]:
# Write the id-based edge list to disk; the row index is not written.
edges.to_csv(path_or_buf='datasets/edges.csv', index=False)

In [17]:
# Re-display the merged edge frame. The drop below is commented out, so the
# helper columns (Label_x, matches_x, Label_y, matches_y) are still present.
#tempdf.drop(['Label_x','matches_x','Label_y','matches_y'], axis=1, inplace=True)
tempdf

Unnamed: 0,Source,Target,Weight,Sourceid,Label_x,matches_x,Targetid,Label_y,matches_y
0,azure,.net,20.933192,14,azure,10,0,.net,16
1,sql-server,.net,32.322524,93,sql-server,18,0,.net,16
2,asp.net,.net,48.407030,12,asp.net,26,0,.net,16
3,entity-framework,.net,24.370903,30,entity-framework,16,0,.net,16
4,wpf,.net,32.350925,111,wpf,12,0,.net,16
...,...,...,...,...,...,...,...,...,...
485,objective-c,xcode,43.418825,65,objective-c,10,113,xcode,8
486,swift,xcode,48.620335,94,swift,8,113,xcode,8
487,iphone,xcode,34.712865,45,iphone,8,113,xcode,8
488,ios,xcode,46.365091,44,ios,12,113,xcode,8


In [18]:
# Attach each tag's node attributes by matching tags.Label against nodes.name.
# A left join keeps every row of tags.
tempdf1 = tags.merge(nodes, how='left', left_on='Label', right_on='name')
tempdf1

Unnamed: 0,Id,Label,matches,name,group,nodesize
0,0,.net,16,.net,2,75.08
1,1,agile,2,agile,12,12.22
2,2,ajax,14,ajax,6,35.41
3,3,amazon-web-services,8,amazon-web-services,9,30.05
4,4,android,6,android,4,229.86
...,...,...,...,...,...,...
110,110,wordpress,12,wordpress,6,46.74
111,111,wpf,12,wpf,2,19.38
112,112,xamarin,2,xamarin,2,11.18
113,113,xcode,8,xcode,4,11.37


In [19]:
# Remove 'name' (the right-hand join key that was matched against 'Label')
# and the 'matches' count from the node table.
# Rebinding rather than inplace=True leaves the frame displayed by the
# previous cell unmodified.
tempdf1 = tempdf1.drop(columns=['name', 'matches'])

In [20]:
# Use the numeric tag Id as the row index of the node table.
tempdf1 = tempdf1.set_index(keys='Id')


In [21]:
# Final node table: rows are indexed by the numeric Id and the tag text is
# kept in 'Label' (the original 'name' column was dropped in an earlier cell).
tempdf1

Unnamed: 0_level_0,Label,group,nodesize
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,.net,2,75.08
1,agile,12,12.22
2,ajax,6,35.41
3,amazon-web-services,9,30.05
4,android,4,229.86
...,...,...,...
110,wordpress,6,46.74
111,wpf,2,19.38
112,xamarin,2,11.18
113,xcode,4,11.37


In [22]:
# Write the processed node table to disk; the Id index is written as the
# first column.
tempdf1.to_csv(path_or_buf='datasets/nodes.csv', index=True)