# Deduplication Files

In [None]:
import pandas as pd
import numpy as np
import re
import pandas_dedupe

In [None]:
# Directory holding the input TSV files and the de-duplicated outputs.
# File names are appended with plain `+`, so the trailing slash is required.
files_path = './dedupe_files/'

## Connectivity

In [None]:
# Load the merged connectivity table (tab-separated), then show its
# dimensions and the first few rows.
connectivity = pd.read_csv(f"{files_path}Connectivity_Merged.tsv", sep='\t')
print(connectivity.shape)
connectivity.head(5)

In [None]:
# Frequency of each distinct edge label.
connectivity['EdgeLabel'].value_counts()

It appears that there are a few duplicate edge labels here.

In [None]:
# Let us work on de-duping the Edge Labels:
# Cluster the rows on the 'EdgeLabel' column. Later cells rely on the
# 'cluster id' and 'confidence' columns of the returned frame.
# NOTE(review): pandas_dedupe presumably needs an interactive labelling
# session (or a previously saved model) - confirm before running unattended.
clustered_connect = pandas_dedupe.dedupe_dataframe(connectivity,['EdgeLabel'])

In [None]:
# Dimensions and first rows of the clustered connectivity frame.
print(clustered_connect.shape)
clustered_connect.head(5)

In [None]:
# For every cluster, list the distinct edge labels it contains (sorted).
(
    clustered_connect
    .groupby('cluster id')['EdgeLabel']
    .apply(lambda labels: list(np.unique(labels)))
)

In [None]:
# Inspect the rows whose labels look like candidates for merging.
similar_connect = ['developed by', 'produces', 'created by', 'authored']
clustered_connect.loc[clustered_connect['EdgeLabel'].isin(similar_connect)]

In [None]:
# After some analysis, 'produces' and 'developed by' are the same relation, and
# so are 'created by' and 'authored'; fold their clusters together
# (cluster 9 into 8, cluster 7 into 4).
# NOTE(review): these ids presumably come from one particular dedupe run -
# re-check them whenever the clustering is regenerated.
# Assign the result back rather than calling replace(inplace=True) on the
# selected column: the in-place form operates on a temporary Series, which
# warns on pandas 2.x and leaves the DataFrame untouched under Copy-on-Write.
clustered_connect['cluster id'] = clustered_connect['cluster id'].replace({9: 8, 7: 4})
# Now let's look at the labels:
clustered_connect.groupby('cluster id')['EdgeLabel'].apply(lambda x: list(np.unique(x)))

In [None]:
# Pick one representative label per cluster and attach it to every row.
# np.unique returns the labels sorted, so the representative is the first
# label of each cluster in sort order.
cl_conn_map = (
    clustered_connect
    .groupby('cluster id')['EdgeLabel']
    .apply(lambda labels: list(np.unique(labels)))
)
st_conn_dict = {cluster: labels[0] for cluster, labels in cl_conn_map.items()}
clustered_connect['std_EdgeLabel'] = clustered_connect['cluster id'].map(st_conn_dict)

In [None]:
# Preview the frame with the new 'std_EdgeLabel' column.
clustered_connect.head(n=5)

In [None]:
# Build the export: drop the helper columns and the raw label, then let the
# standardized label take over the 'EdgeLabel' name.
export_connect = clustered_connect.drop(['cluster id','confidence','EdgeLabel'],axis=1).rename(
    columns={'std_EdgeLabel': 'EdgeLabel'}
)
# Restore the input column order by name. Relabelling the columns by position
# would mislabel them unless 'EdgeLabel' happened to be the last input column,
# because the standardized column was appended at the end of the frame.
export_connect = export_connect[list(connectivity.columns)]
export_connect.to_csv(files_path+'dd_Connectivity_Merged.tsv',sep="\t",index=False)

## Edge Property

In [None]:
# Load the merged edge-property table (tab-separated), then show its
# dimensions and the first few rows.
edge_property = pd.read_csv(f"{files_path}EdgeProperty_Merged.tsv", sep='\t')
print(edge_property.shape)
edge_property.head(5)

In [None]:
# Discard every row that has a missing value in any column (in place),
# then re-check the dimensions and preview the result.
edge_property.dropna(axis=0, how='any', inplace=True)
print(edge_property.shape)
edge_property.head(5)

In [None]:
# Frequency of each distinct property name.
edge_property['Property'].value_counts()

In [None]:
# Look at the rows for two property names of interest.
edge_property.loc[edge_property['Property'].isin(['Licensed', 'ownership'])]

In [None]:
# Give every distinct property name its own cluster id, numbered in order of
# first appearance.
edge_dict = {
    prop: idx for idx, prop in enumerate(edge_property['Property'].unique())
}

In [None]:
# Display the property -> cluster id mapping so the ids can be read off.
edge_dict

In [None]:
# Manually point variant spellings at an existing cluster id.
# NOTE(review): the numeric ids depend on the order of the unique property
# names in the input - verify them against the mapping if the data changes.
edge_dict['Date '] = 0
edge_dict['Value '] = 1
edge_dict['ownership'] = 6
edge_dict['Date11'] = 0

In [None]:
# Tag each row with the cluster id of its property name.
edge_property['cluster id'] = edge_property['Property'].map(edge_dict)

In [None]:
# Pick one representative property name per cluster and attach it to every
# row. np.unique returns the names sorted, so the representative is the first
# name of each cluster in sort order.
cl_edgeprop_map = (
    edge_property
    .groupby('cluster id')['Property']
    .apply(lambda props: list(np.unique(props)))
)
st_ep_dict = {cluster: names[0] for cluster, names in cl_edgeprop_map.items()}
edge_property['std_Property'] = edge_property['cluster id'].map(st_ep_dict)

In [None]:
# Export the edge id, the standardized property name (under the original
# 'Property' header) and the value as a tab-separated file.
edge_export = edge_property[['EdgeID', 'std_Property', 'Value']].rename(
    columns={'std_Property': 'Property'}
)
edge_export.to_csv(f"{files_path}dd_EdgeProperty_Merged.tsv", sep="\t", index=False)

## Node Labels

In [None]:
# Load the node-label table (tab-separated), then show its dimensions and
# the first few rows.
node_lbls = pd.read_csv(f"{files_path}NodeLabels.tsv", sep='\t')
print(node_lbls.shape)
node_lbls.head(5)

In [None]:
# Frequency of each distinct node label.
node_lbls['label'].value_counts()

In [None]:
# Node Labels look clean enough, check for duplicates
# Drop fully identical rows (keeping the first occurrence) and compare the
# shape before and after. Printing only the original frame's shape would hide
# any duplicates that were removed.
node_lbls_dd = node_lbls.drop_duplicates(subset=None, keep='first', inplace=False)
print(node_lbls.shape, node_lbls_dd.shape)
print('duplicate rows removed:', len(node_lbls) - len(node_lbls_dd))

No adjustments needed.

## Node Property

In [None]:
# Load the node-property table (tab-separated), then show its dimensions
# and the first few rows.
node_prop = pd.read_csv(f"{files_path}NodeProperty.tsv", sep='\t')
print(node_prop.shape)
node_prop.head(5)

In [None]:
# Frequency of each distinct node property name.
node_prop['Property'].value_counts()

In [None]:
# Cluster the rows on the 'Property' column. Later cells rely on the
# 'cluster id' column of the returned frame.
# NOTE(review): pandas_dedupe presumably needs an interactive labelling
# session (or a previously saved model) - confirm before running unattended.
clustered_node_prop = pandas_dedupe.dedupe_dataframe(node_prop,['Property'])

In [None]:
# Dimensions and first rows of the clustered node-property frame.
print(clustered_node_prop.shape)
clustered_node_prop.head(5)

In [None]:
# For every cluster, list the distinct property names it contains (sorted).
(
    clustered_node_prop
    .groupby('cluster id')['Property']
    .apply(lambda props: list(np.unique(props)))
)

In [None]:
# Look at the rows for the release/launch style property names.
release_like = clustered_node_prop['Property'].isin(['initial release', 'release date', 'launched'])
clustered_node_prop.loc[release_like]

In [None]:
# Merge clusters that were judged to be the same property
# (22 -> 6, 12 -> 4, 9 -> 18, 23 -> 6, 0 -> 15).
# NOTE(review): these ids presumably come from one particular dedupe run -
# re-check them whenever the clustering is regenerated.
# Assign the result back rather than calling replace(inplace=True) on the
# selected column: the in-place form operates on a temporary Series, which
# warns on pandas 2.x and leaves the DataFrame untouched under Copy-on-Write.
clustered_node_prop['cluster id'] = clustered_node_prop['cluster id'].replace(
    {22: 6, 12: 4, 9: 18, 23: 6, 0: 15}
)

In [None]:
# Re-list the distinct property names per cluster after the manual merge.
(
    clustered_node_prop
    .groupby('cluster id')['Property']
    .apply(lambda props: list(np.unique(props)))
)

In [None]:
# Pick one representative property name per cluster and attach it to every
# row. np.unique returns the names sorted, so the representative is the first
# name of each cluster in sort order.
cl_np_map = (
    clustered_node_prop
    .groupby('cluster id')['Property']
    .apply(lambda props: list(np.unique(props)))
)
st_np_dict = {cluster: names[0] for cluster, names in cl_np_map.items()}
clustered_node_prop['std_Property'] = clustered_node_prop['cluster id'].map(st_np_dict)

In [None]:
# Preview the frame with the new 'std_Property' column.
clustered_node_prop.head(n=5)

In [None]:
# Frequency of each standardized property name.
clustered_node_prop['std_Property'].value_counts()

In [None]:
# Prefer 'launched' over 'initial release' as the standard name (purely a
# naming preference).
# Assign the result back rather than calling replace(inplace=True) on the
# column accessed from the frame: the in-place form operates on a temporary
# Series, which warns on pandas 2.x and leaves the DataFrame untouched under
# Copy-on-Write.
clustered_node_prop['std_Property'] = clustered_node_prop['std_Property'].replace(
    'initial release', 'launched'
)

In [None]:
# Export the node id, the standardized property name (under the original
# 'Property' header) and the value as a tab-separated file.
npop_export = clustered_node_prop[['NodeID', 'std_Property', 'Value']].rename(
    columns={'std_Property': 'Property'}
)
npop_export.to_csv(f"{files_path}dd_NodeProperty.tsv", sep="\t", index=False)