In [1]:
import pandas as pd
import numpy as np
import json

# Change data into one mode and two mode

## Load data

In [2]:
# Read the Excel workbook into `df` and preview its first rows.
SOURCE_WORKBOOK = "Krackhardt Web of Science Data v3.xlsx"
df = pd.read_excel(SOURCE_WORKBOOK)
df.head()

Unnamed: 0,Article,Description,Authors,Author Full Name,Document Title,Publication Name,Publication year
0,00TRDBDKSMJ,"Redudant Governance Structures - Rowley, T; Be...","Adler, PS; Kwon, SW","Adler, PS; Kwon, SW",Social capital: Prospects for a new concept,ACADEMY OF MANAGEMENT REVIEW,2002
1,00TRDBDKSMJ,,"Ahuja, G","Ahuja, G","Collaboration networks, structural holes, and ...",ADMINISTRATIVE SCIENCE QUARTERLY,2000
2,00TRDBDKSMJ,,"Dyer, JH; Nobeoka, K","Dyer, JH; Nobeoka, K",Creating and managing a high-performance knowl...,STRATEGIC MANAGEMENT JOURNAL,2000
3,00TRDBDKSMJ,,"Inkpen, AC; Tsang, EWK","Inkpen, AC; Tsang, EWK","Social capital, networks, and knowledge transfer",ACADEMY OF MANAGEMENT REVIEW,2005
4,00TRDBDKSMJ,,"Baum, JAC; Calabrese, T; Silverman, BS","Baum, JAC; Calabrese, T; Silverman, BS",Don't go it alone: Alliance network compositio...,STRATEGIC MANAGEMENT JOURNAL,2000


## Abnormal data detection

In [3]:
# Multiple records of the same Article / Document Title pair.
#
# GroupBy.size() counts rows per pair. GroupBy.count() would only count
# non-null cells column by column, so a repeated pair whose other columns
# are blank would not be reported.
article_citation = df.groupby(["Article", "Document Title"]).size()

# List of (Article, Document Title) tuples that occur on more than one row,
# in the (sorted) order of the group keys.
multi_citation = article_citation[article_citation > 1].index.tolist()
multi_citation

[('00TRDBDKSMJ',
  'Capacity Sharing Issue in an Electronic Co-Opetitive Network: A Simulative Approach'),
 ('00TRDBDKSMJ',
  "Collaboration partner portfolio along the growth of Chinese firms' innovation capability: configuration, evolution and pattern"),
 ('00TRDBDKSMJ',
  'Joint R&D projects: Experiences in the context of European technology policy'),
 ('00TRDBDKSMJ',
  'Network embeddedness and the exploration of novel technologies: Technological distance, betweenness centrality and density'),
 ('00TRDBDKSMJ',
  'With a Little Help from Our Colleagues: A Longitudinal Study of Social Networks for Innovation'),
 ('06SPBKMCDKSN', 'The Who of Systemic Thinking'),
 ('87DKSN', 'An equilibrium-correction model for dynamic network data'),
 ('87DKSN', 'BLOCKMODELS - INTERPRETATION AND EVALUATION'),
 ('87DKSN',
  'BRINGING THE INDIVIDUAL BACK IN - A STRUCTURAL-ANALYSIS OF THE INTERNAL MARKET FOR REPUTATION IN ORGANIZATIONS'),
 ('87DKSN', 'Cognitive inconsistencies and non-symmetric friendshi

In [4]:
# Articles that carry more than one non-null Description.

# Keep only the rows where a Description is present.
described = df.loc[df["Description"].notna(), ["Article", "Description"]]

# Number of Description entries per article.
article_description = described.groupby("Article").count()

# Article codes with two or more descriptions.
has_several = article_description["Description"] > 1
multi_description = article_description.index[has_several].tolist()
multi_description

['87DKSN']

## Two mode

In [5]:
# Axis labels of the two-mode matrix, in order of first appearance in `df`:
# one row per distinct Document Title, one column per distinct Article.
rows = pd.unique(df['Document Title'])
cols = pd.unique(df['Article'])

# Integer matrix of zeros, to be filled with occurrence counts.
empty_counts = np.zeros((len(rows), len(cols)), dtype=int)
two_mode = pd.DataFrame(data=empty_counts, index=rows, columns=cols)

# Label -> positional index lookups for both axes.
row_dict = {title: position for position, title in enumerate(rows)}
col_dict = {article: position for position, article in enumerate(cols)}

In [6]:
# Fill the two-mode matrix: each row of `df` adds one to the cell at
# (Document Title, Article).

# Reset first so that re-running this cell does not accumulate counts on
# top of the values left by a previous run.
two_mode.loc[:, :] = 0

# Iterate over the two columns in lockstep instead of looking each value up
# by index label (avoids chained indexing on `df`).
for title, article in zip(df['Document Title'], df['Article']):
    two_mode.at[title, article] += 1

two_mode.head()

Unnamed: 0,00TRDBDKSMJ,92DKNO:SFA,90DKASQ,88DKSRNSPQ,88DKSN,93DKJRHHBR,87DKSN,94MKDKAMJ,07DDDKTSP,06SPBKMCDKSN,...,96JBCMDKGD,15PPAPJLDKEMJ,14KGDKLWSMR,95DKASQ,77LLDKHO,14PPJLDKPEC,14DKCPOSN,07DKTSJRSSSASS,02NEFDKSN,98PHFHDKCA
Social capital: Prospects for a new concept,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Collaboration networks, structural holes, and innovation: A longitudinal study",1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Creating and managing a high-performance knowledge-sharing network: The Toyota case,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Social capital, networks, and knowledge transfer",1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Don't go it alone: Alliance network composition and startups' performance in Canadian biotechnology,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## One mode

In [7]:
# Project the two-mode matrix onto the articles: entry (i, j) of A^T A sums,
# over all document titles, the product of the counts for articles i and j
# (the co-citation count).
A = two_mode.to_numpy()
G = A.T @ A
one_mode = pd.DataFrame(G, index=cols, columns=cols)

# Replace each diagonal entry with the number of rows of `df` that belong
# to that article.
for article in one_mode.columns:
    one_mode.loc[article, article] = (df["Article"] == article).sum()

one_mode.head()

Unnamed: 0,00TRDBDKSMJ,92DKNO:SFA,90DKASQ,88DKSRNSPQ,88DKSN,93DKJRHHBR,87DKSN,94MKDKAMJ,07DDDKTSP,06SPBKMCDKSN,...,96JBCMDKGD,15PPAPJLDKEMJ,14KGDKLWSMR,95DKASQ,77LLDKHO,14PPJLDKPEC,14DKCPOSN,07DKTSJRSSSASS,02NEFDKSN,98PHFHDKCA
00TRDBDKSMJ,846,74,24,11,5,6,10,14,2,2,...,0,0,1,2,0,0,0,0,0,0
92DKNO:SFA,74,628,50,30,28,32,43,31,7,3,...,0,0,0,0,0,0,0,0,0,0
90DKASQ,24,50,583,40,33,40,100,59,6,2,...,0,0,0,0,0,0,0,0,0,0
88DKSRNSPQ,11,30,40,420,20,28,29,17,9,5,...,0,0,0,0,0,0,0,0,0,0
88DKSN,5,28,33,20,438,12,144,25,102,9,...,0,0,0,0,0,0,0,0,1,1


## Output the data in csv format

In [8]:
# Write both matrices as CSV files (paths are relative to the working directory).
for matrix, csv_name in ((two_mode, "two_mode.csv"), (one_mode, "one_mode.csv")):
    matrix.to_csv(csv_name)

# Create Json Files

In [9]:
# Change the one-mode dataframe into a source-target-value edge list.
#
# The frame is built in one step: growing a DataFrame row by row with
# DataFrame.append is quadratic, and that method no longer exists in
# pandas >= 2.0.

node_labels = one_mode.columns.to_numpy()
weights = one_mode.to_numpy()

# Positions strictly above the diagonal, in row-major order: every unordered
# pair (i, j) with i < j exactly once, in the same order as a nested
# `for i ... for j in range(i + 1, n)` loop.
upper_i, upper_j = np.triu_indices(len(node_labels), k=1)

src_dst = pd.DataFrame({'source': node_labels[upper_i],
                        'target': node_labels[upper_j],
                        'value': weights[upper_i, upper_j]},
                       columns=['source', 'target', 'value'])

src_dst.head()

Unnamed: 0,source,target,value
0,00TRDBDKSMJ,92DKNO:SFA,74
1,00TRDBDKSMJ,90DKASQ,24
2,00TRDBDKSMJ,88DKSRNSPQ,11
3,00TRDBDKSMJ,88DKSN,5
4,00TRDBDKSMJ,93DKJRHHBR,6


## Link List

In [10]:
# Create the link list: one {"source", "target", "value"} record per row of
# `src_dst`, with the node labels replaced by integer positions.

# Distinct node labels in order of first appearance (all sources, then all
# targets). pd.concat is used because Series.append no longer exists in
# pandas >= 2.0.
indexes = pd.Index(pd.concat([src_dst['source'], src_dst['target']],
                             ignore_index=True).unique())

# Label -> position lookup, built once instead of calling get_loc per row.
position_of = {label: position for position, label in enumerate(indexes)}

# int(...) guarantees a plain Python integer, which json can serialise
# regardless of the dtype of the `value` column.
links_list = [{"source": position_of[source],
               "target": position_of[target],
               "value": int(value)}
              for source, target, value in zip(src_dst['source'],
                                               src_dst['target'],
                                               src_dst['value'])]

links_list[0: 10]

[{'source': 0, 'target': 1, 'value': 74},
 {'source': 0, 'target': 2, 'value': 24},
 {'source': 0, 'target': 3, 'value': 11},
 {'source': 0, 'target': 4, 'value': 5},
 {'source': 0, 'target': 5, 'value': 6},
 {'source': 0, 'target': 6, 'value': 10},
 {'source': 0, 'target': 7, 'value': 14},
 {'source': 0, 'target': 8, 'value': 2},
 {'source': 0, 'target': 9, 'value': 2},
 {'source': 0, 'target': 10, 'value': 19}]

## Node List

In [11]:
# Build the node list: one record per article, in `one_mode` column order.
#   name      - the article code
#   group     - four-digit year derived from the first two characters of the code
#   size      - the diagonal entry of `one_mode` for that article
#   meta_node - a Description of the article taken from `df` (None if absent)

# Two-digit prefixes greater than this are read as 19xx, the rest as 20xx.
CENTURY_PIVOT = 20

# First non-null Description per article, computed once for all nodes.
# GroupBy.first() skips missing values, so an article whose first row has no
# Description still gets one if any of its rows does.
first_description = df.groupby("Article")["Description"].first()

article_names = one_mode.columns.tolist()

nodes_list = []
for position, name in enumerate(article_names):
    two_digit_year = int(name[0: 2])
    if two_digit_year > CENTURY_PIVOT:
        year = 1900 + two_digit_year
    else:
        year = 2000 + two_digit_year

    description = first_description.get(name)
    nodes_list.append({"name": name,
                       "group": year,
                       "size": int(one_mode.iloc[position, position]),
                       # None serialises to JSON null; a NaN would be written
                       # as the bare token NaN, which is not valid JSON.
                       "meta_node": None if pd.isna(description) else description})

nodes_list[0: 5]

[{'name': '00TRDBDKSMJ',
  'group': 2000,
  'size': 846,
  'meta_node': 'Redudant Governance Structures - Rowley, T; Behrens, D; Krackhardt, D; Strategic Management Journal - March 2000'},
 {'name': '92DKNO:SFA',
  'group': 1992,
  'size': 628,
  'meta_node': 'The Strength of Strong Ties - Krackhardt, D - Networks & Organizations: Structure, Form, & Action - 1992'},
 {'name': '90DKASQ',
  'group': 1990,
  'size': 583,
  'meta_node': 'Assessing the Political Landscape - Krackhardt, D; Administrative Science Quarterly - June 1992'},
 {'name': '88DKSRNSPQ',
  'group': 1988,
  'size': 420,
  'meta_node': 'Informal Network and Organizational Crises - Krackhardt, D; Stern, RN; Social Psychology Quarterly - June 1988'},
 {'name': '88DKSN',
  'group': 1988,
  'size': 438,
  'meta_node': 'Predicting with Networks - Krackhardt, D; Social Networks - Dec. 1988'}]

## Output to files

In [12]:
# Bundle links and nodes into one document and serialise it as JSON text
# with sorted keys and a one-space indent.
json_prep = dict(links=links_list, nodes=nodes_list)
json_dump = json.dumps(json_prep, sort_keys=True, indent=1)

In [13]:
# Write the serialised JSON text to disk.
filename_out = 'link_node.json'

# The context manager closes the file even if the write raises.
with open(filename_out, 'w') as json_out:
    json_out.write(json_dump)