In [3]:
import pandas as pd
import sys
import re
import numpy as np
import csv
import difflib 

In [24]:
# Load the interim IEEE author table and the list of DOIs under study.
# papers_to_study.txt is read without a header row; its first column is
# turned into a plain Python list.
ieee_orig = pd.read_csv('../../data/interim/ieee_author_df.csv')
papers = pd.read_csv('../../data/processed/papers_to_study.txt', header=None)[0].tolist()

In [5]:
# Display the raw IEEE author table as loaded from disk.
ieee_orig

Unnamed: 0,Year,DOI,Title,Number of Authors,Author Position,Author Name,Author ID,Author Affiliation,One Affiliation
0,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,1.0,Michael Bostock,https://ieeexplore.ieee.org/author/37591067400,"Computer Science Department, Stanford Universi...",True
1,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,2.0,Vadim Ogievetsky,https://ieeexplore.ieee.org/author/38016292400,"Computer Science Department, Stanford Universi...",True
2,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,3.0,Jeffrey Heer,https://ieeexplore.ieee.org/author/37550791300,"Computer Science Department, Stanford Universi...",True
3,1991,10.1109/VISUAL.1991.175815,Tree-maps: a space-filling approach to the vis...,2.0,1.0,B. Johnson,https://ieeexplore.ieee.org/author/37381975300,Department of Computer Science & Human-Compute...,True
4,1991,10.1109/VISUAL.1991.175815,Tree-maps: a space-filling approach to the vis...,2.0,2.0,B. Shneiderman,https://ieeexplore.ieee.org/author/37283016400,Department of Computer Science & Human-Compute...,True
...,...,...,...,...,...,...,...,...,...
12403,2021,10.1109/TVCG.2020.3032984,Understanding Missing Links in Bipartite Netwo...,4.0,1.0,Jian Zhao,,"School of Computer Science, University of Wate...",True
12404,2021,10.1109/TVCG.2020.3032984,Understanding Missing Links in Bipartite Netwo...,4.0,2.0,Maoyuan Sun,,"Department of Computer Science, Northern Illin...",True
12405,2021,10.1109/TVCG.2020.3032984,Understanding Missing Links in Bipartite Netwo...,4.0,3.0,Francine Chen,,"Research, FXPAL, Palo Alto, California, United...",True
12406,2021,10.1109/TVCG.2020.3032984,Understanding Missing Links in Bipartite Netwo...,4.0,4.0,Patrick Chui,,"Research, FXPAL, Palo Alto, California, United...",True


In [6]:
def update_ieee_orig(DF): # df here is ieee_orig
	"""Return a corrected copy of ieee_orig.

	ieee_orig is wrong in '10.1109/TVCG.2008.157' as it contains an
	additional author that shouldn't be there; it also lacks author info
	for '10.1109/VIS.1999.10000'.

	What this function does:

	* removes any existing rows for '10.1109/VIS.1999.10000';
	* drops the first row of '10.1109/TVCG.2008.157' and decrements
	  'Number of Authors' and 'Author Position' by one on the remaining
	  rows of that paper (skipped when the paper is not in DF);
	* appends three manually entered author rows for
	  '10.1109/VIS.1999.10000'.

	Args:
		DF: the IEEE author dataframe. It is not modified.

	Returns:
		A new dataframe with a fresh 0..n-1 index; the manually added
		rows come last.
	"""
	vis99_doi = '10.1109/VIS.1999.10000'
	tvcg08_doi = '10.1109/TVCG.2008.157'

	# DataFrame.drop returns a new frame, so the caller's DF stays untouched.
	df = DF.drop(DF[DF.DOI == vis99_doi].index)

	tvcg08_rows = df.index[df.DOI == tvcg08_doi].tolist()
	if tvcg08_rows:
		# The first row of this paper is the author that should not be there.
		df = df.drop([tvcg08_rows[0]])
		is_tvcg08 = df.DOI == tvcg08_doi
		df.loc[is_tvcg08, 'Number of Authors'] -= 1
		df.loc[is_tvcg08, 'Author Position'] -= 1.0

	vis99_authors = ['Daniel Cohen-Or', 'David Levin', 'Offir Remez']
	fill_data_df = pd.DataFrame([
		{
			'Year': 1999,
			'DOI': vis99_doi,
			'Title': 'Progressive Compression of Arbitrary Triangular Meshes',
			'Number of Authors': len(vis99_authors),
			'Author Position': position,
			'Author Name': name,
			'Author ID': np.nan,
			'Author Affiliation': 'Tel Aviv University',
			'One Affiliation': True,
		}
		for position, name in enumerate(vis99_authors, start=1)
	])
	# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
	return pd.concat([df, fill_data_df], ignore_index=True)

In [7]:
# Apply the manual corrections; ieee_orig itself is left unchanged.
ieee = update_ieee_orig(ieee_orig)

In [8]:
# Display the corrected IEEE author table.
ieee

Unnamed: 0,Year,DOI,Title,Number of Authors,Author Position,Author Name,Author ID,Author Affiliation,One Affiliation
0,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,1.0,Michael Bostock,https://ieeexplore.ieee.org/author/37591067400,"Computer Science Department, Stanford Universi...",True
1,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,2.0,Vadim Ogievetsky,https://ieeexplore.ieee.org/author/38016292400,"Computer Science Department, Stanford Universi...",True
2,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,3.0,Jeffrey Heer,https://ieeexplore.ieee.org/author/37550791300,"Computer Science Department, Stanford Universi...",True
3,1991,10.1109/VISUAL.1991.175815,Tree-maps: a space-filling approach to the vis...,2.0,1.0,B. Johnson,https://ieeexplore.ieee.org/author/37381975300,Department of Computer Science & Human-Compute...,True
4,1991,10.1109/VISUAL.1991.175815,Tree-maps: a space-filling approach to the vis...,2.0,2.0,B. Shneiderman,https://ieeexplore.ieee.org/author/37283016400,Department of Computer Science & Human-Compute...,True
...,...,...,...,...,...,...,...,...,...
12404,2021,10.1109/TVCG.2020.3032984,Understanding Missing Links in Bipartite Netwo...,4.0,4.0,Patrick Chui,,"Research, FXPAL, Palo Alto, California, United...",True
12405,2021,10.1109/TVCG.2021.3114959,Understanding Data Visualization Design Practice,1.0,1.0,Paul Parsons,https://ieeexplore.ieee.org/author/37085378772,"Purdue University, United States",True
12406,1999,10.1109/VIS.1999.10000,Progressive Compression of Arbitrary Triangula...,3.0,1.0,Daniel Cohen-Or,,Tel Aviv University,True
12407,1999,10.1109/VIS.1999.10000,Progressive Compression of Arbitrary Triangula...,3.0,2.0,David Levin,,Tel Aviv University,True


In [9]:
# (rows, columns) of the corrected IEEE author table.
ieee.shape

(12409, 9)

In [11]:
# Shape of the subset of rows whose 'One Affiliation' flag is False.
ieee[ieee['One Affiliation'] == False].shape

(333, 9)

In [14]:
# Shape of the subset of rows with a missing 'Author Affiliation'.
ieee[ieee['Author Affiliation'].isnull()].shape

(165, 9)

In [15]:
# Load the interim OpenAlex author table.
alex = pd.read_csv('../../data/interim/openalex_author_df.csv')

In [16]:
def get_diff_dois(IEEE, ALEX): # ieee, alex
	"""Return the DOIs where alex disagrees with ieee in Number of Authors.

	For every DOI in IEEE, the 'Number of Authors' value of its first row
	is compared with the value of the first row for that DOI in ALEX.

	Args:
		IEEE: dataframe with 'DOI' and 'Number of Authors' columns.
		ALEX: dataframe with 'DOI' and 'Number of Authors' columns.

	Returns:
		A list of DOIs, in order of first appearance in IEEE, whose author
		count differs between the two frames. A DOI that has no rows in
		ALEX at all is also reported as differing.
	"""
	def _first_count_by_doi(df):
		# One pass per frame instead of one boolean scan per DOI.
		first_rows = df.drop_duplicates(subset='DOI', keep='first')
		return dict(zip(
			first_rows['DOI'].tolist(),
			first_rows['Number of Authors'].tolist()))

	ieee_counts = _first_count_by_doi(IEEE)
	alex_counts = _first_count_by_doi(ALEX)

	missing = object()
	diff_dois = []
	for doi, ieee_n in ieee_counts.items():
		alex_n = alex_counts.get(doi, missing)
		# A DOI absent from ALEX cannot agree with IEEE, so flag it as well
		# rather than failing with an IndexError.
		if alex_n is missing or ieee_n != alex_n:
			diff_dois.append(doi)
	return diff_dois

In [17]:
def get_alex_new(IEEE, ALEX, DIFF_DOIS):
	"""Replace alex rows of the given DOIs with the corresponding ieee rows.

	For DOIs where alex is wrong in Number of Authors, the rows are dropped
	from alex and the ieee rows for those DOIs are appended instead.

	Args:
		IEEE: the IEEE author dataframe. Only its first six columns (taken
			by position) are copied over; presumably these are the columns
			shared with alex -- verify against the data.
		ALEX: the OpenAlex author dataframe. It is not modified.
		DIFF_DOIS: list of DOIs whose rows should be replaced.

	Returns:
		alex_new, where data of Number of Authors is correct. The index is
		reset and the rows taken from IEEE come last.
	"""
	df_to_append = IEEE[IEEE.DOI.isin(DIFF_DOIS)].iloc[:, 0:6]
	alex_dropped = ALEX.drop(ALEX[ALEX.DOI.isin(DIFF_DOIS)].index)
	# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
	alex_new = pd.concat([alex_dropped, df_to_append], ignore_index=True)
	return alex_new

In [18]:
def get_sorted_dfs(IEEE, ALEX_NEW, PAPERS):
	"""Sort ieee and alex author df by paper index and author position.

	A 'Paper Index' column (the position of each row's DOI in PAPERS) is
	added to both frames. The ieee columns then get the prefix 'IEEE ', and
	both frames are sorted by paper index and author position.

	Note: the 'Paper Index' column is written into the IEEE and ALEX_NEW
	objects passed in, so the caller's frames gain that column too.

	Args:
		IEEE: IEEE author dataframe with 'DOI' and 'Author Position'.
		ALEX_NEW: OpenAlex author dataframe with 'DOI' and 'Author Position'.
		PAPERS: list of DOIs; when a DOI occurs more than once, its first
			position is used.

	Returns:
		two dataframes, ieee_sorted, and alex_new_sorted, each with a
		fresh 0..n-1 index.

	Raises:
		ValueError: if a DOI in either frame is not in PAPERS.
	"""
	# Build the DOI -> position map once; calling PAPERS.index() per row
	# rescans the whole list every time.
	paper_index = {}
	for position, doi in enumerate(PAPERS):
		paper_index.setdefault(doi, position)

	def _position_of(doi):
		try:
			return paper_index[doi]
		except KeyError:
			# Same exception type list.index() raises for a missing item.
			raise ValueError(f'{doi!r} is not in list') from None

	IEEE['Paper Index'] = [_position_of(doi) for doi in IEEE.DOI.tolist()]
	ALEX_NEW['Paper Index'] = [_position_of(doi) for doi in ALEX_NEW.DOI.tolist()]
	IEEE = IEEE.add_prefix('IEEE ')
	alex_new_sorted = ALEX_NEW.sort_values(
		by=['Paper Index', 'Author Position'], ).reset_index(drop=True)
	ieee_sorted = IEEE.sort_values(
		by=['IEEE Paper Index', 'IEEE Author Position'], ).reset_index(drop=True)
	return ieee_sorted, alex_new_sorted

In [19]:
def get_concat_df(IEEE, ALEX, PAPERS): # ieee_sorted, alex_sorted
	"""Merge ieee and alex author rows paper by paper via fuzzy name matching.

	For every DOI in PAPERS, each alex 'Author Name' is mapped to its
	closest 'IEEE Author Name' within the same paper
	(difflib.get_close_matches with cutoff 0.6), and the two per-paper
	frames are inner-merged on that name.
	Check https://stackoverflow.com/a/13680953 for details.

	If the fuzzy matching fails for a paper (for example, an author has no
	match above the cutoff), the DOI is counted as a mismatch and the IEEE
	names are assigned to the alex rows by row order instead. This assumes
	both frames list a paper's authors in the same order.

	Args:
		IEEE: dataframe with 'IEEE DOI' and 'IEEE Author Name' columns.
		ALEX: dataframe with 'DOI' and 'Author Name' columns.
		PAPERS: iterable of DOIs to process; the output follows this order.

	Returns:
		A single dataframe with the per-paper merge results concatenated
		and a fresh index. The number of mismatched DOIs is printed.
	"""
	fuzzy_match_df_list = []
	mismatch_doi_list = []
	for doi in PAPERS:
		ieee_rows = IEEE[IEEE['IEEE DOI'] == doi]
		alex_rows = ALEX[ALEX['DOI'] == doi]
		ieee_names = ieee_rows['IEEE Author Name']
		try:
			# [0] raises IndexError when no IEEE name is close enough.
			matched_names = alex_rows['Author Name'].apply(
				lambda name: difflib.get_close_matches(
					name, ieee_names, cutoff=0.6)[0])
		except Exception:
			# Exception (not a bare except) so Ctrl-C still interrupts the loop.
			if len(ieee_rows) == len(alex_rows):
				# Assign by position: a Series would be aligned on index
				# labels, which differ between the two frames in general
				# and would leave NaN names that the inner merge drops.
				matched_names = ieee_names.to_numpy()
			else:
				# Row counts differ, so positional pairing is impossible;
				# keep the index-aligned assignment.
				matched_names = ieee_names
			mismatch_doi_list.append(doi)
		alex_rows = alex_rows.assign(**{'IEEE Author Name': matched_names})
		fuzzy_match_df_list.append(
			ieee_rows.merge(alex_rows, on='IEEE Author Name', how='inner'))
	print(f'in {len(mismatch_doi_list)} dois, fuzzy matching was not successful, so I assumed author position in merging')
	return pd.concat(fuzzy_match_df_list, ignore_index=True)

In [20]:
# DOIs whose 'Number of Authors' differs between the IEEE and OpenAlex tables.
diff_dois = get_diff_dois(ieee, alex)

In [21]:
# Replace the OpenAlex rows of those DOIs with the corresponding IEEE rows.
alex_new = get_alex_new(ieee, alex, diff_dois)

In [25]:
# Order both tables by each DOI's position in `papers`, then by author position.
# Note: this call also adds a 'Paper Index' column to `ieee` and `alex_new`.
ieee_sorted, alex_sorted = get_sorted_dfs(ieee, alex_new, papers)

In [26]:
# Pair IEEE and OpenAlex author rows per paper via fuzzy name matching.
concat_df = get_concat_df(ieee_sorted, alex_sorted, papers)

in 152 dois, fuzzy matching was not successful, so I assumed author position in merging


In [32]:
# Shape of the subset of rows that have no affiliation information in any of
# the three affiliation columns.
affiliation_columns = [
    'IEEE Author Affiliation',
    'First Institution Name',
    'Raw Affiliation String',
]
all_affiliations_missing = concat_df[affiliation_columns].isnull().all(axis=1)
concat_df[all_affiliations_missing].shape

(58, 27)