## Aim

This notebook walks through the `get_merged_author_df` script step by step to check that each stage behaves as expected.

In [1]:
import sys
import pandas as pd
import re
import numpy as np
import csv
import difflib 

In [2]:
IEEE_AUTHOR = '../../data/interim/ieee_author_df.csv'
OPENALEX_AUTHOR = '../../data/interim/openalex_author_df.csv'
PAPERS_TO_STUDY = '../../data/processed/papers_to_study.txt'
VISPUBDATA = '../../data/raw/vispubdata.csv'

In [3]:
def get_dicts(VISPUBDATA):
    """Build DOI-keyed lookup tables from the VisPubData CSV.

    Args:
        VISPUBDATA: path (or file-like object) handed to ``pd.read_csv``;
            the table must provide "DOI", "Title" and "Year" columns.

    Returns:
        A ``(doi_year_dict, doi_title_dict)`` tuple mapping each DOI to its
        year and to its title. If a DOI occurs more than once, the last
        row wins.
    """
    table = pd.read_csv(VISPUBDATA)
    doi_column = table["DOI"].tolist()
    title_column = table["Title"].tolist()
    year_column = table["Year"].tolist()
    year_by_doi = {doi: year for doi, year in zip(doi_column, year_column)}
    title_by_doi = {doi: title for doi, title in zip(doi_column, title_column)}
    return year_by_doi, title_by_doi

In [4]:
# Load every input table used by the walkthrough below.
vispd = pd.read_csv(VISPUBDATA)
# DOI -> Year and DOI -> Title lookups; note that get_dicts reads the same
# VISPUBDATA file a second time.
doi_year_dict, doi_title_dict = get_dicts(VISPUBDATA)
ieee_orig = pd.read_csv(IEEE_AUTHOR)
alex = pd.read_csv(OPENALEX_AUTHOR)
# PAPERS_TO_STUDY has no header row; its first column is kept as a plain list.
# Later cells look DOIs up in this list, and the position gives the 'Paper Index'.
papers = pd.read_csv(PAPERS_TO_STUDY, header=None).iloc[:,0].tolist()

In [5]:
ieee_orig.shape

(12423, 9)

In [6]:
ieee_orig[ieee_orig['Author Affiliation'].isnull()].shape

(167, 9)

In [7]:
def update_ieee_orig(DF): # DF here is ieee_orig
    """Patch two known problems in the IEEE author table.

    * '10.1109/TVCG.2008.157' contains an additional author that shouldn't be
      there: the first row of that paper is dropped, and 'Number of Authors'
      and 'Author Position' of its remaining rows are decremented by one.
    * '10.1109/VIS.1999.10000' lacks author info: any existing rows for that
      DOI are removed and three manually collected author rows are appended.

    Args:
        DF: the IEEE author DataFrame (``ieee_orig``). It is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.

    Raises:
        IndexError: if DF has no row for '10.1109/TVCG.2008.157'.
    """
    extra_author_doi = '10.1109/TVCG.2008.157'
    missing_author_doi = '10.1109/VIS.1999.10000'

    DF = DF.drop(DF[DF.DOI == missing_author_doi].index)

    # The spurious author is the first row listed for this paper.
    row_to_drop = DF.index[DF.DOI == extra_author_doi].tolist()[0]
    df_dropped = DF.drop([row_to_drop])
    is_fixed_paper = df_dropped.DOI == extra_author_doi
    df_dropped.loc[is_fixed_paper, 'Number of Authors'] -= 1
    df_dropped.loc[is_fixed_paper, 'Author Position'] -= 1.0

    author_names = ['Daniel Cohen-Or', 'David Levin', 'Offir Remez']
    FILL_DATA = [
        {
            'Year': 1999,
            'DOI': missing_author_doi,
            'Title': 'Progressive Compression of Arbitrary Triangular Meshes',
            'Number of Authors': len(author_names),
            'Author Position': position,
            'Author Name': name,
            # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
            'Author ID': np.nan,
            'Author Affiliation': 'Tel Aviv University',
            'One Affiliation': True,
        }
        for position, name in enumerate(author_names, start=1)
    ]
    fill_data_df = pd.DataFrame(FILL_DATA)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([df_dropped, fill_data_df], ignore_index=True)

In [8]:
ieee = update_ieee_orig(ieee_orig)

In [9]:
ieee.shape

(12424, 9)

In [10]:
ieee.head(2)

Unnamed: 0,Year,DOI,Title,Number of Authors,Author Position,Author Name,Author ID,Author Affiliation,One Affiliation
0,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,1.0,Michael Bostock,https://ieeexplore.ieee.org/author/37591067400,"Computer Science Department, Stanford Universi...",True
1,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,2.0,Vadim Ogievetsky,https://ieeexplore.ieee.org/author/38016292400,"Computer Science Department, Stanford Universi...",True


In [11]:
ieee[ieee['Author Affiliation'].isnull()].shape

(165, 9)

In [12]:
ieee[ieee['Number of Authors'].isnull()].shape

(0, 9)

In [13]:
def get_diff_dois(IEEE, ALEX): # ieee, alex
    """Return the DOIs whose 'Number of Authors' differs between IEEE and OpenAlex.

    For each DOI in IEEE, the value on its first row in each frame is
    compared. A DOI that is absent from ALEX is reported as different
    (previously this raised IndexError).

    Args:
        IEEE: IEEE author DataFrame with 'DOI' and 'Number of Authors' columns.
        ALEX: OpenAlex author DataFrame with the same two columns.

    Returns:
        A list of DOIs, in order of first appearance in IEEE.
    """
    # One row per DOI, instead of filtering both frames once per DOI.
    ieee_counts = IEEE.drop_duplicates('DOI').set_index('DOI')['Number of Authors']
    alex_counts = ALEX.drop_duplicates('DOI').set_index('DOI')['Number of Authors']
    # Align OpenAlex to the IEEE DOIs; missing DOIs become NaN, which never
    # compares equal and is therefore flagged as a difference.
    alex_aligned = alex_counts.reindex(ieee_counts.index)
    differs = ieee_counts.to_numpy() != alex_aligned.to_numpy()
    return ieee_counts.index[differs].tolist()

In [14]:
diff_dois = get_diff_dois(ieee, alex)

In [15]:
diff_dois

['10.1109/TVCG.2006.173',
 '10.1109/VISUAL.1997.663909',
 '10.1109/VISUAL.1996.567598',
 '10.1109/VAST.2018.8802509',
 '10.1109/TVCG.2006.142',
 '10.1109/VAST.2008.4677359',
 '10.1109/TVCG.2006.187',
 '10.1109/VISUAL.2001.964489',
 '10.1109/INFVIS.2003.1249016',
 '10.1109/TVCG.2021.3059294',
 '10.1109/TVCG.2020.3030437',
 '10.1109/VAST.2007.4389006',
 '10.1109/VAST.2012.6400554',
 '10.1109/VAST.2014.7042494',
 '10.1109/VISUAL.1993.398857',
 '10.1109/TVCG.2006.193',
 '10.1109/TVCG.2007.70552',
 '10.1109/TVCG.2009.163',
 '10.1109/VISUAL.1998.745288']

In [16]:
def get_alex_new(IEEE, ALEX, DIFF_DOIS):
    """Replace OpenAlex rows of the given DOIs with the corresponding IEEE rows.

    For DOIs where alex is wrong in Number of Authors, the OpenAlex rows are
    dropped and the IEEE rows for the same DOIs are appended instead.

    Args:
        IEEE: IEEE author DataFrame.
        ALEX: OpenAlex author DataFrame.
        DIFF_DOIS: DOIs whose rows should be taken from IEEE.

    Returns:
        alex_new, a new DataFrame with a fresh 0..n-1 index. Appended rows
        hold NaN in every OpenAlex-only column.
    """
    # Only the first six IEEE columns are carried over (positional slice).
    # In the frame shown in this notebook these are Year, DOI, Title,
    # Number of Authors, Author Position and Author Name.
    df_to_append = IEEE[IEEE.DOI.isin(DIFF_DOIS)].iloc[:, 0:6]
    alex_dropped = ALEX.drop(ALEX[ALEX.DOI.isin(DIFF_DOIS)].index)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    alex_new = pd.concat([alex_dropped, df_to_append], ignore_index=True)
    return alex_new

In [17]:
alex_new = get_alex_new(ieee, alex, diff_dois)

In [18]:
alex_new.shape

(12424, 16)

In [19]:
def get_sorted_dfs(IEEE, ALEX_NEW, PAPERS):
    """Sort the IEEE and OpenAlex author frames by paper index and author position.

    A 'Paper Index' column (position of the row's DOI in PAPERS) is added to
    both input frames in place. The IEEE frame then gets an 'IEEE ' prefix on
    every column, and both frames are sorted by paper index and author
    position.

    Args:
        IEEE: IEEE author DataFrame with 'DOI' and 'Author Position' columns.
            Gains a 'Paper Index' column as a side effect.
        ALEX_NEW: OpenAlex author DataFrame with the same two columns.
            Gains a 'Paper Index' column as a side effect.
        PAPERS: list of DOIs; the first position of a DOI is its paper index.

    Returns:
        Two new DataFrames, ``(ieee_sorted, alex_new_sorted)``, each with a
        fresh 0..n-1 index.

    Raises:
        ValueError: if a DOI in either frame does not occur in PAPERS.
    """
    # Build the DOI -> position lookup once instead of scanning PAPERS for
    # every row. setdefault keeps the first occurrence, like list.index.
    position_by_doi = {}
    for position, doi in enumerate(PAPERS):
        position_by_doi.setdefault(doi, position)

    def _paper_indices(dois):
        """Map DOIs to their paper index, raising ValueError like list.index."""
        indices = []
        for doi in dois:
            if doi not in position_by_doi:
                raise ValueError(f'DOI {doi!r} is not in PAPERS')
            indices.append(position_by_doi[doi])
        return indices

    # The input frames are updated in place, so callers that read
    # 'Paper Index' from them afterwards keep working.
    IEEE['Paper Index'] = _paper_indices(IEEE.DOI.tolist())
    ALEX_NEW['Paper Index'] = _paper_indices(ALEX_NEW.DOI.tolist())

    IEEE = IEEE.add_prefix('IEEE ')
    alex_new_sorted = ALEX_NEW.sort_values(
        by=['Paper Index', 'Author Position']).reset_index(drop=True)
    ieee_sorted = IEEE.sort_values(
        by=['IEEE Paper Index', 'IEEE Author Position']).reset_index(drop=True)
    return ieee_sorted, alex_new_sorted

In [20]:
ieee_sorted, alex_sorted = get_sorted_dfs(ieee, alex_new, papers)

In [21]:
ieee_sorted.head(2)

Unnamed: 0,IEEE Year,IEEE DOI,IEEE Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,IEEE Author ID,IEEE Author Affiliation,IEEE One Affiliation,IEEE Paper Index
0,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,1.0,Michael Bostock,https://ieeexplore.ieee.org/author/37591067400,"Computer Science Department, Stanford Universi...",True,0
1,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,2.0,Vadim Ogievetsky,https://ieeexplore.ieee.org/author/38016292400,"Computer Science Department, Stanford Universi...",True,0


In [22]:
alex_sorted.head(2)

Unnamed: 0,Year,DOI,Title,Number of Authors,Author Name,Author Position,Author Position Type,OpenAlex Author ID,Author ORCID,Number of Affiliations,First Institution Name,Raw Affiliation String,First Institution ID,First Institution ROR,First Institution Type,First Institution Country Code,Paper Index
0,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,Michael Bostock,1.0,first,https://openalex.org/A2048345123,,1.0,Stanford University,"Computer Science Department, Stanford Universi...",https://openalex.org/I97018004,https://ror.org/00f54p054,education,US,0
1,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,Vadim Ogievetsky,2.0,middle,https://openalex.org/A2668634103,,1.0,Stanford University,"Computer Science Department, Stanford Universi...",https://openalex.org/I97018004,https://ror.org/00f54p054,education,US,0


In [23]:
def get_concat_df(IEEE, ALEX, PAPERS): # ieee_sorted, alex_sorted
    """Merge IEEE and OpenAlex author rows paper by paper via fuzzy name matching.

    For every DOI in PAPERS, each OpenAlex 'Author Name' is mapped to its
    closest 'IEEE Author Name' of the same paper (difflib, cutoff 0.6), and
    the two per-paper frames are inner-merged on that name. See
    https://stackoverflow.com/a/13680953 for the technique.

    If the fuzzy match fails for a paper, the IEEE names are assigned to the
    OpenAlex rows as-is. That assignment aligns on the row index, so it only
    pairs authors by position when both frames carry the same index labels
    for that paper. The number of such papers is printed.

    Args:
        IEEE: sorted IEEE frame with 'IEEE DOI' and 'IEEE Author Name' columns.
        ALEX: sorted OpenAlex frame with 'DOI' and 'Author Name' columns.
        PAPERS: iterable of DOIs to process, in output order.

    Returns:
        The concatenation of all per-paper merges, with a fresh index.
    """
    fuzzy_match_df_list = []
    mismatch_doi_list = []
    for doi in PAPERS:
        df1 = IEEE[IEEE['IEEE DOI'] == doi]
        df2 = ALEX[ALEX['DOI'] == doi]
        ieee_names = df1['IEEE Author Name']
        try:
            matched_names = df2['Author Name'].apply(
                lambda x: difflib.get_close_matches(
                    x, ieee_names, cutoff=0.6)[0])
        except (IndexError, TypeError):
            # IndexError: no IEEE name reached the cutoff for some author.
            # TypeError: a name is not a string (e.g. NaN).
            # Anything else (including KeyboardInterrupt) now propagates
            # instead of being silently treated as a mismatch.
            matched_names = ieee_names
            mismatch_doi_list.append(doi)
        df2 = df2.assign(**{'IEEE Author Name': matched_names})
        # NOTE(review): if two OpenAlex authors map to the same IEEE name, this
        # inner merge duplicates that IEEE row and drops the unmatched one.
        df = df1.merge(df2, on='IEEE Author Name', how='inner')
        fuzzy_match_df_list.append(df)
    print(f'in {len(mismatch_doi_list)} dois, fuzzy matching was not successful, so I assumed author position in merging')
    df = pd.concat(fuzzy_match_df_list, ignore_index=True)
    return df

In [24]:
concat_df = get_concat_df(ieee_sorted, alex_sorted, papers)

in 154 dois, fuzzy matching was not successful, so I assumed author position in merging


In [25]:
concat_df[concat_df['IEEE Author Affiliation'].isnull()].shape

(165, 27)

In [26]:
concat_df['IEEE Paper Index'].tolist() == concat_df['Paper Index'].tolist()

True

In [27]:
concat_df['IEEE Number of Authors'].tolist() == concat_df['Number of Authors'].tolist()

True

In [28]:
concat_df['IEEE Author Position'].tolist() == concat_df['Author Position'].tolist()

False

In [29]:
cols_to_see = ['IEEE Author Position', 'Author Position']
concat_df[concat_df['IEEE Author Position'] != concat_df['Author Position']].shape

(85, 27)

In [30]:
doi = '10.1109/TVCG.2011.185'
df1 = ieee_sorted[ieee_sorted['IEEE DOI'] == doi]
df2 = alex_sorted[alex_sorted['DOI'] == doi]

In [31]:
df1

Unnamed: 0,IEEE Year,IEEE DOI,IEEE Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,IEEE Author ID,IEEE Author Affiliation,IEEE One Affiliation,IEEE Paper Index
0,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,1.0,Michael Bostock,https://ieeexplore.ieee.org/author/37591067400,"Computer Science Department, Stanford Universi...",True,0
1,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,2.0,Vadim Ogievetsky,https://ieeexplore.ieee.org/author/38016292400,"Computer Science Department, Stanford Universi...",True,0
2,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,3.0,Jeffrey Heer,https://ieeexplore.ieee.org/author/37550791300,"Computer Science Department, Stanford Universi...",True,0


In [32]:
kwargs = {'IEEE Author Name': 
            df2['Author Name'].apply(
                lambda x: difflib.get_close_matches(
                    x, df1['IEEE Author Name'], cutoff=0.6)[0])
            }

In [33]:
df2 = df2.assign(**kwargs)

In [34]:
difflib.get_close_matches('good', ['goodnow', 'goodhow', 'goody'], cutoff=0.6)[0]

'goody'

In [35]:
concat_df.columns

Index(['IEEE Year', 'IEEE DOI', 'IEEE Title', 'IEEE Number of Authors',
       'IEEE Author Position', 'IEEE Author Name', 'IEEE Author ID',
       'IEEE Author Affiliation', 'IEEE One Affiliation', 'IEEE Paper Index',
       'Year', 'DOI', 'Title', 'Number of Authors', 'Author Name',
       'Author Position', 'Author Position Type', 'OpenAlex Author ID',
       'Author ORCID', 'Number of Affiliations', 'First Institution Name',
       'Raw Affiliation String', 'First Institution ID',
       'First Institution ROR', 'First Institution Type',
       'First Institution Country Code', 'Paper Index'],
      dtype='object')

In [36]:
# After the merging procedure above, check whether the author order within
# each paper changed.
# If 'IEEE Author Position' is no longer increasing within a paper, the
# original IEEE data was distorted by the merge; if it still is, the IEEE
# data is preserved. The same check is run on the OpenAlex positions.
ieee_author_position_is_monotonic_increasing = True
openalex_author_position_is_monotonic_increasing = True
# Visit each paper once via groupby, rather than re-filtering the whole
# frame for every row.
for paper_index, paper_df in concat_df.groupby('Paper Index'):
    if not paper_df['IEEE Author Position'].is_monotonic_increasing:
        ieee_author_position_is_monotonic_increasing = False
    if not paper_df['Author Position'].is_monotonic_increasing:
        openalex_author_position_is_monotonic_increasing = False

In [37]:
ieee_author_position_is_monotonic_increasing

True

In [38]:
openalex_author_position_is_monotonic_increasing

False

In [39]:
def flatten(t):
    """Convert a list of lists into one flat list of items.

    Only one level of nesting is removed; every element of ``t`` must be
    iterable.

    Source: https://stackoverflow.com/a/952952
    """
    flat_items = []
    for sublist in t:
        flat_items.extend(sublist)
    return flat_items

In [40]:
def update_with_vispubdata_author_data(VISPD, DF, PAPERS=None, DOI_YEAR_DICT=None, DOI_TITLE_DICT=None): # vispd, concat_df
    """Replace the author rows of four papers with author lists from VisPubData.

    For the four DOIs listed below, all rows are dropped from DF and rebuilt
    from the 'AuthorNames-Deduped' column of VISPD (names separated by ';').
    Only the DOI, year, title, author count, author position, author name and
    paper index columns are filled (both the 'IEEE ...' and the plain
    variants); every other column is NaN for the rebuilt rows.

    Names are looked up per DOI, so the row order of VISPD does not matter.

    Args:
        VISPD: VisPubData DataFrame with 'DOI' and 'AuthorNames-Deduped'.
        DF: the merged author DataFrame (``concat_df``). It is not modified.
        PAPERS: list of DOIs defining the paper index. Defaults to the
            notebook-level ``papers``.
        DOI_YEAR_DICT: DOI -> year mapping. Defaults to the notebook-level
            ``doi_year_dict``.
        DOI_TITLE_DICT: DOI -> title mapping. Defaults to the notebook-level
            ``doi_title_dict``.

    Returns:
        A new DataFrame sorted by 'IEEE Paper Index' and
        'IEEE Author Position', with a fresh index.

    Raises:
        ValueError: if one of the four DOIs is missing from VISPD or PAPERS,
            or if VISPD lists a different number of authors than expected.
    """
    # Fall back to the notebook-level globals for backward compatibility.
    if PAPERS is None:
        PAPERS = papers
    if DOI_YEAR_DICT is None:
        DOI_YEAR_DICT = doi_year_dict
    if DOI_TITLE_DICT is None:
        DOI_TITLE_DICT = doi_title_dict

    # DOIs whose IEEE author data is wrong, with the correct author count.
    correct_author_num_dict = {
        '10.1109/INFVIS.2005.1532150': 5,
        '10.1109/VISUAL.2005.1532819': 2,
        '10.1109/VISUAL.2005.1532794': 5,
        '10.1109/VISUAL.1992.235178': 4,
    }
    ieee_wrong = list(correct_author_num_dict)
    names_by_doi = dict(zip(
        VISPD['DOI'].tolist(), VISPD['AuthorNames-Deduped'].tolist()))

    records = []
    for doi in ieee_wrong:
        if doi not in names_by_doi:
            raise ValueError(f'DOI {doi!r} is missing from VISPD')
        author_names = names_by_doi[doi].split(';')
        author_num = correct_author_num_dict[doi]
        if len(author_names) != author_num:
            raise ValueError(
                f'expected {author_num} authors for {doi!r}, '
                f'VISPD lists {len(author_names)}')
        paper_index = PAPERS.index(doi)
        year = DOI_YEAR_DICT[doi]
        title = DOI_TITLE_DICT[doi]
        for position, name in enumerate(author_names, start=1):
            records.append({
                'IEEE DOI': doi,
                'DOI': doi,
                'IEEE Year': year,
                'Year': year,
                'IEEE Title': title,
                'Title': title,
                'IEEE Number of Authors': author_num,
                'IEEE Author Position': position,
                'IEEE Author Name': name,
                'Number of Authors': author_num,
                'Author Position': position,
                'Author Name': name,
                'IEEE Paper Index': paper_index,
                'Paper Index': paper_index,
            })
    DF_TO_FILL = pd.DataFrame(records)

    df_dropped = DF.drop(DF[DF['IEEE DOI'].isin(ieee_wrong)].index)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df_new = pd.concat([df_dropped, DF_TO_FILL], ignore_index=True)
    df_new = df_new.sort_values(
        by=['IEEE Paper Index', 'IEEE Author Position']).reset_index(drop=True)
    return df_new

In [41]:
concat_df = update_with_vispubdata_author_data(vispd, concat_df)

In [42]:
concat_df.head()

Unnamed: 0,IEEE Year,IEEE DOI,IEEE Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,IEEE Author ID,IEEE Author Affiliation,IEEE One Affiliation,IEEE Paper Index,...,OpenAlex Author ID,Author ORCID,Number of Affiliations,First Institution Name,Raw Affiliation String,First Institution ID,First Institution ROR,First Institution Type,First Institution Country Code,Paper Index
0,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,1.0,Michael Bostock,https://ieeexplore.ieee.org/author/37591067400,"Computer Science Department, Stanford Universi...",True,0,...,https://openalex.org/A2048345123,,1.0,Stanford University,"Computer Science Department, Stanford Universi...",https://openalex.org/I97018004,https://ror.org/00f54p054,education,US,0
1,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,2.0,Vadim Ogievetsky,https://ieeexplore.ieee.org/author/38016292400,"Computer Science Department, Stanford Universi...",True,0,...,https://openalex.org/A2668634103,,1.0,Stanford University,"Computer Science Department, Stanford Universi...",https://openalex.org/I97018004,https://ror.org/00f54p054,education,US,0
2,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,3.0,Jeffrey Heer,https://ieeexplore.ieee.org/author/37550791300,"Computer Science Department, Stanford Universi...",True,0,...,https://openalex.org/A2112690490,https://orcid.org/0000-0002-6175-1655,1.0,Stanford University,"Computer Science Department, Stanford Universi...",https://openalex.org/I97018004,https://ror.org/00f54p054,education,US,0
3,1991,10.1109/VISUAL.1991.175815,Tree-maps: a space-filling approach to the vis...,2.0,1.0,B. Johnson,https://ieeexplore.ieee.org/author/37381975300,Department of Computer Science & Human-Compute...,True,1,...,https://openalex.org/A2167794833,,1.0,"University of Maryland, College Park","Dept. of Comput Sci., Maryland Univ., College ...",https://openalex.org/I66946132,https://ror.org/047s2c258,education,US,1
4,1991,10.1109/VISUAL.1991.175815,Tree-maps: a space-filling approach to the vis...,2.0,2.0,B. Shneiderman,https://ieeexplore.ieee.org/author/37283016400,Department of Computer Science & Human-Compute...,True,1,...,https://openalex.org/A668764113,https://orcid.org/0000-0002-8298-1097,1.0,"University of Maryland, College Park","Dept. of Comput Sci., Maryland Univ., College ...",https://openalex.org/I66946132,https://ror.org/047s2c258,education,US,1


In [43]:
# The IEEE Xplore portion of the merged author dataset is missing affiliation information for 181 authors

concat_df[concat_df['IEEE Author Affiliation'].isnull()].shape

(181, 27)

In [44]:
def update_country_code(DF, DOI, NEW_DATA):
    """Write hand-curated country codes for every row of one paper.

    Sets 'First Institution Country Code By Hand' on the rows whose 'DOI'
    equals DOI. DF is modified in place and also returned. NEW_DATA is
    either a scalar or a sequence with one entry per matching row.

    An earlier version also copied 'IEEE Author Name' into 'Author Name';
    that step is disabled.
    """
    rows_for_paper = DF['DOI'] == DOI
    DF.loc[rows_for_paper, 'First Institution Country Code By Hand'] = NEW_DATA
    return DF

def update_country_code_by_raw_string(DF, RAW_STRING, NEW_DATA):
    """Write hand-curated country codes for rows with a given raw affiliation.

    Sets 'First Institution Country Code By Hand' on the rows whose
    'Raw Affiliation String' equals RAW_STRING. DF is modified in place and
    also returned.
    """
    rows_with_string = DF['Raw Affiliation String'] == RAW_STRING
    DF.loc[rows_with_string, 'First Institution Country Code By Hand'] = NEW_DATA
    return DF

def update_type(DF, DOI, NEW_DATA):
    """Write hand-curated institution types for every row of one paper.

    Sets 'First Institution Type By Hand' on the rows whose 'DOI' equals
    DOI. DF is modified in place and also returned.
    """
    rows_for_paper = DF['DOI'] == DOI
    DF.loc[rows_for_paper, 'First Institution Type By Hand'] = NEW_DATA
    return DF

def update_type_by_raw_string(DF, RAW_STRING, NEW_DATA):
    """Write hand-curated institution types for rows with a given raw affiliation.

    Sets 'First Institution Type By Hand' on the rows whose
    'Raw Affiliation String' equals RAW_STRING. DF is modified in place and
    also returned.

    An earlier version also copied 'IEEE Author Name' into 'Author Name' for
    those rows; that step is disabled.
    """
    rows_with_string = DF['Raw Affiliation String'] == RAW_STRING
    DF.loc[rows_with_string, 'First Institution Type By Hand'] = NEW_DATA
    return DF

def update_affiliations(DF, DOI, NEW_DATA):
    """Overwrite all three affiliation columns for every row of one paper.

    The same NEW_DATA is written to 'IEEE Author Affiliation',
    'First Institution Name' and 'Raw Affiliation String' on the rows whose
    'DOI' equals DOI. DF is modified in place and also returned.
    """
    rows_for_paper = DF['DOI'] == DOI
    affiliation_columns = (
        'IEEE Author Affiliation',
        'First Institution Name',
        'Raw Affiliation String',
    )
    for column in affiliation_columns:
        DF.loc[rows_for_paper, column] = NEW_DATA
    return DF

def update_author_name(DF, DOI, NEW_DATA):
    """Overwrite the IEEE author names for every row of one paper.

    Sets 'IEEE Author Name' on the rows whose 'DOI' equals DOI. DF is
    modified in place and also returned. The OpenAlex 'Author Name' column
    is not touched (an earlier version overwrote it as well; that step is
    disabled).
    """
    rows_for_paper = DF['DOI'] == DOI
    DF.loc[rows_for_paper, 'IEEE Author Name'] = NEW_DATA
    return DF

In [45]:
def update_concat_df(DF): # DF here is concat_df
    """Update data for specific DOIs

    Return:
        still concat_df, but updated
    """
    # '10.1109/VISUAL.1996.568115',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1996.568115',
        ['US']*3,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1996.568115',
        ['company']*2 + ['education'],
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1996.568115',
        ['MRJ, Inc']*2 + ['NASA Ames Research Center']
    )
    # '10.1109/VISUAL.2000.885735'
    update_country_code(
        DF, 
        '10.1109/VISUAL.2000.885735',
        np.repeat('NL', 6),
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2000.885735',
        ['government']*2 + ['education']*4,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2000.885735',
        np.append(
            np.repeat(
                'Center for Mathematics and Computer Science, CWI, Amsterdam, Netherlands', 2),
            np.repeat(
                'Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands', 4)
            )
    )
    # '10.1109/VISUAL.1996.568143',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1996.568143',
        ['US']*6,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1996.568143',
        ['education']*6,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1996.568143',
        ['Ohio State University, Columbus, OH, USA']*6
    )
    # '10.1109/VISUAL.1999.809936',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1999.809936',
        ['US']*3,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1999.809936',
        ['education']*3,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1999.809936',
        ['Worcester Polytechnic Institute, Worcester, MA, USA']*3
    )
    # '10.1109/INFVIS.2002.1173147',
    # IEEE Xplore got author name wrong
    update_country_code(
        DF, 
        '10.1109/INFVIS.2002.1173147',
        ['SE', 'US', 'SE'],
    )
    update_type(
        DF, 
        '10.1109/INFVIS.2002.1173147',
        ['education']*3,
    )
    update_affiliations(
        DF, 
        '10.1109/INFVIS.2002.1173147',
        [
            'Dept. of Information Science, Uppsala University, Uppsala, Sweden',
            'Dept. of Psychology, Indiana University, Bloomington, Indiana, USA',
            'Dept. of Information Science, Uppsala University, Uppsala, Sweden',
        ]
    )
    update_author_name(
        DF, 
        '10.1109/INFVIS.2002.1173147',
        ['M. Lind', 'G.P. Bingham', 'C. Forsell'],
    )
    # '10.1109/VISUAL.1992.235175',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1992.235175',
        ['US']*12,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1992.235175',
        ['company']*3 + ['government']*2 + ['education']*6 + ['company']*1
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1992.235175',
        [
            'Unisys Corporation',
            'Sterling Software',
            'Unisys Corporation',
            'U.S. Environmental Protection Agency, United States',
            'U.S. Environmental Protection Agency',
            'University of Alabama in Huntsville (UAH), United States',
            'Florida State University, United States',
            'Florida State University, United States',
            'University of Wisconsin, Madison, WI, United States',
            'University of Wisconsin, Madison, WI, United States',
            'University of Wisconsin, Madison, WI, United States',
            'IBM T.J. Watson Research Center, United States',
        ]
    )
    # '10.1109/TVCG.2006.182',
    update_country_code(
        DF, 
        '10.1109/TVCG.2006.182',
        ['US']*5,
    )
    update_type(
        DF, 
        '10.1109/TVCG.2006.182',
        ['company']*1 + ['education']*4 
    )
    update_affiliations(
        DF, 
        '10.1109/TVCG.2006.182',
        ['Brown University, United States']*5,
    )
    # '10.1109/TVCG.2015.2467971',
    update_country_code(
        DF, 
        '10.1109/TVCG.2015.2467971',
        ['US']*5,
    )
    update_type(
        DF, 
        '10.1109/TVCG.2015.2467971',
        ['education']*5, 
    )
    update_affiliations(
        DF, 
        '10.1109/TVCG.2015.2467971',
        ['University of North Carolina at Charlotte, NC, United States']*5,
    )
    # '10.1109/SciVis.2015.7429489', 
    # author affilitions listed on ieee are all WRONG!!!
    # I found the authors' correct affilition on their ieee author id pages
    update_country_code(
        DF, 
        '10.1109/SciVis.2015.7429489',
        ['DE']*5,
    )
    update_type(
        DF, 
        '10.1109/SciVis.2015.7429489',
        ['education']*5, 
    )
    update_affiliations(
        DF, 
        '10.1109/SciVis.2015.7429489',
        ['Technical University of Munich, Germany']*5,
    )
    # '10.1109/VISUAL.2005.1532821',
    update_country_code(
        DF, 
        '10.1109/VISUAL.2005.1532821',
        ['AT', 'HR', 'AT', 'AT', 'US'],
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2005.1532821',
        ['company']*4 + ['education']*1 
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2005.1532821',
        ['VRVis Research Center Vienna, Austria'] + ['AVL-AST Zagreb, Croatia'] + [
        'VRVis Research Center Vienna, Austria']*2 + ['Virginia Tech']
    )
    # '10.1109/VISUAL.2000.885692',
    update_country_code(
        DF, 
        '10.1109/VISUAL.2000.885692',
        ['US']*6,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2000.885692',
        ['education']*6,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2000.885692',
        ['University of Utah, Salt Lake City, UT, USA']*4 + ['Vanderbilt University, USA'] + [
          'University of Utah, Salt Lake City, UT, USA'],
    )
    # '10.1109/VISUAL.1999.809912',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1999.809912',
        ['DE']*4,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1999.809912',
        ['education']*2 + ['healthcare']*2,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1999.809912',
        ['WSUGRIS, University of Tubingen, Tubingen, Germany']*2 + [
         'Department of Neuroradiology, University Hospital Tubingen, Tubingen, Germany']*2 ,
    )
    # '10.1109/VISUAL.1999.809929',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1999.809929',
        ['US']*4,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1999.809929',
        ['company']*4,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1999.809929',
        ['IBM T.J. Watson Research Center, United States']*3 + [
         'UBS Group AG'] ,
    )
    # '10.1109/VISUAL.1999.809884',
    # In this paper, openalex got country wrong
    update_country_code(
        DF, 
        '10.1109/VISUAL.1999.809884',
        ['DE']*5,
    )
    # '10.1109/VISUAL.1999.809920',
    # openalex got country wrong
    update_country_code(
        DF, 
        '10.1109/VISUAL.1999.809920',
        ['DE']*5,
    )
    # '10.1109/VISUAL.1993.398911',
    # openalex got this paper country wrong for the last two authors
    update_country_code(
        DF, 
        '10.1109/VISUAL.1993.398911',
        ['RU']*4 + ['DE']*2,
    )
    # '10.1109/VISUAL.2005.1532816',
    # ieee xplore got author positions and author affiliations wrong
    update_author_name(
        DF, 
        '10.1109/VISUAL.2005.1532816',
        [
            'Gregor Schlosser',
            'J ̈urgen Hesser',
            'Frank Zeilfelder',
            'Christian Rossl',
            'Reinhard Manner',
            'Gunther Nurnberger',
            'Hans-Peter Seidel',
        ],
    )
    update_country_code(
        DF, 
        '10.1109/VISUAL.2005.1532816',
        ['DE']*7,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2005.1532816',
        ['education']*3 + ['nonprofit']*1 + ['education']*2 + ['nonprofit']*1,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2005.1532816',
        ['ICM, Universitäten Mannheim und Heidelberg, Mannheim, Germany']*2 +
        ['Institut für Mathematik, Universität Mannheim, Mannheim, Germany'] +
        ['Max Planck Institut für Informatik, Saarbruecken, Germany'] +
        ['ICM, Universitäten Mannheim und Heidelberg, Mannheim, Germany'] +
        ['Institut für Mathematik, Universität Mannheim, Mannheim, Germany'] +
        ['Max Planck Institut für Informatik, Saarbruecken, Germany'],
    )
    # '10.1109/VAST.2016.7883507',
    # This is the paper where i don't have ieee author affilition or openalex raw string,
    # but i have openalex first institution name.
    # Another note: Information on IEEE about the first two authors of this paper is WRONG!
    update_country_code(
        DF, 
        '10.1109/VAST.2016.7883507',
        ['DE']*5,
    )
    update_type(
        DF, 
        '10.1109/VAST.2016.7883507',
        ['education']*5,
    )
    update_affiliations(
        DF, 
        '10.1109/VAST.2016.7883507',
        ['University of Stuttgart, Germany']*5
    )
    # '10.1109/VISUAL.2004.38',
    update_country_code(
        DF, 
        '10.1109/VISUAL.2004.38',
        ['CN']*1 + ['US']*3,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2004.38',
        ['education']*3 + ['company']*1,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2004.38',
        ['Zhejiang University, China'] + ['Carnegie Mellon University, United States'] + [
            'Massachusetts Institute Of Technology, United States'] + [
                'Mitsubishi Electric Research Laboratories, United States']
    )
    """The following are cases where i have raw string, but not type or country code"""
    # '10.1109/TVCG.2006.195',
    update_country_code(
        DF, 
        '10.1109/TVCG.2006.195',
        ['NL']*3
    )
    update_type(
        DF, 
        '10.1109/TVCG.2006.195',
        ['education']*2 + ['government']*1,
    )
    update_affiliations(
        DF, 
        '10.1109/TVCG.2006.195',
        ['Swammerdam Institute for Life Sciences (SILS), University of Amsterdam, Netherlands']*2 + [
            'Center for Mathematics and Computer Science (CWI), Netherlands'
        ]*1
    )
    # '10.1109/VISUAL.1996.567752',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1996.567752',
        ['US']*3
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1996.567752',
        ['company']*3
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1996.567752',
        ['GE Corporate Research & Development, United States']*3,
    )
    # '10.1109/VISUAL.1999.809907',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1999.809907',
        ['NL']*2
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1999.809907',
        ['government']*2
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1999.809907',
        ['Center for Mathematics and Computer Science (CWI), Netherlands']*2,
    )
    # '10.1109/VISUAL.2004.88',
    update_country_code(
        DF, 
        '10.1109/VISUAL.2004.88',
        ['DE']*2
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2004.88',
        ['nonprofit'] + ['education']
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2004.88',
        ['Caesar Research Center, Bonn, Germany'] + [
        'Interdisciplinary Center for Scientific Computing, Heidelberg, Germany'],
    )
    # '10.1109/VISUAL.2004.113',
    update_type_by_raw_string(
        DF,
        'DLR Goettingen',
        ['government']
    )
    update_country_code_by_raw_string(
        DF,
        'DLR Goettingen',
        'DE'
    )
    # '10.1109/VISUAL.2000.885722',
    update_type_by_raw_string(
        DF,
        'ETH Zentrum, CH - 8092 Switzerland',
        'education'
    )
    update_country_code_by_raw_string(
        DF,
        'ETH Zentrum, CH - 8092 Switzerland',
        'CH'
    )
    # '10.1109/VISUAL.2000.885715',
    update_country_code(
        DF, 
        '10.1109/VISUAL.2000.885715',
        ['DE']*3 + ['NL'] + ['DE'] + ['NL']
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2000.885715',
        ['education']*6,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2000.885715',
        ['University of Bonn, Bonn, Germany'] * 3 + ['Eindhoven University of Technology'] + [
            'University of Bonn, Bonn, Germany'] + ['Eindhoven University of Technology']
    )
    # '10.1109/VISUAL.2000.885731',
    update_country_code(
        DF, 
        '10.1109/VISUAL.2000.885731',
        ['US']*6,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2000.885731',
        ['education']*6,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2000.885731',
        ['Brown University, United States']*6,
    )
    # '10.1109/VISUAL.1996.568133',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1996.568133',
        ['US']*7,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1996.568133',
        ['healthcare'] + ['education'] + ['facility']*2 + ['healthcare'] + ['education']*2,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1996.568133',
        ['National Jewish Center for Immunology and Respiratory Medicine, United States'] + [
        'University of New Mexico, United States'] + [
        'Sandia National Laboratories, United States']*2 + [
        'National Jewish Center for Immunology and Respiratory Medicine, United States'] + [
        'State University of New York at Stony Brook, United States'] + [
        'University of New Mexico, United States']
    )
    # '10.1109/VISUAL.2005.1532808',
    update_country_code(
        DF, 
        '10.1109/VISUAL.2005.1532808',
        ['DE'],
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2005.1532808',
        ['education'],
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2005.1532808',
        ['University of Stuttgart']
    )
    # '10.1109/VISUAL.1998.745350',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1998.745350',
        ['US']*6,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1998.745350',
        ['facility']*6,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1998.745350',
        ['Naval Reseach Lab, Washington, D.C.']*6
    )
    # '10.1109/VISUAL.2005.1532776',
    update_country_code(
        DF, 
        '10.1109/VISUAL.2005.1532776',
        ['US']*7,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2005.1532776',
        ['company']*3 + ['facility']*2 + ['company']*2,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2005.1532776',
        ['Kitware, United States']*3 + [
        'Sandia National Laboratories, United States']*2 + [
        'Simmetrix, United States']*2,
    )
    # '10.1109/VISUAL.1996.568150',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1996.568150',
        ['NL']*4,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1996.568150',
        ['nonprofit'] + ['government']*2 + ['education']
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1996.568150',
        ['Netherlands Energy Research Foundation, Netherlands'] + [
        'Centre for Mathematics and Computer Science (CWI), Netherlands']*2 + [
        'Vrije Universiteit, Netherlands']
    )
    # '10.1109/VISUAL.1990.146398',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1990.146398',
        ['US']*4,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1990.146398',
        ['government'] + ['company']*3 
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1990.146398',
        ['NASA Ames Research Center, Moffett Field, CA, USA'] + [
        'Sterling Software, United States'] + [
            'Crossfield Marketing, United States'] + [
            'Crystal River Engineering, Inc., Groveland, CA, USA']
    ) 
    # '10.1109/VISUAL.1996.568120',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1996.568120',
        ['US']*3,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1996.568120',
        ['education']*3 
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1996.568120',
        ['University of Illinois at Chicago, United States'] + [
        'University of Chicago, United States'] + [
            'University of Illinois at Chicago, United States']
    ) 
    """BELOW ARE WHERE I FILL AUTHOR DATA FOR ROWS WHERE DATA WAS FROM VISPUBDATA RAW"""
    # '10.1109/INFVIS.2005.1532150',
    update_country_code(
        DF, 
        '10.1109/INFVIS.2005.1532150',
        ['US']*5,
    )
    update_type(
        DF, 
        '10.1109/INFVIS.2005.1532150',
        ['education']*5,
    )
    update_affiliations(
        DF, 
        '10.1109/INFVIS.2005.1532150',
        ['Stanford University, United States']*5,
    ) 
    # '10.1109/VISUAL.2005.1532819',
    update_country_code(
        DF, 
        '10.1109/VISUAL.2005.1532819',
        ['CA']*2,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2005.1532819',
        ['education']*2,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2005.1532819',
        ['University of Alberta, Canada']*2,
    ) 
    # '10.1109/VISUAL.2005.1532794',
    update_country_code(
        DF, 
        '10.1109/VISUAL.2005.1532794',
        ['US']*5,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.2005.1532794',
        ['facility'] + ['education']*3 + ['facility'],
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.2005.1532794',
        ['Oak Ridge National Lab, United States'] + [
            'The University of Tennessee, United States']*3 + [
            'Oak Ridge National Lab, United States'],
    ) 
    # '10.1109/VISUAL.1992.235178',
    update_country_code(
        DF, 
        '10.1109/VISUAL.1992.235178',
        ['US']*4,
    )
    update_type(
        DF, 
        '10.1109/VISUAL.1992.235178',
        ['education']*4,
    )
    update_affiliations(
        DF, 
        '10.1109/VISUAL.1992.235178',
        ['University of Utah, United States']*4,
    ) 
    ## IEEE Website updates the name of Sehi LYi but this update is 
    ## different from the name shown on PDF. I changed it back. 
    # '10.1109/TVCG.2021.3114876',
    update_author_name(
        DF, 
        '10.1109/TVCG.2021.3114876', 
        ["Sehi L'Yi", 'Qianwen Wang', 'Fritz Lekschas', 'Nils Gehlenborg'],
    )
    ## I found the in this paper, Some authors' affiliations contain two institutions
    update_country_code(
        DF, 
        '10.1109/TVCG.2011.207',
        ['DE']*4,
    )
    update_type(
        DF, 
        '10.1109/TVCG.2011.207',
        ['company'] + ['education']*1 + ['company']*2,
    )
    update_affiliations(
        DF, 
        '10.1109/TVCG.2011.207',
        ['Fraunhofer MEVIS, Germany'] + [
            'Center of Complex Systems and Visualization (CeVis), University of Bremen, Germany']*1 + [
            'Fraunhofer MEVIS, Germany']*2,
    ) 
    ## I found that in this paper, the first author has two affiliations
    update_country_code(
        DF, 
        '10.1109/INFVIS.2004.1',
        ['FR']*3,
    )
    update_type(
        DF, 
        '10.1109/INFVIS.2004.1',
        ['education']*1 + ['nonprofit']*1 + ['education']*1
    )
    update_affiliations(
        DF, 
        '10.1109/INFVIS.2004.1',
        ['ecole des mines de nantes nantes france'] + ['INRIA']*1 + ['ecole des mines de nantes nantes france'],
    ) 

    return DF

In [46]:
# Apply the hand-made country-code / type / affiliation corrections from update_concat_df
# and rebind the returned frame to the same name.
concat_df = update_concat_df(concat_df)

In [47]:
# Peek at the first two rows after the corrections above.
concat_df.head(2)

Unnamed: 0,IEEE Year,IEEE DOI,IEEE Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,IEEE Author ID,IEEE Author Affiliation,IEEE One Affiliation,IEEE Paper Index,...,Number of Affiliations,First Institution Name,Raw Affiliation String,First Institution ID,First Institution ROR,First Institution Type,First Institution Country Code,Paper Index,First Institution Country Code By Hand,First Institution Type By Hand
0,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,1.0,Michael Bostock,https://ieeexplore.ieee.org/author/37591067400,"Computer Science Department, Stanford Universi...",True,0,...,1.0,Stanford University,"Computer Science Department, Stanford Universi...",https://openalex.org/I97018004,https://ror.org/00f54p054,education,US,0,,
1,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,2.0,Vadim Ogievetsky,https://ieeexplore.ieee.org/author/38016292400,"Computer Science Department, Stanford Universi...",True,0,...,1.0,Stanford University,"Computer Science Department, Stanford Universi...",https://openalex.org/I97018004,https://ror.org/00f54p054,education,US,0,,


In [48]:
# Shape of the subset of rows whose 'First Institution Type By Hand' is non-null.
concat_df[concat_df['First Institution Type By Hand'].notnull()].shape

(162, 29)

In [49]:
# Shape of the subset of rows whose 'First Institution Country Code By Hand' is non-null.
concat_df[concat_df['First Institution Country Code By Hand'].notnull()].shape

(178, 29)

In [50]:
# Shape of the subset of rows whose 'IEEE Author Affiliation' is null.
concat_df[concat_df['IEEE Author Affiliation'].isnull()].shape

(86, 29)

In [148]:
# Raise the pandas display limits so the whole subset and the full-length
# affiliation strings are rendered without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)

# Columns used when inspecting authors that lack an IEEE affiliation.
cols_to_show = [
    'IEEE Year',
    'IEEE DOI',
    'IEEE Title',
    'IEEE Number of Authors',
    'IEEE Author Position',
    'IEEE Author Name',
    'First Institution Name',
    'Raw Affiliation String',
    'First Institution Type',
    'First Institution Country Code',
    'Paper Index',
]

# Rows with a null 'IEEE Author Affiliation', restricted to the columns above.
concat_df.loc[concat_df['IEEE Author Affiliation'].isnull(), cols_to_show]
## (86,11)

Unnamed: 0,IEEE Year,IEEE DOI,IEEE Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,First Institution Name,Raw Affiliation String,First Institution Type,First Institution Country Code,Paper Index
11,1997,10.1109/VISUAL.1997.663860,ROAMing terrain: Real-time Optimally Adapting Meshes,6.0,4.0,M.C. Miller,Lawrence Livermore National Laboratory,Lawrence livermore National Laboratory,facility,US,4
94,1999,10.1109/VISUAL.1999.809883,Multi-projector displays using camera-based registration,8.0,7.0,B. Scales,University of North Carolina at Chapel Hill,"[Dept. of Computer Science, University of North Carolina, Chapel Hill]",education,US,31
248,1999,10.1109/VISUAL.1999.809908,Multiresolution Techniques for Interactive Texture-based Volume Visualization,3.0,3.0,K.I. Joy,,,,,82
280,1996,10.1109/VISUAL.1996.568113,Generation of Transfer Functions with Stochastic Search Technique,4.0,4.0,H. Pfister,,,,,93
286,1999,10.1109/VISUAL.1999.809894,Tensorlines: Advection-Diffusion based Propagation through Diffusion Tensor Fields,3.0,3.0,E. Lundberg,University of Utah,"Center for Scientific Computing and Imaging, Department of Computer Science, University of Utah",education,US,95
292,1996,10.1109/VISUAL.1996.568124,Mesh reduction with error control,3.0,3.0,W. Strasser,University of Tübingen,"Wilhelm-Schickard-Institut, GRIS, Universität Tübingen, Germany#TAB#",education,DE,97
384,1997,10.1109/INFVIS.1997.636786,Nonlinear magnification fields,2.0,2.0,E.L. Robertson,Indiana University,Indiana University*,education,US,126
457,1996,10.1109/INFVIS.1996.559210,Visage: a user interface environment for exploring information,8.0,7.0,A.J. Kolojechick,,,,,146
460,1999,10.1109/VISUAL.1999.809896,"The ""Parallel Vectors"" operator-a vector field visualization primitive",2.0,2.0,M. Roth,,,,,147
543,1998,10.1109/VISUAL.1998.745302,TOPIC ISLANDS TM - a wavelet-based text visualization system,4.0,2.0,P.C. Wong,,,,,175


I've checked all the rows where IEEE is missing author info but OpenAlex has it:

<!-- - 10.1109/VISUAL.1997.663848, R. Machiraju, Mississippi State University, Mississippi, United States -->
- 10.1109/VISUAL.2004.128, E. Parkinson, VA Tech Hydro, company, CH
- 10.1109/INFVIS.1999.801864, change the author name J. Sean -> Jeffrey Senn, for both IEEE and OpenAlex
- 10.1109/TVCG.2019.2934260, Andrew J. Solis, OpenAlex, change to UT Austin

In [149]:
def manual_update(DF, DOI, AUTHOR_NAME, COL_TO_CHANGE, TEXT):
    """Overwrite one column for the rows of a given paper/author pair, in place.

    Rows are selected where DF['DOI'] equals DOI and DF['IEEE Author Name']
    equals AUTHOR_NAME; COL_TO_CHANGE is set to TEXT on those rows. If no row
    matches, DF is left unchanged. Nothing is returned.
    """
    same_paper = DF['DOI'] == DOI
    same_author = DF['IEEE Author Name'] == AUTHOR_NAME
    DF.loc[same_paper & same_author, COL_TO_CHANGE] = TEXT

In [150]:
def manual_update_concat_df(DF): # DF here is concat_df
    """Apply a fixed list of hand-checked, per-author corrections to DF in place.

    Every correction is applied through ``manual_update``, which selects rows by
    DF['DOI'] and DF['IEEE Author Name'] and overwrites a single column.

    Because rows are matched on 'IEEE Author Name', a correction that renames
    that column must be the LAST correction for its author: once the name has
    been changed, the old name no longer matches any row. The rename of
    'J. Sean' therefore updates 'Author Name' first and 'IEEE Author Name' second.

    Args:
        DF: concat_df; modified in place.

    Returns:
        None.
    """
    # (DOI, IEEE author name used for matching, column to overwrite, new value).
    # The list is applied top to bottom; order matters for renames (see docstring).
    corrections = [
        (
            '10.1109/VISUAL.1997.663848',
            'R. Machiraju',
            'Raw Affiliation String',
            'Mississippi State University, Mississippi, United States',
        ),
        (
            '10.1109/VISUAL.2004.128',
            'E. Parkinson',
            'Raw Affiliation String',
            'VA Tech Hydro Corporation, Switzerland',
        ),
        (
            '10.1109/VISUAL.2004.128',
            'E. Parkinson',
            'First Institution Type',
            'company',
        ),
        (
            '10.1109/VISUAL.2004.128',
            'E. Parkinson',
            'First Institution Country Code',
            'CH',
        ),
        # Rename: the non-key column first ...
        (
            '10.1109/INFVIS.1999.801864',
            'J. Sean',
            'Author Name',
            'Jeffrey Senn',
        ),
        # ... and the column used for matching last.
        (
            '10.1109/INFVIS.1999.801864',
            'J. Sean',
            'IEEE Author Name',
            'Jeffrey Senn',
        ),
        (
            '10.1109/TVCG.2019.2934260',
            'Andrew J. Solis',
            'Raw Affiliation String',
            'University of Texas Austin, Texas, United States',
        ),
        (
            '10.1109/TVCG.2019.2934260',
            'Andrew J. Solis',
            'First Institution Name',
            'University of Texas Austin',
        ),
    ]
    for doi, author_name, column, value in corrections:
        manual_update(DF, doi, author_name, column, value)

In [151]:
# Apply the per-author corrections; concat_df is edited in place and nothing is returned.
manual_update_concat_df(concat_df)

In [152]:
# Re-display the rows with a null 'IEEE Author Affiliation' after the manual corrections.
concat_df[concat_df['IEEE Author Affiliation'].isnull()][cols_to_show]

Unnamed: 0,IEEE Year,IEEE DOI,IEEE Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,First Institution Name,Raw Affiliation String,First Institution Type,First Institution Country Code,Paper Index
11,1997,10.1109/VISUAL.1997.663860,ROAMing terrain: Real-time Optimally Adapting Meshes,6.0,4.0,M.C. Miller,Lawrence Livermore National Laboratory,Lawrence livermore National Laboratory,facility,US,4
94,1999,10.1109/VISUAL.1999.809883,Multi-projector displays using camera-based registration,8.0,7.0,B. Scales,University of North Carolina at Chapel Hill,"[Dept. of Computer Science, University of North Carolina, Chapel Hill]",education,US,31
248,1999,10.1109/VISUAL.1999.809908,Multiresolution Techniques for Interactive Texture-based Volume Visualization,3.0,3.0,K.I. Joy,,,,,82
280,1996,10.1109/VISUAL.1996.568113,Generation of Transfer Functions with Stochastic Search Technique,4.0,4.0,H. Pfister,,,,,93
286,1999,10.1109/VISUAL.1999.809894,Tensorlines: Advection-Diffusion based Propagation through Diffusion Tensor Fields,3.0,3.0,E. Lundberg,University of Utah,"Center for Scientific Computing and Imaging, Department of Computer Science, University of Utah",education,US,95
292,1996,10.1109/VISUAL.1996.568124,Mesh reduction with error control,3.0,3.0,W. Strasser,University of Tübingen,"Wilhelm-Schickard-Institut, GRIS, Universität Tübingen, Germany#TAB#",education,DE,97
384,1997,10.1109/INFVIS.1997.636786,Nonlinear magnification fields,2.0,2.0,E.L. Robertson,Indiana University,Indiana University*,education,US,126
457,1996,10.1109/INFVIS.1996.559210,Visage: a user interface environment for exploring information,8.0,7.0,A.J. Kolojechick,,,,,146
460,1999,10.1109/VISUAL.1999.809896,"The ""Parallel Vectors"" operator-a vector field visualization primitive",2.0,2.0,M. Roth,,,,,147
543,1998,10.1109/VISUAL.1998.745302,TOPIC ISLANDS TM - a wavelet-based text visualization system,4.0,2.0,P.C. Wong,,,,,175


In [153]:
# Note: I didn't do this in the script; there I created a new variable called "IEEE Author Affiliation Filled"
# for rows where IEEE is missing the author affiliation but OpenAlex has it.


def fill_ieee_with_openalex(DF):
    """Fill missing IEEE affiliations with the OpenAlex raw string, in place.

    For every row where 'IEEE Author Affiliation' is null and
    'Raw Affiliation String' is not, the raw string is copied into
    'IEEE Author Affiliation'. All other rows are left untouched.
    Nothing is returned.
    """
    ieee_missing = DF['IEEE Author Affiliation'].isnull()
    openalex_present = DF['Raw Affiliation String'].notnull()
    fillable = ieee_missing & openalex_present
    DF.loc[fillable, 'IEEE Author Affiliation'] = DF.loc[fillable, 'Raw Affiliation String']

In [154]:
# Copy the OpenAlex raw affiliation string into empty IEEE affiliation cells (in place).
fill_ieee_with_openalex(concat_df)

In [155]:
# Rows whose 'IEEE Author Affiliation' is still null after the fill above,
# i.e. rows that have no 'Raw Affiliation String' to copy from either.
concat_df[concat_df['IEEE Author Affiliation'].isnull()][cols_to_show]

Unnamed: 0,IEEE Year,IEEE DOI,IEEE Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,First Institution Name,Raw Affiliation String,First Institution Type,First Institution Country Code,Paper Index
248,1999,10.1109/VISUAL.1999.809908,Multiresolution Techniques for Interactive Texture-based Volume Visualization,3.0,3.0,K.I. Joy,,,,,82
280,1996,10.1109/VISUAL.1996.568113,Generation of Transfer Functions with Stochastic Search Technique,4.0,4.0,H. Pfister,,,,,93
457,1996,10.1109/INFVIS.1996.559210,Visage: a user interface environment for exploring information,8.0,7.0,A.J. Kolojechick,,,,,146
460,1999,10.1109/VISUAL.1999.809896,"The ""Parallel Vectors"" operator-a vector field visualization primitive",2.0,2.0,M. Roth,,,,,147
543,1998,10.1109/VISUAL.1998.745302,TOPIC ISLANDS TM - a wavelet-based text visualization system,4.0,2.0,P.C. Wong,,,,,175
545,1998,10.1109/VISUAL.1998.745302,TOPIC ISLANDS TM - a wavelet-based text visualization system,4.0,4.0,H. Foote,,,,,175
1217,1999,10.1109/VISUAL.1999.809889,Enabling Classification and Shading for 3D Texture Mapping based Volume Rendering using OpenGL and Extensions,3.0,3.0,W. Strasser,,,,,378
2791,1998,10.1109/VISUAL.1998.745337,Interactive virtual angioscopy,4.0,4.0,M. Tuveri,,,,,821
3907,1999,10.1109/VISUAL.1999.809924,Real-time visualization of scalably large collections of heterogeneous objects,5.0,4.0,N. Fanst,,,,,1129
4410,2007,10.1109/TVCG.2007.70557,Generalized Streak Lines: Analysis and Visualization of Boundary Induced Vortices,5.0,4.0,Heike Janicke,,,,,1276


In [156]:
def get_concat_df_filled(DF): # DF here is concat_df
    """Fill in affiliations by hand for authors that have none from any source.

    A row counts as "without affiliation" when 'IEEE Author Affiliation',
    'Raw Affiliation String' and 'First Institution Name' are all null.
    Those rows are split off from DF, their 'IEEE Author Affiliation' is looked
    up in ``fill_dict`` by 'IEEE Author Name', and they are then concatenated
    back with the remaining rows. DF itself is not modified.

    ``fill_dict`` is keyed by author name only, so an author who lacks an
    affiliation on several papers receives the same string on each of them.

    Args:
        DF: concat_df.

    Returns:
        concat_df_filled: a new DataFrame with the same number of rows as DF,
        sorted by 'IEEE Paper Index' and 'IEEE Author Position' and carrying a
        fresh 0..n-1 index. Every row that had no affiliation in any of the
        three columns now has 'IEEE Author Affiliation' filled.

    Raises:
        KeyError: if a row without affiliation belongs to an author that has
            no entry in ``fill_dict``.
    """
    fill_dict = {
    'K.I. Joy': 'University of California, Davis, United States',
    'H. Pfister': 'Department of Computer Science, State University of New York at Stony Brook, United States',
    'A.J. Kolojechick': 'Carnegie Mellon University, School of Computer Science, Pittsburgh, United States',
    'M. Roth': 'Computer Graphics Research Group, Deptartment of Computer Science, ETH Zurich, Switzerland',
    'P.C. Wong': 'Pacific Northwest National Laboratory, United States',
    'H. Foote': 'Pacific Northwest National Laboratory, United States',
    'W. Strasser': 'Computer Graphics Lab, University of Tubingen, Germany',
    'M. Tuveri': 'Center for Advanced Studies, Research and Development in Sardinia, Cagliari, Italy',
    'N. Fanst': 'Georgia Institute of Technology, United States',
    'Heike Janicke': 'Image and Signal Processing Group at the Universität Leipzig, Germany',
    'A. Vilanova': 'Institute of Computer Graphics, Vienna University of Technology, Austria',
    'P. Thiansathaporn': 'Department of Physics & Astronomy, University of North Carolina, Chapel Hill, United States',
    'B. Hegedust': 'Institute of Computer Graphics, Vienna University of Technology, Austria',
    'W.C. Flowers': 'Massachusetts Institute of Technology, United States',
    'G. Turk': 'GVU Center, College of Computing, Georgia Institute of Technology, United States',
    'P. Ermest': 'Philips Medical Systems, Best, Netherlands',
    'T. Moller': 'Department Of Computer And Information Science, The Ohio State University, Columbus, Ohio, United States',
    'K. Fostiropoulos': 'German National Research Centre for Information Technology, Germany',
    'F. Sobieczky': 'University of Göttingen, Germany',
    'W. Bertelheimer': 'Bayerische Motoren Werke AG (BMW) Corporation, Germany',
    }
    # One mask, reused for both halves of the split.
    no_affiliation = (
        DF['IEEE Author Affiliation'].isnull()
        & DF['Raw Affiliation String'].isnull()
        & DF['First Institution Name'].isnull()
    )
    to_fill_df = DF[no_affiliation]
    concat_df_dropped = DF.drop(DF.index[no_affiliation])
    # drop() works on index labels, so this sanity check fails if labels are duplicated.
    if concat_df_dropped.shape[0] + to_fill_df.shape[0] == DF.shape[0]:
        print('concat_df_dropped has correct row numbers')
    else:
        print('concat_df_dropped has incorrect row numbers')

    names = to_fill_df['IEEE Author Name']
    # Report every author without a hand-collected affiliation at once.
    unknown_names = [name for name in names.unique() if name not in fill_dict]
    if unknown_names:
        raise KeyError(
            'no manual affiliation in fill_dict for: {}'.format(unknown_names))
    # A plain list assigns by position, independent of the index labels.
    to_fill_df = to_fill_df.assign(
        **{'IEEE Author Affiliation': [fill_dict[name] for name in names]})

    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    concat_df_filled = (
        pd.concat([concat_df_dropped, to_fill_df], ignore_index=True)
        .sort_values(by=['IEEE Paper Index', 'IEEE Author Position'])
        .reset_index(drop=True)
    )
    return concat_df_filled

In [157]:
# Build a new frame in which authors without any affiliation get the hand-collected one.
concat_df_filled = get_concat_df_filled(concat_df)

concat_df_dropped has correct row numbers


In [158]:
# Now every author has affiliation information: no row has a null 'IEEE Author Affiliation'.
concat_df_filled[concat_df_filled['IEEE Author Affiliation'].isnull()].shape

(0, 29)

In [159]:
# Inspect the first ten rows of the filled frame.
concat_df_filled.head(10)

Unnamed: 0,IEEE Year,IEEE DOI,IEEE Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,IEEE Author ID,IEEE Author Affiliation,IEEE One Affiliation,IEEE Paper Index,...,Number of Affiliations,First Institution Name,Raw Affiliation String,First Institution ID,First Institution ROR,First Institution Type,First Institution Country Code,Paper Index,First Institution Country Code By Hand,First Institution Type By Hand
0,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,1.0,Michael Bostock,https://ieeexplore.ieee.org/author/37591067400,"Computer Science Department, Stanford University, Stanford, CA, USA",True,0,...,1.0,Stanford University,"Computer Science Department, Stanford University, Stanford, CA, USA#TAB#",https://openalex.org/I97018004,https://ror.org/00f54p054,education,US,0,,
1,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,2.0,Vadim Ogievetsky,https://ieeexplore.ieee.org/author/38016292400,"Computer Science Department, Stanford University, Stanford, CA, USA",True,0,...,1.0,Stanford University,"Computer Science Department, Stanford University, Stanford, CA, USA#TAB#",https://openalex.org/I97018004,https://ror.org/00f54p054,education,US,0,,
2,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,3.0,Jeffrey Heer,https://ieeexplore.ieee.org/author/37550791300,"Computer Science Department, Stanford University, Stanford, CA, USA",True,0,...,1.0,Stanford University,"Computer Science Department, Stanford University, Stanford, CA, USA#TAB#",https://openalex.org/I97018004,https://ror.org/00f54p054,education,US,0,,
3,1991,10.1109/VISUAL.1991.175815,Tree-maps: a space-filling approach to the visualization of hierarchical information structures,2.0,1.0,B. Johnson,https://ieeexplore.ieee.org/author/37381975300,"Department of Computer Science & Human-Computer Interaction Laboratory, University of Maryland, College Park, MD, USA",True,1,...,1.0,"University of Maryland, College Park","Dept. of Comput Sci., Maryland Univ., College Park, MD, USA",https://openalex.org/I66946132,https://ror.org/047s2c258,education,US,1,,
4,1991,10.1109/VISUAL.1991.175815,Tree-maps: a space-filling approach to the visualization of hierarchical information structures,2.0,2.0,B. Shneiderman,https://ieeexplore.ieee.org/author/37283016400,"Department of Computer Science & Human-Computer Interaction Laboratory, University of Maryland, College Park, MD, USA",True,1,...,1.0,"University of Maryland, College Park","Dept. of Comput Sci., Maryland Univ., College Park, MD, USA",https://openalex.org/I66946132,https://ror.org/047s2c258,education,US,1,,
5,1990,10.1109/VISUAL.1990.146402,Parallel coordinates: a tool for visualizing multi-dimensional geometry,2.0,1.0,A. Inselberg,https://ieeexplore.ieee.org/author/37294162600,"IBM Scientific Center, Los Angeles, CA, USA",False,2,...,1.0,IBM,"IBM Sci. Center, Los Angeles, CA, USA",https://openalex.org/I1341412227,https://ror.org/05hh8d621,company,US,2,,
6,1990,10.1109/VISUAL.1990.146402,Parallel coordinates: a tool for visualizing multi-dimensional geometry,2.0,2.0,B. Dimsdale,https://ieeexplore.ieee.org/author/37426169800,"IBM Scientific Center, Los Angeles, CA, USA",True,2,...,1.0,IBM,"IBM Sci. Center, Los Angeles, CA, USA",https://openalex.org/I1341412227,https://ror.org/05hh8d621,company,US,2,,
7,2006,10.1109/TVCG.2006.147,Hierarchical Edge Bundles: Visualization of Adjacency Relations in Hierarchical Data,1.0,1.0,Danny Holten,https://ieeexplore.ieee.org/author/37827881300,"Technische Universiteit Eindhoven, Netherlands",True,3,...,1.0,Eindhoven University of Technology,Technische Univ. Eindhoven,https://openalex.org/I83019370,https://ror.org/02c2kyt77,education,NL,3,,
8,1997,10.1109/VISUAL.1997.663860,ROAMing terrain: Real-time Optimally Adapting Meshes,6.0,1.0,M. Duchaineau,https://ieeexplore.ieee.org/author/37267813100,"Los Alamos National Laboratory, USA",False,4,...,1.0,Lawrence Livermore National Laboratory,Los Alamos National Laboratory and Lawrence Livermore National Laboratory#TAB#,https://openalex.org/I1282311441,https://ror.org/041nk4h53,facility,US,4,,
9,1997,10.1109/VISUAL.1997.663860,ROAMing terrain: Real-time Optimally Adapting Meshes,6.0,2.0,M. Wolinsky,https://ieeexplore.ieee.org/author/37443252200,"Los Alamos National Laboratory, USA",True,4,...,1.0,Los Alamos National Laboratory,Los Alamos national Laboratory,https://openalex.org/I1343871089,https://ror.org/01e41cf67,facility,US,4,,
