# Measuring Industry Presence in NLP Research

## Data Loading

In [2]:
import os
import pandas as pd
from pathlib import Path
import json
from tqdm.auto import tqdm
tqdm.pandas()

In [3]:
# Requires the raw S2 data to be in your Downloads folder
overwrite_cache = False
data_path = Path.home() / "Downloads"
output_path = "out"

In [4]:
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [5]:
def check_file_exists(filename):
    return os.path.isfile(filename) and os.path.exists(filename)

### Authors

In [6]:
def load_authors(source_data_path):
    authors_df = pd.json_normalize(
        pd.Series(open(source_data_path).readlines()).apply(json.loads)
    )
    authors_df['orcid'] = authors_df['externalids.ORCID']
    authors_df_filtered = authors_df[['authorid', 'name', 'aliases', 'papercount', 'citationcount', 'hindex', 'affiliations', 'homepage', 'orcid']]
    authors_df_filtered[['authorid', 'name', 'papercount', 'citationcount', 'hindex', 'homepage', 'orcid']]
    return authors_df_filtered

In [7]:
authors_output_path = os.path.join(output_path, 's2_authors.csv')
overwrite_cache = False

if check_file_exists(authors_output_path) and not overwrite_cache:
    authors_df = pd.read_csv(authors_output_path)
else:
    authors_df = load_authors(os.path.join(data_path, "authors.jsonl"))
    authors_df.to_csv(authors_output_path, index=False)

In [8]:
authors_df.keys()

Index(['authorid', 'name', 'aliases', 'papercount', 'citationcount', 'hindex',
       'affiliations', 'homepage', 'orcid'],
      dtype='object')

#### Map Authors to Affiliations

In [9]:
authors_to_affiliations = authors_df[['authorid', 'affiliations']].explode('affiliations').rename(columns={'affiliations': 'affiliation'})

In [10]:
authors_to_affiliations[~authors_to_affiliations['affiliation'].isna()]

Unnamed: 0,authorid,affiliation
9,145104037,"['Adam Mickiewicz University', 'Applica.ai']"
42,2539674,['Johns Hopkins University']
48,3440700,"['School of Computer Science, University of Ma..."
54,48424413,['Virginia Tech']
66,2699105,['Allen Institute for AI']
...,...,...
67882,1725420331,"['Minerva Schools at KGI', 'Brown University']"
67928,8352056,['University of Agder']
67956,147740244,['Ghent University']
67985,40185455,['Imperial College London']


In [11]:
authors_to_affiliations.to_csv(os.path.join(output_path, 's2_authors_to_affiliations.csv'), index=False)

#### Map Authors to Aliases

In [12]:
authors_to_aliases = authors_df[['authorid', 'aliases']].explode('aliases').rename(columns={'aliases': 'alias'})

In [13]:
authors_to_aliases.to_csv(os.path.join(output_path, 's2_authors_to_aliases.csv'), index=False)

### Papers

In [14]:
def load_papers(source_data_path):
    papers_df = pd.json_normalize(
        pd.Series(open(os.path.join(data_path, "papers.jsonl")).readlines()).apply(json.loads)
    )
    papers_df['aclid'] = papers_df['externalids.ACL']
    return papers_df

In [15]:
import ast
import numpy as np

def find_in_text(col_text, col_position):
    l = []
    for text, position in zip(col_text, col_position):
        if (
            position != "None"
            and text != "None"
            and position != None
            and text != None
            and not isinstance(position, float)
        ):
            sub = []
            position = ast.literal_eval(position)
            for p in position:
                sub.append(text[int(p["start"]) : int(p["end"])])
            l.append(sub)
        else:
            l.append(None)
    return l

In [16]:
import re

def cond(position, section):
    if (
        position != "None"
        and section != "None"
        and position != None
        and section != None
        and not isinstance(position, float)
    ):
        position = ast.literal_eval(position)
        indexes = [i for i, item in enumerate(section) if re.search("acknow", item, re.IGNORECASE)]
        return indexes, position
    return None, position

def get_start(col_sections, col_positions):
    l = []
    for section, position in zip(col_sections, col_positions):
        indexes, position = cond(position, section)
        if indexes:
            l.append(position[indexes[0]]['end'])
        else:
            l.append(None)
    return l

def get_end(col_sections, col_positions):
    l = []
    for section, position in zip(col_sections, col_positions):
        indexes, position = cond(position, section)
        if indexes:
            if len(position) > indexes[0] + 1:
                l.append(position[indexes[0] + 1]['start'])
            else:
                l.append(-1)
        else:
            l.append(None)
    return l

In [17]:
def get_affiliation_section(col_text, col_start, col_end):
    l = []
    for text, start, end in zip(col_text, col_start, col_end):
        if start != None and end != None:
            l.append(text[int(start):int(end)])
        else:
            l.append(None)
    return l

In [18]:
def preprocess_papers(papers_df):
    papers_df_acknowledgements_affiliations = papers_df[['corpusid', 'aclid', 'year', 'referencecount', 'citationcount', 'influentialcitationcount', 'venue', 'title', 'url', 's2fieldsofstudy', 'content.annotations.sectionheader', 'content.text', 'content.annotations.authoraffiliation']]
    papers_df_acknowledgements_affiliations['sections'] = find_in_text(papers_df_acknowledgements_affiliations['content.text'], papers_df_acknowledgements_affiliations['content.annotations.sectionheader'])
    papers_df_acknowledgements_affiliations['affiliations'] = find_in_text(papers_df_acknowledgements_affiliations['content.text'], papers_df_acknowledgements_affiliations['content.annotations.authoraffiliation'])
    papers_df_acknowledgements_affiliations['acknowledgements_section_start'] = get_start(papers_df_acknowledgements_affiliations['sections'], papers_df_acknowledgements_affiliations['content.annotations.sectionheader'])
    papers_df_acknowledgements_affiliations['acknowledgements_section_end'] = get_end(papers_df_acknowledgements_affiliations['sections'], papers_df_acknowledgements_affiliations['content.annotations.sectionheader'])
    papers_df_acknowledgements_affiliations['acknowledgements_section'] = get_affiliation_section(papers_df_acknowledgements_affiliations['content.text'], papers_df_acknowledgements_affiliations['acknowledgements_section_start'], papers_df_acknowledgements_affiliations['acknowledgements_section_end'])
    filtered = papers_df_acknowledgements_affiliations[['corpusid', 'aclid', 'year', 'referencecount', 'citationcount', 'influentialcitationcount', 'venue', 'title', 'url', 's2fieldsofstudy', 'acknowledgements_section', 'affiliations']]
    return filtered

#### Map Papers to Authors

In [19]:
def map_papers_to_authors(papers_df):
    papers_to_authors = papers_df[['corpusid', 'aclid', 'authors']].explode('authors')
    papers_to_authors['authorid'] = papers_to_authors['authors'].apply(lambda x: x and type(x)is dict and x['authorId'] or None)
    papers_to_authors = papers_to_authors[['corpusid', 'aclid', 'authorid']]
    return papers_to_authors

In [20]:
papers_output_path = os.path.join(output_path, 's2_papers.csv')
papers_to_authors_output_path = os.path.join(output_path, 's2_papers_to_authors.csv')
overwrite_cache = False

if check_file_exists(papers_output_path) and not overwrite_cache:
    papers_df = pd.read_csv(papers_output_path)
    papers_to_authors = pd.read_csv(papers_to_authors_output_path)
else:
    papers_df = load_papers(os.path.join(data_path, "papers.jsonl"))
    papers_to_authors = map_papers_to_authors(papers_df)
    papers_df = preprocess_papers(papers_df)
    papers_df.to_csv(papers_output_path, index=False)
    papers_to_authors.to_csv(papers_to_authors_output_path, index=False)

#### Map Venue Type (Workshop, Demo) to Paper

In [21]:
import bibtexparser
import re
import pandas as pd
from collections import defaultdict

if 'publicationtype' not in papers_df:

    types = ['workshop', 'demo', 'tutorial']

    with open('anthology.bib') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)

    acl_ids = []
    publications_types = []
    not_found = 0
    counts = defaultdict(int)

    query = "|".join(types)

    for i, el in enumerate(bib_database.entries):

        search_for = el['booktitle'] if 'booktitle' in el else el['title']
        search_for = ''.join(e for e in search_for if (e.isalnum() or e.isspace()))

        if 'booktitle' in el:
            match = re.search(query, search_for, re.IGNORECASE)
        elif 'title' in el:
            match = re.search(query, search_for, re.IGNORECASE)
        if not match:
            counts['main'] += 1
            publication_type = 'main'
        else:
            counts[match.group(0).lower()] += 1
            publication_type = match.group(0).lower()

        acl_id = el['url'].split('/')[-1]
        acl_ids.append(acl_id)
        publications_types.append(publication_type)

    paper_types_df = pd.DataFrame({'aclid': acl_ids, 'publicationtype': publications_types})
    papers = pd.read_csv(os.path.join(output_path, 's2_papers.csv'))
    paper_types_df = papers.merge(df, on='aclid', how='left')
    paper_types_df.to_csv(os.path.join(output_path, 's2_papers.csv'), index=False)

#### Company / University Mentions

In [22]:
from collections import Counter
from itertools import islice

orgs = pd.read_csv('big-tech-companies.csv')
universities = pd.read_csv(os.path.join('.', 'universities.csv'), header=None, names=['abbreviation', 'name', 'url'])

canonical_names = orgs['canonicalname'].tolist()

canonical_plus_aliases = orgs['canonicalname'] + ', ' + orgs['aliases'].fillna('')

name_map = dict(zip(orgs['canonicalname'], canonical_plus_aliases))
name_map = {v.strip().lower(): k for k, values in name_map.items() for v in values.split(',') if v is not None}
name_map

{'apple': 'Apple',
 '': 'Canon',
 'microsoft': 'Microsoft',
 'alphabet': 'Alphabet',
 'google': 'Alphabet',
 'amazon': 'Amazon',
 'tesla': 'Tesla',
 'meta platforms': 'Meta Platforms',
 'fair': 'Meta Platforms',
 'facebook': 'Meta Platforms',
 'tsmc': 'TSMC',
 'nvidia': 'NVIDIA',
 'tencent': 'Tencent',
 'samsung': 'Samsung',
 'alibaba': 'Alibaba',
 'oracle': 'Oracle',
 'broadcom': 'Broadcom',
 'asml': 'ASML',
 'cisco': 'Cisco',
 'salesforce': 'Salesforce',
 'adobe': 'Adobe',
 'texas instruments': 'Texas Instruments',
 'qualcomm': 'QUALCOMM',
 'netflix': 'Netflix',
 'ibm': 'IBM',
 'intuit': 'Intuit',
 'meituan': 'Meituan',
 'intel': 'Intel',
 'sap': 'SAP',
 'paypal': 'PayPal',
 'automatic data processing': 'Automatic Data Processing',
 'amd': 'AMD',
 'sony': 'Sony',
 'keyence': 'Keyence',
 'pinduoduo': 'Pinduoduo',
 'airbnb': 'Airbnb',
 'analog devices': 'Analog Devices',
 'servicenow': 'ServiceNow',
 'booking holdings': 'Booking Holdings',
 'booking': 'Booking Holdings',
 'booking.com'

In [23]:
list_of_universities = universities['name'].tolist()
company_names = [name for names in canonical_plus_aliases for name in names.split(', ') if len(name) > 0]
all_org_names = company_names + list_of_universities
all_org_names = [name.strip() for name in all_org_names]
all_org_names[95:]

['GlobalFoundries',
 'Keysight',
 'CoStar Group',
 'Constellation Software',
 'MediaTek',
 'The Trade Desk',
 'ON Semiconductor',
 'HP',
 'Nokia',
 'Datadog',
 'Dell',
 'Wolters Kluwer',
 'Veeva Systems',
 'Zoom',
 'SMIC',
 'Canon',
 'University of Andorra',
 'Abu Dhabi University',
 'Ajman University of Science & Technology',
 'Alain University of Science and Technology',
 'Al Ghurair University',
 'Alhosn University',
 'Al Khawarizmi International College',
 'American College Of Dubai',
 'American University in Dubai',
 'American University in the Emirates',
 'American University of Sharjah',
 'British University in Dubai',
 'Dubai Medical College for Girls',
 'Dubai Pharmacy College',
 'Etisalat University College',
 'Gulf Medical University',
 'Hamdan Bin Mohammed e-University',
 'Higher Colleges of Technology',
 'Ittihad University',
 'Jumeira University',
 'Khalifa University',
 'Khalifa University of Science, Technology and Research',
 'Masdar University Of Science And Technolog

In [24]:
import regex

with_word_boundary = [r'\b' + name + r'\b' for name in all_org_names]

def match_orgs(affiliation_or_acknowledgement):
    l = []
    if affiliation_or_acknowledgement:
        sub_l = []
        match = regex.findall(r"(?=(" + "|".join(with_word_boundary) + r")){e<=1}", str(affiliation_or_acknowledgement), regex.BESTMATCH)
        out = [item for el in match for item in el if len(item) > 0]
        # Check if any of the elements in out is a substring of any other element in out
        for i, el in enumerate(out):
            if any([el in other_el and not el == other_el for j, other_el in enumerate(out) if j != i]):
                continue
            sub_l.append(el)
        l.append(sub_l)
    else:
        return None
    # flatten list
    filtered_and_mapped = []
    for sublist in l:
        for item in sublist:
            if item.lower() in name_map:
                filtered_and_mapped.append(name_map[item.lower()])
            elif item in list_of_universities:
                filtered_and_mapped.append(item)
    # remove duplicates
    filtered_and_mapped = list(dict.fromkeys(filtered_and_mapped))
    return filtered_and_mapped

In [25]:
papers_df['mentions_affiliations'] = papers_df['affiliations'].progress_apply(match_orgs)

  0%|          | 0/73717 [00:00<?, ?it/s]

In [26]:
papers_to_affiliation_mentions = papers_df[['corpusid', 'aclid', 'mentions_affiliations']].explode('mentions_affiliations')
papers_to_affiliation_mentions['orgtype'] = np.where(papers_to_affiliation_mentions['mentions_affiliations'].isin(company_names) , "company", "university")
papers_to_affiliation_mentions['orgtype'] = np.where(papers_to_affiliation_mentions['mentions_affiliations'].isnull(), None, papers_to_affiliation_mentions['orgtype'])

In [27]:
papers_to_affiliation_mentions[papers_to_affiliation_mentions['orgtype'] == 'university']

Unnamed: 0,corpusid,aclid,mentions_affiliations,orgtype
9,398,C02-1133,Tokyo Institute of Technology,university
15,498,W96-0214,Harvard University,university
16,505,W96-0102,Tilburg University,university
27,774,W98-1114,University of Cambridge,university
32,949,1995.iwpt-1.27,University of Pennsylvania,university
...,...,...,...,...
72080,237217104,2004.tc-1.3,City University,university
72084,237434429,2021.cnl-1.10,Singapore Management University,university
72089,245838254,2021.paclic-1.34,National University,university
72090,245838322,2021.paclic-1.14,Institute of Science and Technology,university


In [28]:
papers_to_affiliation_mentions['hasorg'] = np.where(papers_to_affiliation_mentions['mentions_affiliations'].isnull(), 0, 1)

In [29]:
papers_to_affiliation_mentions.to_csv(os.path.join(output_path, 's2_papers_to_affiliations.csv'), index=False)

In [30]:
papers_to_affiliation_mentions.count()

corpusid                 78528
aclid                    78528
mentions_affiliations    24890
orgtype                  24890
hasorg                   78528
dtype: int64

In [31]:
papers_df['mentions_acknowledgements'] = papers_df['acknowledgements_section'].progress_apply(match_orgs)

  0%|          | 0/73717 [00:00<?, ?it/s]

In [32]:
papers_to_acknowledgements_mentions = papers_df[['corpusid', 'aclid', 'mentions_acknowledgements']].explode('mentions_acknowledgements')

In [33]:
papers_to_acknowledgements_mentions.count()

corpusid                     73914
aclid                        73914
mentions_acknowledgements      628
dtype: int64

In [34]:
papers_to_acknowledgements_mentions['hasorg'] = np.where(papers_to_acknowledgements_mentions['mentions_acknowledgements'].isnull(), 0, 1)

In [35]:
papers_to_acknowledgements_mentions.to_csv(os.path.join(output_path, 's2_papers_to_acknowledgements_mentions.csv'), index=False)

#### Map Papers to Countries

In [84]:
import pycountry
import geograpy

def match_countries(affiliation_or_acknowledgement):
    countries = []
    cities = []
    if affiliation_or_acknowledgement and affiliation_or_acknowledgement is not np.nan:
        places = geograpy.get_geoPlace_context(text=affiliation_or_acknowledgement)
        return places.countries, places.cities
    else:
        return None, None
    
papers_df['country'], papers_df['city'] = zip(*papers_df['affiliations'].progress_apply(match_countries))

  0%|          | 0/73717 [00:00<?, ?it/s]

In [85]:
papers_df.explode('country')[['corpusid', 'aclid', 'country']].to_csv(os.path.join(output_path, 's2_papers_to_countries.csv'), index=False)