# Setup

In [1]:
#import ujson as json
import gzip
import pandas as pd
from collections import Counter
import string
import io
from multiprocessing import Pool
import numpy as np
from collections import defaultdict
from bs4 import BeautifulSoup
import requests
from collections import defaultdict 
import re
import langid
import pickle
from collections import defaultdict
import json
import sys
from glob import glob
# For field matching
import field_matching
STOPWORDS = set(['and','not','of','the'])

%matplotlib inline
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

In [2]:
tr = dict((ord(x), None) for x in string.punctuation)

def lookup(myjson, k):
  # return myjson[k]
  if '.' in k:
    # jpath path
    ks = k.split('.')
    v = myjson
    for k in ks: 
        if not v:
            return ""
        v = v.get(k,{})
    return v or ""
  return myjson.get(k,"")


def get_terms(text, term_set =None):
    spl = re.split("[ /+,]",text.lower())
    t = set([x.translate(tr) for x in spl])
    if term_set:
        return t & term_set
    return t

# Load in Bio, Affiliation data

# Code used in paper

For our paper, we used a direct sync of ORCID's database through AWS. This proved to be a pain. We are leaving the code here for replicability, but below we also provide code to work with the public ORCID json data dumps, which in general are much easier to work with

In [3]:
DATA_DIR = "../../data/raw/"

In [4]:
all_pickled_fils = glob(DATA_DIR + "*.pkl")

In [5]:
# Load in Bios from 01_...ipynb
bios = []
affiliations = []
for i,fil in enumerate(all_pickled_fils):
    if i % 50 == 0:
        print(i)
    try:
        f = open(fil,"rb")
        p = pickle.load(f)
        f.close()
    except:
        print(fil)
        p = {}
    for oid, data in p.items():
        for af in data['affiliations']:
            af['oid'] = oid
            affiliations.append(af)
        data['bio']['oid'] = oid
        bios.append(data['bio'])

0


In [6]:
# # Create bio dataframe
bio_df = pd.DataFrame(bios)

In [7]:
bio_df.shape

(64743, 5)

In [8]:
# Create the DF 
df = pd.DataFrame(affiliations)

# merge affiliations with bios
df = pd.merge(df,bio_df, on="oid")

In [9]:
bio_df.shape, bio_df.oid.nunique()

((64743, 5), 64743)

In [10]:
bio_df.head()

Unnamed: 0,oid,person_created_prof_date,person_first_name,person_last_mod_date,person_last_name
0,0000-0002-1583-3000,,,,
1,0000-0003-1566-6000,2016-04-15T23:42:33.132Z,Alessandra,2016-06-06T15:29:36.952Z,Bianchi
2,0000-0002-7866-6000,2017-09-05T13:42:27.076Z,Владимир,2017-09-05T13:42:27.312Z,Заболотный
3,0000-0002-0280-5000,2018-07-25T10:14:26.828Z,Artur,2018-07-25T10:39:25.097Z,Gawlik
4,0000-0002-9379-4000,2016-04-15T23:20:28.475Z,Frank,2016-06-06T15:29:36.952Z,Vernon


In [11]:
df.head()

Unnamed: 0,dept,disamb_org,oid,org_country,org_name,role,start_month,start_year,type,person_created_prof_date,person_first_name,person_last_mod_date,person_last_name
0,Mathematics,60242 RINGGOLD,0000-0003-1566-6000,IT,,Ricercatore,5,2012,employment,2016-04-15T23:42:33.132Z,Alessandra,2016-06-06T15:29:36.952Z,Bianchi
1,Mathematics,60242 RINGGOLD,0000-0003-1566-6000,IT,,Ricercatore,5,2012,employment,2016-04-15T23:42:33.132Z,Alessandra,2016-06-06T15:29:36.952Z,Bianchi
2,Mathematics,60242 RINGGOLD,0000-0003-1566-6000,IT,,Ricercatore,5,2012,employment,2016-04-15T23:42:33.132Z,Alessandra,2016-06-06T15:29:36.952Z,Bianchi
3,,28423 RINGGOLD,0000-0003-1566-6000,DE,,Post-doctoral fellow,1,2007,employment,2016-04-15T23:42:33.132Z,Alessandra,2016-06-06T15:29:36.952Z,Bianchi
4,Faculty of Mechanical Engimeering,grid.22555.35 GRID,0000-0002-0280-5000,PL,,"Phd, Eng.",10,1999,employment,2018-07-25T10:14:26.828Z,Artur,2018-07-25T10:39:25.097Z,Gawlik


# Code for working with the raw ORCID data

First, run the script ```parse_raw_orcid_dump.py``` on the raw ORCID data. It may need to be modified slightly for updated versions of the dump, as again, we did not use this code for our final analyses.  Then, run the code below

In [None]:
# Load in Bios from 01_...ipynb
bios = []
affils = []
for i, z in enumerate(gzip.open("/home/kjoseph/branching_pipeline/data/affiliations.json.gz","rb")):
    l = json.loads(z.decode("utf8"))
    for a in l['affiliation']:
        #print a
        df = {"dept": a['department-name'],
              "type" : a['type'],
              "year" : lookup(a,'start-date.year.value'),
              "month" : lookup(a,'start-date.month.value'),
              "org_country" : lookup(a,'organization.address.country'),
              "org_name" :  lookup(a,'organization.name'),
              "role" :  lookup(a,'role-title'),
              'uid' : i
             }
        affils.append(df)
    l['bio'].append(i)
    l['bio'].append(l['id'])
    bios.append(l['bio'])
    if i % 200000 == 0:
        print(i)

In [None]:
# # Create bio dataframe
bio_df = pd.DataFrame(bios)
bio_df.columns = ['person_first_name','person_last_name','bio', 'urls', 'country', 'keywords','uid','oid']

In [None]:
# Create the DF 
df = pd.DataFrame(affils)

# merge affiliations with bios
df = pd.merge(df,bio_df, on="uid")

# Affiliations per person

We probably want to toss people who have a crazy number of affiliations

In [22]:
# How many affiliations per person
affiliations_per_person = df.groupby("oid").size()


In [23]:
affl_percentage_cum = (affiliations_per_person.value_counts()/float(len(affiliations_per_person))).cumsum()

In [24]:
affiliations_per_person[affiliations_per_person > 1].shape

(13573,)

In [25]:
# Cumsum of affiliations per person - we can toss ppl with > 12 affiliations and lose only .05% of the data
affl_percentage_cum.head(15)

#what are these 2 affiliations?

1     0.320194
2     0.618752
3     0.753080
4     0.849594
5     0.903436
6     0.940599
7     0.963488
8     0.977061
9     0.984724
10    0.990534
11    0.993539
12    0.995693
13    0.997345
14    0.998497
15    0.998898
dtype: float64

In [26]:
affiliations_per_person[ affiliations_per_person > 15][:10]

oid
0000-0001-5573-5002    17
0000-0001-6168-300X    24
0000-0001-6318-4001    23
0000-0001-6563-4002    26
0000-0001-8542-9006    16
0000-0001-8581-1002    16
0000-0001-8781-9008    16
0000-0002-0936-2003    23
0000-0002-1938-5006    23
0000-0002-2228-0002    18
dtype: int64

In [27]:
# Remove ppl with > 12 affiliations

df = df[df.oid.isin(affiliations_per_person[affiliations_per_person < 13].index.values)]
print(df.shape)
# How many ppl. do we have a this point?
df.oid.nunique()

(52976, 13)


19880

# Role Categorization

Using a series of regular expressions, we categorize what role a given affiliation represents.  
We determine a role by the highest matching role we identify (using the regular expressions, 
in the order of roles in the code below).  If there is no match, we will give the affiliation
a blank role

## Clean Role text

In [28]:
clean_role = df.role.apply(lambda x : x.strip().replace(".","").lower() if x else None)
df = df.assign(clean_role=clean_role)
roles = df.clean_role.value_counts()

In [29]:
df

Unnamed: 0,dept,disamb_org,oid,org_country,org_name,role,start_month,start_year,type,person_created_prof_date,person_first_name,person_last_mod_date,person_last_name,clean_role
0,Mathematics,60242 RINGGOLD,0000-0003-1566-6000,IT,,Ricercatore,05,2012,employment,2016-04-15T23:42:33.132Z,Alessandra,2016-06-06T15:29:36.952Z,Bianchi,ricercatore
1,Mathematics,60242 RINGGOLD,0000-0003-1566-6000,IT,,Ricercatore,05,2012,employment,2016-04-15T23:42:33.132Z,Alessandra,2016-06-06T15:29:36.952Z,Bianchi,ricercatore
2,Mathematics,60242 RINGGOLD,0000-0003-1566-6000,IT,,Ricercatore,05,2012,employment,2016-04-15T23:42:33.132Z,Alessandra,2016-06-06T15:29:36.952Z,Bianchi,ricercatore
3,,28423 RINGGOLD,0000-0003-1566-6000,DE,,Post-doctoral fellow,01,2007,employment,2016-04-15T23:42:33.132Z,Alessandra,2016-06-06T15:29:36.952Z,Bianchi,post-doctoral fellow
4,Faculty of Mechanical Engimeering,grid.22555.35 GRID,0000-0002-0280-5000,PL,,"Phd, Eng.",10,1999,employment,2018-07-25T10:14:26.828Z,Artur,2018-07-25T10:39:25.097Z,Gawlik,"phd, eng"
5,Faculty of Mechanical Engineering,grid.22555.35 GRID,0000-0002-0280-5000,PL,,MSc,10,1994,education,2018-07-25T10:14:26.828Z,Artur,2018-07-25T10:39:25.097Z,Gawlik,msc
6,Scripps Institution of Oceanography,8784 RINGGOLD,0000-0002-9379-4000,US,,Research Geophysicist,07,2001,employment,2016-04-15T23:20:28.475Z,Frank,2016-06-06T15:29:36.952Z,Vernon,research geophysicist
7,Scripps Institution of Oceanography,8784 RINGGOLD,0000-0002-9379-4000,US,,Associate Research Geophysicist,07,1996,employment,2016-04-15T23:20:28.475Z,Frank,2016-06-06T15:29:36.952Z,Vernon,associate research geophysicist
8,Scripps Institution of Oceanography,8784 RINGGOLD,0000-0002-9379-4000,US,,Assistant Research Geophysicist,07,1990,employment,2016-04-15T23:20:28.475Z,Frank,2016-06-06T15:29:36.952Z,Vernon,assistant research geophysicist
9,Scripps Institution of Oceanography,8784 RINGGOLD,0000-0002-9379-4000,US,,Postgraduate Research Geophysicist,07,1989,employment,2016-04-15T23:20:28.475Z,Frank,2016-06-06T15:29:36.952Z,Vernon,postgraduate research geophysicist


## Find roles

In [30]:
regexes = [
 ["research", re.compile("research|scientist|scholar|ricercatore")],
 ["bachelors", re.compile("(\b|^|\s)((b[ \.]?(a|s|e|se|sc|s\.e|sn)(\b|$|\s|\.))|bachelor|btech|underg|licenciatura|graduação)")],
 ["masters/postgrad", re.compile("(\b|^|\s)((m[\. ]?(a|s|p|sc|as)(\b|$|\s|\.))|master|^me$|mlis|mba|mbbs|postgrad|m[ \.]?tech|mphil|mph|mestrado|magister|mbchb|meng|mlis)")],
 ["phd",re.compile("^doctor(ate)?$|^d[\.]?r[\.]?$|^m[\.]?d[\.]?$|ph[ \.]?d(\b|$|\s|\.)|doctor of|d(\. )?phil|doctorado|mestre|graduate student|(\b|\s|^)graduate (research|teaching)? ?assistant|pharmd|^jd$|doctoral student|doutor|doctorat")],
 ["postdoc", re.compile("post[ o-]?d")],
 ["prof", re.compile(u"prof|(\b|^|\s)lect|faculty|reader|docente|instructor|доцент")],
 ['head', re.compile("dean|director|head")]
]

# for x in ['msc','m sc','manager','ma','ba ', 'bm', 'mas','b.a.', 'bastard', 'b.s.e.','d.r.','phd','phda'] +roles[100:200].index.tolist() :
#     print x
#     for name, reg in regexes:
#         if reg.search(x):
#             print "\t", name


In [31]:
matches = defaultdict(list)
captured = 0
total = 0
for i, x in roles.reset_index().iterrows():
    matched = ""
    for name, reg in regexes:
        if reg.search(x[0]):
            matched = name
    if matched != "":
        captured += x[1]
    total += x[1]
    matches[matched].append(x[0])

In [32]:
print(captured, total )
print('Percentage of non-null affiliations we can identify a role for: ', float(captured)/total)

33874 46938
Percentage of non-null affiliations we can identify a role for:  0.7216754015935916


In [33]:
print(len(matches['']), len(roles))
print('Percent of all unique role fields we can identify: ', len(matches[''])/float(len(roles)))

8422 18350
Percent of all unique role fields we can identify:  0.45896457765667575


In [34]:
role_df=pd.DataFrame([x for k,v in matches.items() for x in zip(v,[k]*len(v))],
                     columns=['clean_role','role_category'])



In [35]:

df = df.merge(role_df,on="clean_role",how = "left")

## Check out role details

In [36]:
missing = df[pd.isnull(df.role_category)].shape[0]+df[df.role_category == ""].shape[0]
print(missing, float(missing)/len(df))
df.role_category.value_counts()/df.shape[0]


19102 0.36057837511325885


                    0.246602
phd                 0.181214
prof                0.153032
masters/postgrad    0.109200
bachelors           0.078960
research            0.061953
postdoc             0.034789
head                0.020273
Name: role_category, dtype: float64

In [37]:
print('How many people have at least 1 category? ',)
print(df[~((pd.isnull(df.role_category)) | (df.role_category == ""))].oid.nunique())

How many people have at least 1 category? 
14955


In [38]:
# the type field isn't named well, rename
df.rename(index=str, columns={"type" : "emp_or_edu"}, inplace=True)

print(df.emp_or_edu.value_counts())

# BUT! If the type is "employment", then we shouldn't be assigning anything but prof, head, research or postdoc?
# Actually these look fine, lets leave it
df.loc[(df.emp_or_edu == "employment") & (df.role_category.isin(["phd"]))].clean_role.value_counts()

education     28623
employment    24353
Name: emp_or_edu, dtype: int64


phd student                                                                                 247
phd                                                                                          87
dr                                                                                           80
phd candidate                                                                                60
graduate research assistant                                                                  55
md                                                                                           45
phd researcher                                                                               24
doctor                                                                                       21
graduate teaching assistant                                                                  20
graduate student                                                                             20
phd fellow                              

# Translation of Department fields

## If you haven't done the translation yet you have to run all of this

In [39]:
import os
if os.path.exists("../../data/processed/translated_fieldnames.csv"):
    print('SKIP THIS SECTION YOU HAVE THE TRANSLATED DATA GO TO THE SECTION STARTING WITH "Otherwise"...')

SKIP THIS SECTION YOU HAVE THE TRANSLATED DATA GO TO THE SECTION STARTING WITH "Otherwise"...


In [None]:
#NSF Fields

tab = requests.get("https://www.nsf.gov/statistics/2017/nsf17306/datatables/tab-13.htm")
s = BeautifulSoup(tab.text,"lxml")
tab = s.find("table",{"id" : "data-table"})

def get_text_indent(row):
    text = row.text.replace(", general","").replace(", other","").lower().strip()
    indent = int(row['class'][0].replace("indent_",""))
    return text, indent

nsf_field_hierarchy = defaultdict(list)
nsf_terms = set()
rows = tab.find_all("th", {"scope" : "row"})

# stupid but it works
i = 1
while i < len(rows):
    # get my info
    curr_text, curr_indent  = get_text_indent(rows[i])
    # make sure this isn't already in the tree
    if curr_text not in nsf_terms:
        nsf_terms.add(curr_text)
        for j in reversed(range(i)):
            prev_text, prev_indent = get_text_indent(rows[j])
            if curr_indent - prev_indent == 1:
                nsf_field_hierarchy[prev_text].append(curr_text)
                break
    #else:
        #print rows[i].text, curr_text, ' already in tree'
    i += 1

nsf_termlist = Counter()
for x in nsf_terms:
    nsf_termlist.update([x for x in get_terms(x) if x not in STOPWORDS])
nsf_termlist = set(nsf_termlist.keys())

In [None]:
# Wikipedia Fields
#https://en.wikipedia.org/wiki/Outline_of_academic_disciplines
def get_text_indent_wiki(row):
    indent = -1
    if row.startswith("==="):
        indent = 1
    elif row.startswith("=="):
        indent = 0
    elif row.startswith("'''"):
        indent = 2
    elif row.startswith("***"):
        indent = 5
    elif row.startswith("**"):
        indent = 4
    elif row.startswith("*"):
        indent = 3
    else:
        pass
        #print "NOPE", row
    text = row.replace("*","").replace("=","").lower().strip()
    
    text = text.replace("'''","")

    text = re.sub("(#|\|)[a-z A-Z\(\)]+\]\]","",text)
    text = text.replace("[[","").replace("]]","").strip()
    if text.rfind("(outline") != -1:
        text = text[:text.rfind("(outline")]
    return text.strip(), indent

wiki_field_hierarchy = defaultdict(list)
rows = [get_text_indent_wiki(x) for x in io.open("../../data/wiki_page.txt").readlines()]
rows = [x for x in rows if x[1] != -1]
wiki_terms = set()

# stupid but it works
i = 1
while i < len(rows):
    # get my info
    curr_text, curr_indent  = rows[i]
    # make sure this isn't already in the tree
    if curr_text not in wiki_terms:
        wiki_terms.add(curr_text)
        for j in reversed(range(i)):
            prev_text, prev_indent = rows[j]
            if curr_indent - prev_indent > 0:
                wiki_field_hierarchy[prev_text].append(curr_text)
                break
    i += 1
    
wiki_termlist = Counter()
for x in wiki_terms:
    wiki_termlist.update([x for x in get_terms(x) if x not in STOPWORDS])
wiki_termlist = set(wiki_termlist.keys())


for term in wiki_terms:
    if term not in wiki_field_hierarchy:
        wiki_field_hierarchy[term] = []
        
        
# Archaeology is a social science discipline - https://en.wikipedia.org/wiki/Archaeology
# So is communications
wiki_field_hierarchy['social sciences'].append("archaelogy")
wiki_terms.add("archaeology")
wiki_field_hierarchy['social sciences'].append("communications")
wiki_terms.add("communications")

# if Music history is a subset of Music, art history should be a subset of art
wiki_field_hierarchy["visual arts"].append("art history")
wiki_terms.add("art history")

# Classics == classical literature
wiki_field_hierarchy["history of literature"].append("classics")
wiki_field_hierarchy["history of literature"].append("classical literature")
wiki_terms.add("classics")
wiki_terms.add("classical literature")

# If English lit is under language and arts, so is spanish
wiki_field_hierarchy["languages and literature"].append("spanish literature")
wiki_terms.add("spanish literature") 


In [None]:
# Bring back in cleaned version and count number of terms in each individual's department
manual_termlist = {x.split("\t")[0] for x in io.open("../../data/manual_keywords.tsv")}

In [None]:
# Write out the fields that we need to translate to English
non_eng_affl = df[~df.org_country.isin(["US","UK", "GB", "IE"])].dept.unique()
print(len(non_eng_affl))


In [None]:
to_translate = set()
for i,x in enumerate(non_eng_affl):
    if i % 30000 == 0:
        print(i)
    if x:
        terms = get_terms(x)
        if (len(terms & nsf_termlist) or 
            len(terms & wiki_termlist) or 
            len(terms & manual_termlist) or 
            langid.classify(x)[0] == 'en'):
                continue
        to_translate.add(x.lower())

In [40]:
already_translated = pd.read_csv("../../data/processed/translated_fieldnames.csv",encoding="utf8")
#already_translated.columns = ['from','to']
print(len(set(to_translate)), len(set(already_translated['from'].values)), 
      len(set(to_translate) - set(already_translated['from'].values)))


NameError: name 'to_translate' is not defined

In [92]:
with io.open("./to_translate.csv","w") as of:
    for x in set(to_translate) - set(already_translated['from'].values):
         of.write(x + u"\n")

# Otherwise, if you have the pre-translated data, just run this!

In [41]:
translated = pd.read_csv("../../data/processed/translated_fieldnames.csv",encoding="utf8",header=None)
translated.columns = ['from_field','to_field']

In [42]:
translated.drop_duplicates("from_field",inplace=True)
translated.from_field = translated.from_field.str.lower()
translated.to_field = translated.to_field.str.lower()


In [43]:
df.dept = df.dept.str.lower()
df = pd.merge(df, translated,left_on="dept",right_on="from_field",how="left")
df.dept = df.apply(lambda x : x['dept'] if pd.isnull(x['to_field']) else x['to_field'],axis=1)
df.drop(['from_field','to_field'],axis=1,inplace=True)
df.shape

(52976, 15)

# Field Matching

## Clean department names

In [44]:
df[~pd.isnull(df.org_name)]

Unnamed: 0,dept,disamb_org,oid,org_country,org_name,role,start_month,start_year,emp_or_edu,person_created_prof_date,person_first_name,person_last_mod_date,person_last_name,clean_role,role_category


In [45]:
df = df.assign(clean_dept = df.dept.apply(field_matching.clean_fieldnames))

## Perform matching

In [46]:
all_fields = df.clean_dept.value_counts()
fields_cum = (all_fields/float(all_fields.sum())).cumsum()

In [47]:
fields_cum[:20]

chemistry                 0.027019
physics                   0.048048
psychology                0.061654
mechanical engineering    0.074121
medicine                  0.085860
biology                   0.097138
mathematics               0.108222
education                 0.116687
computer science          0.124982
economics                 0.132355
electrical engineering    0.139098
civil engineering         0.145719
chemical engineering      0.151613
biochemistry              0.157216
engineering               0.162115
sociology                 0.166723
pharmacy                  0.171283
nursing                   0.175770
law                       0.179942
history                   0.183822
Name: clean_dept, dtype: float64

In [48]:
fields_cum[10000:10010]

entomology and zoology agrarian                                                  0.842299
telecommunication engneering                                                     0.842324
counselor education department                                                   0.842348
education and professional development                                           0.842372
campus vine                                                                      0.842396
programme of study: spatial planning                                             0.842421
obtstricia y ginecologia                                                         0.842445
coordenadoria de hospitalidade e lazer - cohl                                    0.842469
innovation - cleanwater team                                                     0.842493
cgiar research program on climate change, agriculture and food security ccafs    0.842518
Name: clean_dept, dtype: float64

In [49]:
fields_cum[5000], len(fields_cum)

(0.7210283773950164, 16503)

In [50]:
field_matching.perform_labeling("soil science"), field_matching.perform_labeling("naval engineering ")

(([('blacklist', 'soil science', 'soil sciences', 'editdist')], ''),
 ([('engineering', 'engineering', 'engineering', 'full_substring')], ' naval'))

In [51]:
black_root_re = re.compile(r"("+r"|".join([x.strip() for x in open("./black-list-roots.txt")])+r")[a-z]+($|\b)")

In [52]:
field_matching.clean_fieldnames('soil science')

'soil science'

In [53]:
errored = []
errors = 0
matches = []
remains = []
for v, f in enumerate(all_fields.items()):
    if v % 10000 == 0:
        print(v, errors)

    field, count = f
    field = field.replace("\t"," ").replace("\n", " ")
    try:
        labels, remaining_string = field_matching.perform_labeling(field)

        for label in labels:
            matches.append((field ,v) + label)

        if black_root_re.search(field):
            matches.append((field ,v, "","","","blacklist"))
        elif not len(labels):
            matches.append((field ,v, "","","","no_match"))

        remains.append(field,str(v),remaining_string, str(count))
    except UnicodeEncodeError:
        errors += 1
    except:
        errored.append(field)

0 0
10000 0


In [54]:
field_matches = pd.DataFrame(matches,
                             columns=['fieldname','index','matched_field','field_term','matched_text','matchtype'])

In [55]:
field_matches = field_matches.rename(index=str, columns={"index":"field_index"})

In [None]:
field_matches.to_csv("../../data/all_field_matches.csv",index=False, encoding="utf8")

In [56]:
field_matches.head()

Unnamed: 0,fieldname,field_index,matched_field,field_term,matched_text,matchtype
0,chemistry,0,chemistry,chemistry,chemistry,exact_full
1,physics,1,physics,physics,physics,exact_full
2,psychology,2,psychology,psychology,psychology,exact_full
3,mechanical engineering,3,engineering,engineering,engineering,full_substring
4,medicine,4,blacklist,medicine,medicine,exact_full


In [None]:
stats = pd.DataFrame(remains, columns=['fieldname','index','count'])
stats.to_csv("../../data/all_field_remains_and_counts.csv",index=False, encoding="utf8")

In [57]:
df.shape

(52976, 16)

# Combining matches with data

In [58]:
df.shape

(52976, 16)

In [59]:
df = df.assign(row_index=range(df.shape[0]))

In [60]:
field_matched_df = pd.merge(df, field_matches,left_on="clean_dept",right_on="fieldname",how="left")

In [61]:
field_matched_df.shape

(62580, 23)

In [123]:
field_matched_df.to_csv("../../data/processed/cleaned_all_affiliations.csv",encoding="utf8")

In [124]:
!gzip ../../data/cleaned_all_affiliations.csv