# Matching two list of author's name for publication
One list of author's name with email needs to be updated with current list of authors on our recent publication. The list needs to be cleaned and linked.

In [2]:
import pandas as pd
import re

censor = False

#### Get current list of authors in a pandas dataframe

In [31]:
char = '''Joshua T. Vogelstein1YB; Timothy Verstynen3YB; Konrad P. Kording2YB;
Leyla Isik1Y; John W. Krakauer1; Ralph Etienne-Cummings1; Elizabeth L. Ogburn1; Carey E. Priebe1;
Randal Burns1; Kwame Kutten1; James J. Knierim1; James B. Potash1; Thomas Hartung1; Lena
Smirnova1; Alena Savonenko1; Ian Phillips1; Michael I. Miller1; Rene Vidal1; Jeremias Sulam1; Adam
Charles1; Noah J. Cowan1; Maxim Bichuch1; Archana Venkataraman1; Chen Li1; Nitish Thakor1;
Justus M Kebschull1; Marilyn Albert1; Jinchong Xu1; Marshall Hussain Shuler1; Brian Caffo1; Tilak
Ratnanather1; Ali Geisa1; Seung-Eon Roh1; Eva Yezerets1; Meghana Madhyastha1; Javier J. How1;
Tyler M. Tomita1; Jayanta Dey1; Ningyuan (Teresa) Huang1; Ashwin De Silva1; Jong M. Shin1; Kaleab
Alemayehu Kinfu1; Soledad Villar1; Erik Peterson3 Pratik Chaudhari2; Ben Baker2; Anna C. Schapiro2;
Dinesh Jayaraman2; Eric Eaton2; Michael Platt2; Lyle Ungar2; Leila Wehbe3; Michael J. Tarr3; Adam
Kepecs4; Amy Christensen4; Jessica Cantlon3; Onyema Osuagwu5; Bing Brunton6; Brett Mensh7;
Alysson R. Muotri8; Gabriel Silva8; Francesca Puppo8; Florian Engert9; Elizabeth Hillman10; Julia
Brown11; Chris White12; Weiwei Yang12; Andrei A. Rusu13'''

In [32]:
tmp = char.replace('\n', '').split(';')
tmp[0].rfind('[0-9]')
tmp[0]

'Joshua T. Vogelstein1YB'

In [33]:
df = pd.DataFrame(tmp, columns=['Name'])

In [35]:
df.shape

(67, 1)

In [11]:
def remove_int_beyond(xx):
    out = []
    for x in xx:
        s = re.search('\d', x).start()
        out.append(x[:s].strip())
    return out

In [12]:
curr_df = df.apply(lambda x: remove_int_beyond(x),axis=0)
curr_df

Unnamed: 0,Name
0,Joshua T. Vogelstein
1,Timothy Verstynen
2,Konrad P. Kording
3,Leyla Isik
4,John W. Krakauer
...,...
62,Elizabeth Hillman
63,JuliaBrown
64,Chris White
65,Weiwei Yang


#### Load previous author list which is now outdated

In [13]:
df = pd.read_excel('authors/email-response-list.xlsx')

#### Redact email for privacy

In [14]:
def redact(x):
    if isinstance(x, str):
        s = x.find('<')
        e = x.find('@')
        return x.replace(x[s+1:e], 'XXX')
    else:
        return ''

In [15]:
if censor:
    df.iloc[:,0] = df.apply(lambda x: redact(x.iloc[0]), axis=1)

#### preprocess before probablistic linking

In [16]:
newdf = df.iloc[:,:2].dropna()
newdf.head()

Unnamed: 0,JOVO EMAIL LIST,WHITE PAPER AUTHOR LIST
0,"Adam Charles <adamsc@jhu.edu>,",Adam Charles
1,"Adam Kepecs <adam.cshl@gmail.com>,",Adam Kepecs
2,"Alena Savonenko <asavone1@jhmi.edu>,",Alena Savonenko
3,"Ali Geisa <realaligeisa@gmail.com>,",Ali Geisa
4,"Amy Christensen <chandra.christensen@gmail.com>,",Amy Christensen


In [17]:
newlist = df.iloc[:,0].tolist()
newlist2 = []

for i in newlist:
    if isinstance(i, str):
        tmp = i.split(' <')
        if len(tmp) == 1:
            continue

        tmp[1] = tmp[1].replace('>,','')
        newlist2.append(tmp)

In [18]:
df_out = pd.DataFrame(newlist2, columns=['Name', 'Email'])
df_out.shape, df_out.head()

((57, 2),
               Name                          Email
 0     Adam Charles                 adamsc@jhu.edu
 1      Adam Kepecs            adam.cshl@gmail.com
 2  Alena Savonenko              asavone1@jhmi.edu
 3        Ali Geisa         realaligeisa@gmail.com
 4  Amy Christensen  chandra.christensen@gmail.com)

In [19]:
df_out.to_excel('current_author_list.xlsx', index=False)

In [20]:
if not censor:
    df_out = pd.read_excel('authors/current_author_list.xlsx')

#### Join old author list with new author list
Old author list has a column for email whereas the new author list only has the name of authors

In [21]:
import fuzzymatcher

In [22]:
curr_df.head(5), df_out.head(5)

(                   Name
 0  Joshua T. Vogelstein
 1     Timothy Verstynen
 2     Konrad P. Kording
 3            Leyla Isik
 4      John W. Krakauer,
                    Name                    Email
 0  Joshua T. Vogelstein            jovo@progl.ai
 1     Timothy Verstynen       timothyv@gmail.com
 2     Konrad P. Kording       koerding@gmail.com
 3            Leyla Isik            lisik@jhu.edu
 4      John W. Krakauer  john.krakauer@gmail.com)

In [23]:
match_result = fuzzymatcher.fuzzy_left_join(
    df_left=curr_df, df_right=df_out, 
    left_on='Name', right_on='Name', 
    left_id_col='Name', right_id_col='Name')

match_result.shape

(67, 6)

In [24]:
match_result.head()

Unnamed: 0,best_match_score,__id_left,__id_right,Name_left,Name_right,Email
0,0.581172,Joshua T. Vogelstein,Joshua T. Vogelstein,Joshua T. Vogelstein,Joshua T. Vogelstein,jovo@progl.ai
1,0.359099,Timothy Verstynen,Timothy Verstynen,Timothy Verstynen,Timothy Verstynen,timothyv@gmail.com
2,0.431144,Konrad P. Kording,Konrad P. Kording,Konrad P. Kording,Konrad P. Kording,koerding@gmail.com
3,0.268181,Leyla Isik,Leyla Isik,Leyla Isik,Leyla Isik,lisik@jhu.edu
4,0.42111,John W. Krakauer,John W. Krakauer,John W. Krakauer,John W. Krakauer,john.krakauer@gmail.com


In [25]:
df_out = match_result[['Name_left', 'Email']]
df_out.columns=['Name','Email']
df_out.head()

Unnamed: 0,Name,Email
0,Joshua T. Vogelstein,jovo@progl.ai
1,Timothy Verstynen,timothyv@gmail.com
2,Konrad P. Kording,koerding@gmail.com
3,Leyla Isik,lisik@jhu.edu
4,John W. Krakauer,john.krakauer@gmail.com


In [26]:
if not censor:
    df_out.to_excel('authors/final_output_email_list.xlsx', index=False)

#### Manual entry of missing emails

In [27]:
df_linear = pd.read_excel('authors/final_output_email_list_modified.xlsx')

#### Linearize email for convenience

In [28]:
linear_txt = '; '.join(df_linear['Email'])

In [29]:
# output hidden for privacy
if not censor:
    linear_txt

In [30]:
with open('for_email.txt', 'w', encoding='utf-8') as f:
    f.write(linear_txt)

#### End of Notebook