## Text cleaning with pandas
The goal was to split author's names from our recent [publication](https://arxiv.org/abs/2201.07372) stored in a long multiline string into a clean form

In [1]:
import string
import pandas as pd
import re

seems to be semicolons are the separator. Also, each name should follow last name first followed up a comma format. Lastly, the number from the superscript needs to be removed.

In [2]:
char = '''Joshua T. Vogelstein1YB; Timothy Verstynen3YB; Konrad P. Kording2YB;
Leyla Isik1Y; John W. Krakauer1; Ralph Etienne-Cummings1; Elizabeth L. Ogburn1; Carey E. Priebe1;
Randal Burns1; Kwame Kutten1; James J. Knierim1; James B. Potash1; Thomas Hartung1; Lena
Smirnova1; Alena Savonenko1; Ian Phillips1; Michael I. Miller1; Rene Vidal1; Jeremias Sulam1; Adam
Charles1; Noah J. Cowan1; Maxim Bichuch1; Archana Venkataraman1; Chen Li1; Nitish Thakor1;
Justus M Kebschull1; Marilyn Albert1; Jinchong Xu1; Marshall Hussain Shuler1; Brian Caffo1; Tilak
Ratnanather1; Ali Geisa1; Seung-Eon Roh1; Eva Yezerets1; Meghana Madhyastha1; Javier J. How1;
Tyler M. Tomita1; Jayanta Dey1; Ningyuan (Teresa) Huang1; Ashwin De Silva1; Jong M. Shin1; Kaleab
Alemayehu Kinfu1; Soledad Villar1; Pratik Chaudhari2; Ben Baker2; Anna C. Schapiro2; Dinesh Jayaraman2;
Eric Eaton2; Michael Platt2; Lyle Ungar2; Leila Wehbe3; Michael J. Tarr3; Adam Kepecs4; Amy Christensen4;
Onyema Osuagwu5; Bing Brunton6; Brett Mensh7; Alysson R. Muotri8; Gabriel Silva8; Francesca
Puppo8; Florian Engert9; Elizabeth Hillman10; Julia Brown11; Chris White12; Weiwei Yang12; Andrei 
A. Rusu13'''

In [3]:
tmp = char.replace('\n', '').split(';')
tmp[0].rfind('[0-9]')
tmp[0]

'Joshua T. Vogelstein1YB'

In [4]:
tmp

['Joshua T. Vogelstein1YB',
 ' Timothy Verstynen3YB',
 ' Konrad P. Kording2YB',
 'Leyla Isik1Y',
 ' John W. Krakauer1',
 ' Ralph Etienne-Cummings1',
 ' Elizabeth L. Ogburn1',
 ' Carey E. Priebe1',
 'Randal Burns1',
 ' Kwame Kutten1',
 ' James J. Knierim1',
 ' James B. Potash1',
 ' Thomas Hartung1',
 ' LenaSmirnova1',
 ' Alena Savonenko1',
 ' Ian Phillips1',
 ' Michael I. Miller1',
 ' Rene Vidal1',
 ' Jeremias Sulam1',
 ' AdamCharles1',
 ' Noah J. Cowan1',
 ' Maxim Bichuch1',
 ' Archana Venkataraman1',
 ' Chen Li1',
 ' Nitish Thakor1',
 'Justus M Kebschull1',
 ' Marilyn Albert1',
 ' Jinchong Xu1',
 ' Marshall Hussain Shuler1',
 ' Brian Caffo1',
 ' TilakRatnanather1',
 ' Ali Geisa1',
 ' Seung-Eon Roh1',
 ' Eva Yezerets1',
 ' Meghana Madhyastha1',
 ' Javier J. How1',
 'Tyler M. Tomita1',
 ' Jayanta Dey1',
 ' Ningyuan (Teresa) Huang1',
 ' Ashwin De Silva1',
 ' Jong M. Shin1',
 ' KaleabAlemayehu Kinfu1',
 ' Soledad Villar1',
 ' Pratik Chaudhari2',
 ' Ben Baker2',
 ' Anna C. Schapiro2',
 ' D

In [5]:
df = pd.DataFrame(tmp)

In [6]:
def remove_int_beyond(xx):
    out = []
    for x in xx:
        s = re.search('\d', x).start()
        out.append(x[:s].strip())
    return out

In [7]:
df = df.apply(lambda x: remove_int_beyond(x),axis=0)
df

Unnamed: 0,0
0,Joshua T. Vogelstein
1,Timothy Verstynen
2,Konrad P. Kording
3,Leyla Isik
4,John W. Krakauer
...,...
61,Elizabeth Hillman
62,Julia Brown
63,Chris White
64,Weiwei Yang


In [8]:
df_app = pd.read_excel('authors/author-list-append.xlsx', header=None)
df_app

Unnamed: 0,0,1,2
0,A:,Timothy Verstynen,Carnegie Mellon University
1,A:,Konrad P. Kording,University of Pennsylvania
2,A:,Leyla Isik,Johns Hopkins University
3,A:,John W. Krakauer,Johns Hopkins University
4,A:,Ralph Etienne-Cummings,Johns Hopkins University
...,...,...,...
57,A:,Florian Engert,Harvard University
58,A:,Elizabeth Hillman,Columbia University
59,A:,Julia Brown,MindX
60,A:,Chris White,Microsoft Research


In [9]:
df_org = pd.read_excel('authors/author-list-original.xlsx', header=None)
df_org

Unnamed: 0,0,1,2
0,A:,"Arroyo-Relión, Jesús",Johns Hopkins University
1,A:,"Aguerrebere, Cecilia",Duke
2,A:,"Allen, Peter J",Memorial Sloan-Kettering Cancer Center
3,A:,"Athreya, Avanti",Johns Hopkins University
4,A:,"Badea, Alexandra",Duke
...,...,...,...
239,C:,"Vogelstein, Bert",Johns Hopkins University
240,C:,"Vogelstein, R. Jacob",Camden Partners Nexus
241,C:,"Yuste, Rafael",Columbia University
242,C:,"Zheng, Da",Amazon.com


In [14]:
#complete collaborator list for annual faculty review
pd.DataFrame(df_org[1] + ' - ' + df_org[2]).to_excel('collaborator-list.xlsx')

In [21]:
df_org[1][df_org[1].str.contains('Vogelstein')]

144        Vogelstein, Bert
145    Vogelstein, R. Jacob
239        Vogelstein, Bert
240    Vogelstein, R. Jacob
Name: 1, dtype: object

In [22]:
def swap(x):
    x[0], x[len(x)-1] = x[len(x)-1], x[0]
    return x

def reformat_name(x):
    if len(x) == 3:
        out = f"{x[0]}, {x[2]} {x[1].replace('.', '')}"
    elif len(x) == 2:
        out = f'{x[0]}, {x[1]}'
    else:
        out = 'ERROR'

    return out

def check_name(x, y):
    x1 = x.split()
    y1 = y.split()
    
    if x1[0] == y1[0] and x1[1] == y1[1]:
        if len(x) >= len(y):
            return x
        else:
            return y

def vectorize_without_middle_name(x):
    x1 = x.split()
    if len(x1) >= 3:
        return x1[:2]
    elif len(x1) == 2:
        return x1
    else:
        print(f'ERROR: {x1}')
        return 'ERROR'

In [23]:
df_app[1] = df_app[1].apply(lambda x: x.split()).apply(lambda x: swap(x))

In [24]:
df_app[1] = df_app[1].apply(lambda x: reformat_name(x))

In [25]:
df_out = pd.concat([df_org, df_app]).sort_values(0).drop_duplicates().reset_index(drop=True)

In [26]:
df_vect = df_out[1].apply(lambda x: vectorize_without_middle_name(x))
df_vect

ERROR: ['Faloutsos,Christos']


0      [Arroyo-Relión,, Jesús]
1             [Wang,, Tian-Li]
2              [Wang,, Yuxuan]
3              [White,, Chris]
4         [Weinberg,, Richard]
                ...           
297         [Patsolic,, Jesse]
298            [Packer,, Adam]
299          [Ryman,, Sephira]
300       [Mishchenko,, Yuriy]
301          [Paninski,, Liam]
Name: 1, Length: 302, dtype: object

In [27]:
df_out[0] = df_out[0].str.upper()

In [28]:
df_out.sort_values([0, 1]).to_excel('authors/author-list-final-final.xlsx')

In [29]:
auth_idx = []
for idx, auth in enumerate(df_app[0]):
    if auth not in df_org[0]:
        auth_idx.append(idx)

In [30]:
df_app.loc[auth_idx]

Unnamed: 0,0,1,2
0,A:,"Verstynen, Timothy",Carnegie Mellon University
1,A:,"Kording, Konrad P",University of Pennsylvania
2,A:,"Isik, Leyla",Johns Hopkins University
3,A:,"Krakauer, John W",Johns Hopkins University
4,A:,"Etienne-Cummings, Ralph",Johns Hopkins University
...,...,...,...
57,A:,"Engert, Florian",Harvard University
58,A:,"Hillman, Elizabeth",Columbia University
59,A:,"Brown, Julia",MindX
60,A:,"White, Chris",Microsoft Research
