# Code to extract languages and author countries of affiliation from a Scopus search

In [26]:
import pandas as pd
import io

# Load the Scopus search export (a CSV file expected in the current working directory), decoded as UTF-8.
df = pd.read_csv('scopus_search.csv', encoding='utf-8')

                                               Authors  \
0                                  Stein S.D.; Plag I.   
1                    Mattingley W.; Hall K.C.; Hume E.   
2                             Ambrazaitis G.; House D.   
3                                Denby T.; Goldrick M.   
4    Mertz J.; Annucci C.; Aristodemo V.; Giustolis...   
5    Yazawa K.; Konishi T.; Whang J.; Escudero P.; ...   
6                            Sciberras C.; Mitterer H.   
7                               Scott J.H.G.; Darcy I.   
8                                     Soo R.; Babel M.   
9    Shaw J.A.; Foulkes P.; Hay J.; Evans B.G.; Doc...   
10                Strycharczuk P.; Derrick D.; Shaw J.   
11                           Lee-Kim S.-I.; Chou Y.-C.   
12       Puggaard-Rode R.; Horslund C.S.; Jørgensen H.   
13          Rathcke T.; Lin C.-Y.; Falk S.; Bella S.D.   
14                Tjuka A.; Thu Nguyen H.T.; Spalek K.   
15                                          Ziegler W.   
16   Noiray A.

In [27]:
# Show the column names of the export; the cells below read 'Abstract',
# 'Authors with affiliations', 'Title' and 'Authors'.
df.columns

Index(['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year',
       'Source title', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end',
       'Page count', 'Cited by', 'DOI', 'Link', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Author Keywords',
       'Index Keywords', 'Correspondence Address', 'Editors', 'Publisher',
       'ISSN', 'ISBN', 'CODEN', 'PubMed ID', 'Language of Original Document',
       'Abbreviated Source Title', 'Document Type', 'Publication Stage',
       'Open Access', 'Source', 'EID'],
      dtype='object')

## Identify all sequences of capitalized words in the abstracts (e.g. French Sign Language)

In [42]:
import re

# Matches one capitalized word, optionally followed by further capitalized words
# joined by a single space or hyphen (e.g. "French Sign Language", "Bulgarian-English").
# Written as a raw string so that `\w` reaches the regex engine unchanged instead of
# being an invalid string escape (which triggers a warning on current Python versions).
cap_re = re.compile(r'[A-Z]\w*(?:[ -][A-Z]\w*)*')

In [29]:
# Collect every capitalized word sequence found in the abstracts.
# Rows with a missing abstract (NaN) are skipped: `findall` only accepts strings
# and would raise a TypeError on them.
cap_words = []

for abstract in df['Abstract'].dropna():
    # extend() appends in place instead of rebuilding the list on every row
    cap_words.extend(cap_re.findall(abstract))

# Print each distinct sequence once, in sorted order, for manual inspection.
for phrase in sorted(set(cap_words)):
    print(phrase)

A
A Neapolitan Italian
A Type-I
ABX
ADS
AM
ANIM
AOS
ASP
AX
AXB
Absent
Accent
Accentual Prominence
According
Acoustic
Acoustically
Across
Additional
Additionally
African American Language
Afrikaans
After
Agreement
Alim
All
All Rights Reserved
Also
Although
Alveolar
American English
Amharic
An
Analyses
Analysis
Andalusian Spanish
Annotation
Anti-Proximity
Antoniou
AoA
Applied
Arabic
Argentina
Arrernte
Articulatory
Articulatory Phonology
Artificial
As
As Media Lengua
Assimilation
At
AusE
Australia
Australian
Australian English
Austronesian
Automatic Selective Perception
Autosegmental-Metrical
B
B SPL
Babel
Bantu
Baron Cohen
Based
Bearing
Because
Beckner
Beddor
Belgrade
Ben Hedia
Benjamin
Berkson
Best
Between
Beyond
Bock
Bongaerts
Bora
Both
Breiman
British English
Broca
Browman
Buenos Aires
Bulgarian
Bulgarian-English
By
Bybee
C
C1
C1C2
C2
CC-BY
CCV
CCa
CV
CVC English
CVCVLV-LV
CVLVCV-LV
CVd
CVl
Campbell
Campidanese
Campidanese Sardinian
Canadian
Cantonese
Cantonese-English
Carlson
Carmich

## We also search for words beginning with characters outside A–Z, to ensure that languages such as "ǂʼAmkoe" are not excluded. We find no language names in this list.

In [30]:
# Matches one character that is NOT an ASCII letter, digit, whitespace, parenthesis,
# hyphen, comma, full stop or straight quote, followed by any run of word characters.
# Raw triple-quoted string: the regex escapes (`\s`, `\-`, `\.`, `\w`) are passed through
# verbatim rather than relying on invalid string escapes, and both quote characters can
# appear unescaped. The compiled pattern is identical to the previous one.
non_lat_re = re.compile(r'''[^A-Za-z0-9\s()\-,\.\-'"]\w*''')

# Collect every match of `non_lat_re` in the abstracts.
# Rows with a missing abstract (NaN) are skipped: `findall` only accepts strings
# and would raise a TypeError on them.
non_lat_list = []

for abstract in df['Abstract'].dropna():
    # extend() appends in place instead of rebuilding the list on every row
    non_lat_list.extend(non_lat_re.findall(abstract))

# Print each distinct match once, in sorted order, for manual inspection.
for token in sorted(set(non_lat_list)):
    print(token)



#
#CV
#otros
%
%L
&
*
*LAPSE
+
+3
+H
+L
+closure
+diffuse
+grave
/
/10
/4
/Anti
/CVd
/CVl
/Trochaic
/a
/aI
/ai
/analysis
/and
/are
/at
/aul
/aw
/b
/bl
/by
/c
/compactness
/creativecommons
/d
/diphthong
/distinctively
/doi
/e
/g
/giC
/hVd
/hVl
/has
/i
/in
/is
/k
/kod
/kot
/l
/labphon
/licenses
/los
/m
/n
/nuclear
/o
/or
/p
/pasta
/pata
/phonetics
/r
/s
/should
/si
/st
/t
/than
/u
/versus
/vowel
/w
/weakly
/while
/z
/æ
/ŋ
/Ɯ
/Λ
/ᾶ
:
:l
:ur
:us
;
;10
<f
<h
<k
<sh
=
>
>8
?
?at
?uence
[
[ASP
[M
[No
[PAM
[_a
[_i
[_ou
[a
[aCCa
[burst
[d
[diffuse
[f
[fimje
[g
[grave
[h
[i
[ie
[j
[je
[k
[l
[lo
[m
[n
[ne
[o
[s
[si
[spread
[sumie
[t
[ts
[u
[voice
[w
[z
[ç
[ŋ
]
]ae
]u
{H
}
~
~200
©
ácz
æl
æͻl
émon
éry
ímya
ïve
öhr
úmia
āori
ŋ
Ɛ
Ɯ
̩
̪
͡
ͻl
ΔF
ε
–200
–f0
–production
—
—English
—being
—can
—except
—for
—given
—indexes
—listener
—speech
—the
—two
—vary
‘Vocalization
‘enhancement
‘offbeat
‘researcher
‘variable
‘words
’
’s
“Foxes
“He
“Production
“default
“ka
“ma
“whole
“他给她
“他给她狗
”
…
→
↔Ø
−H2
∔
∗
∗pitch

## From this list of capitalized word sequences, we manually identified those that refer to languages. The resulting list of languages is below.

In [31]:
# Language names picked by hand from the capitalized word sequences.
# Repeated entries are harmless: duplicates are removed when `sorted_langs` is built.
# NOTE(review): "Uganda" is a country name and "Luganda" is listed separately --
# confirm that "Uganda" is meant to be treated as a language here.
raw_langs = ["Neapolitan Italian",
"African American Language",
"Afrikaans",
"American English",
"Amharic",
"Andalusian Spanish",
"Arabic",
"Arrernte",
"AusE",
"Australian English",
"Austronesian",
"Bantu",
"Bora",
"British English",
"Bulgarian",
"English",
"Campidanese",
"Campidanese Sardinian",
"Canadian",
"Cantonese",
"Central Australian",
"Chamic",
"Chinese",
"Chinese",
"Chinese Wu",
"Chru",
"Continental French",
"Corsican French",
"Danish",
"Dardic",
"Dialect B",
"Dutch",
"Dutch Low Saxon",
"English",
"Fering",
"French",
"French Sign Language",
"Gaelic",
"Georgian",
"German",
"German Low Saxon",
"Grassfields Bantu",
"Greater New Orleans English",
"Greek Thrace Romani",
"Guatemalan Mayan",
"Gyeonggi Korean",
"Hindi",
"Hollandic Dutch",
"Hong Kong Cantonese",
"Hungarian",
"Vietnamese",
"American English",
"Australian English",
"English",
"German",
"Luganda",
"Indo-Aryan",
"Indonesian",
"Japanese",
"Javanese",
"Kalasha",
"Kaqchikel",
"Katuic",
"Khmer",
"Kleurling Afrikaans",
"Korean",
"Kuy",
"Kyrgyz",
"English",
"Spanish",
"Japanese",
"Spanish",
"American English",
"LSF",
"Lao",
"Luganda",
"Makasar",
"Maltese",
"Mandarin",
"Mandarin Chinese",
"Media Lengua",
"Medumba",
"Midland American English",
"Moroccan Arabic",
"Māori",
"NZE",
"New Zealand English",
"North American English",
"North Frisian",
"Northern High German",
"Northern Vietnamese",
"Papuan Malay",
"Peninsular Spanish",
"Pitjantjatjara",
"Quichua",
"Russian",
"Salerno Italian",
"Santo Domingo Spanish",
"Sasak",
"Scottish Gaelic",
"Seoul Korean",
"Serbian",
"Shanghai Mandarin",
"Shanghai Wu",
"American English",
"Southern French",
"Standard Mandarin",
"Swedish",
"Taiwan Mandarin",
"Tashlhiyt",
"Tashlhiyt Berber",
"Thai",
"Bora",
"English",
"German",
"Media Lengua",
"Russian",
"Dutch",
"Trade Malay",
"Tripolitanian Libyan Arabic",
"Turkish",
"Uganda",
"Uyghur",
"Venezuelan Spanish",
"Vietnamese",
"Warlpiri",
"West Frisian",
"Western Andalusian",
"Western Campidanese",
"White Afrikaans",
"Zeelandic Dutch" ]

# Distinct language names in alphabetical order (used for display and counting).
sorted_langs = sorted(set(raw_langs))

# Regex alternation is ordered: the first alternative that matches wins. In alphabetical
# order "French" precedes "French Sign Language", so the longer name could never be
# matched. Trying longer names first fixes this (the sort is stable, so names of equal
# length keep their alphabetical order).
langs_longest_first = sorted(sorted_langs, key=len, reverse=True)

# - re.escape: names are matched literally, whatever characters they contain.
# - \b ... \b: only whole words match, so "Germany" is not reported as "German"
#   nor "Thailand" as "Thai".
# - (?: ... ): non-capturing group, so findall() still returns the matched names.
lang_re = re.compile(
    r'\b(?:' + '|'.join(re.escape(name) for name in langs_longest_first) + r')\b'
)

In [32]:
# Show the de-duplicated language names, one per line, in alphabetical order.
for language in sorted_langs:
    print(language)

African American Language
Afrikaans
American English
Amharic
Andalusian Spanish
Arabic
Arrernte
AusE
Australian English
Austronesian
Bantu
Bora
British English
Bulgarian
Campidanese
Campidanese Sardinian
Canadian
Cantonese
Central Australian
Chamic
Chinese
Chinese Wu
Chru
Continental French
Corsican French
Danish
Dardic
Dialect B
Dutch
Dutch Low Saxon
English
Fering
French
French Sign Language
Gaelic
Georgian
German
German Low Saxon
Grassfields Bantu
Greater New Orleans English
Greek Thrace Romani
Guatemalan Mayan
Gyeonggi Korean
Hindi
Hollandic Dutch
Hong Kong Cantonese
Hungarian
Indo-Aryan
Indonesian
Japanese
Javanese
Kalasha
Kaqchikel
Katuic
Khmer
Kleurling Afrikaans
Korean
Kuy
Kyrgyz
LSF
Lao
Luganda
Makasar
Maltese
Mandarin
Mandarin Chinese
Media Lengua
Medumba
Midland American English
Moroccan Arabic
Māori
NZE
Neapolitan Italian
New Zealand English
North American English
North Frisian
Northern High German
Northern Vietnamese
Papuan Malay
Peninsular Spanish
Pitjantjatjara
Quichua
R

In [33]:
# Number of distinct language names after de-duplication.
print(len(sorted_langs))

112


## List of countries of affiliation with their frequency in this Scopus search

In [35]:
from collections import Counter

# NOTE(review): `countries` is not defined in any cell visible in this notebook, so this
# cell fails on a fresh kernel unless the defining cell is restored -- confirm where it
# comes from. It is used here as a flat sequence of country strings.
# Counter tallies every value in a single pass instead of calling .count() once per country.
country_counts = Counter(countries)

# Print each distinct value with its frequency, in alphabetical order.
for country in sorted(country_counts):
    print(country, country_counts[country])

Australia 27
Austria 1
Canada 33
China 4
Denmark 3
France 14
Geraci C. 1
Germany 58
Guatemala 1
Hong Kong 5
Italy 4
Japan 13
Malta 3
Morocco 1
Netherlands 13
New Zealand 19
Poland 2
Portugal 1
Singapore 1
South Africa 3
South Korea 4
Spain 2
Sweden 5
Switzerland 1
Taiwan 5
United Kingdom 23
United States 141
Viet Nam 1


## Extract the authors' countries of affiliation from the Scopus search results, and extract languages from the abstracts using the regular expression above. The results are written to the file LabPhon_extract.csv.

In [38]:
def _affiliation_countries(author_list):
    """Return the countries found in one 'Authors with affiliations' cell.

    Author entries are separated by ';' and the fields of an entry by ','.
    The last field of an entry is taken to be the country.

    Parameters
    ----------
    author_list : str or NaN
        Cell value; a missing value (NaN/None) yields an empty list.

    Returns
    -------
    list of str
        One country per author entry that has an affiliation, in order.
    """
    if pd.isna(author_list):
        return []

    found = []
    for entry in author_list.split(';'):
        fields = [field.strip() for field in entry.split(',')]
        # An entry with a single field is a bare author name without affiliation;
        # taking its last field would report the name itself as a country.
        if len(fields) < 2 or not fields[-1]:
            continue
        found.append(fields[-1])
    return found


def _abstract_languages(abstract):
    """Return every language name matched by `lang_re` in an abstract ([] if missing)."""
    if pd.isna(abstract):
        return []
    return lang_re.findall(abstract)


country_col = []
lang_col = []

# The two columns are handled independently, so a missing affiliation no longer
# discards the languages of that row (and vice versa). Missing values give ''.
for author_list, abstract in zip(df['Authors with affiliations'], df['Abstract']):
    country_col.append('; '.join(_affiliation_countries(author_list)))
    lang_col.append('; '.join(_abstract_languages(abstract)))

# One output row per input row; the list columns are in the same row order as `df`.
out_df = pd.DataFrame(
    {
        'Title': df['Title'],
        'Authors': df['Authors'],
        'Author Countries of Affiliation': country_col,
        'Languages in Abstract': lang_col
    }
)

print(out_df)

Stein S.D., Department of English and American Studies, Heinrich Heine University, Düsseldorf, Germany; Plag I., Department of English and American Studies, Heinrich Heine University, Düsseldorf, Germany
Mattingley W., Department of Linguistics, University of Canterbury, New Zealand; Hall K.C., Department of Linguistics, University of British Columbia, Canada; Hume E., Office of Academic Affairs, Ohio State University, United States
Ambrazaitis G., Department of Swedish, Linnæus University, Växjö, Sweden; House D., Division of Speech, Music and Hearing, KTH (Royal Institute of Technology), Stockholm, Sweden
Denby T., Department of Linguistics, Northwestern University, Evanston, IL, United States; Goldrick M., Department of Linguistics, Northwestern University, Evanston, IL, United States
Mertz J., Université de Paris, CNRS, Laboratoire de linguistique formelle, Paris, F-75013, France; Annucci C., Université de Paris, CNRS, Laboratoire de linguistique formelle, Paris, F-75013, France; A

In [41]:
# Save the extracted table as UTF-8 CSV in the current working directory.
# With pandas' default settings the row index is written as the first column.
out_df.to_csv('LabPhon_extract.csv', encoding='utf-8')