In [1]:
#Point criteria
#64: guranteed ethnicity
#32: 2 other possibilities
#16: 3 other possibilities
#8: 4 other possibilities
#4: 5 other possibilities
#2: 6 other possibilities
#1: 7 other possibilities


In [2]:
points = {
    'alb': 0,
    'ang': 0,
    'arm': 0,
    'azr': 0,
    'bsq': 0,
    'bos': 0,
    'scm': 0,
    'bul': 0,
    'cze': 0,
    'dan': 0,
    'est': 0,
    'fin': 0,
    'fre': 0,
    'gae': 0,
    'geo': 0,
    'ger': 0,
    'gre': 0,
    'hun': 0,
    'ice': 0,
    'ita': 0,
    'lat': 0,
    'lit': 0,
    'mlt': 0,
    'mol': 0,
    'dch': 0,
    'pol': 0,
    'por': 0,
    'rom': 0,
    'rus': 0,
    'slk': 0,
    'svn': 0,
    'spa': 0,
    'swe': 0,
    'trk': 0,
    'ukr': 0,
    'wls': 0,
}


codes = {
    'alb': 'Albanian',
    'ang': 'English', #anglo
    'arm': 'Armenian',
    'azr': 'Azeri',
    'bsq': 'Basque',
    'bos': 'Bosniak',
    'cro': 'Croatian',
    'bul': 'Bulgarian',
    'cze': 'Czech',
    'dan': 'Danish', #dano-norwegian
    'est': 'Estonian',
    'fin': 'Finnish',
    'fre': 'French',
    'gae': 'Irish Gaelic', #gaelic
    'geo': 'Georgian',
    'ger': 'German', #germanic
    'gre': 'Greek',
    'hun': 'Hungarian',
    'ice': 'Icelandic',
    'ita': 'Italian',
    'lat': 'Latvian',
    'lth': 'Lithuanian',
    'mlt': 'Maltese',
    'mol': 'Moldovan',
    'nor': 'Norwegian',
    'dch': 'Dutch',
    'pol': 'Polish',
    'por': 'Portuguese',
    'rom': 'Romanian',
    'rus': 'Russian',
    'slk': 'Slovakian',
    'svn': 'Slovenian',
    'spa': 'Spanish',
    'srb': 'Serbian',
    'swe': 'Swedish',
    'trk': 'Turkish',
    'ukr': 'Ukrainian',
    'wls': 'Welsh',
    
}


In [None]:
#WEB SCRAPER TO GET CHARTABLE
import sys
import numpy as np
import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents

###################################################################################
##############                       FUNCTIONS                       ##############


#fixes broken offset on lowercase letters list, leaving a clean lowercase list
def fixAlphabet(df):
    letter_list = []

    for column in df:
       # print(len(column))
        letter_list.append(column[0].lower()) #take first level and make it lowercase
    letter_list[0] = 'Ethnicity'
    df.columns = letter_list
    return df

#make list of desire eth indeces, then select only those
def selectEths(df):
    for i in df.index:
        keep = False
        if (df.iloc[i, 0].endswith("]")):
            df.iloc[i, 0] = df.iloc[i, 0].split('[')[0]  #remove wiki footnotes
        if df.iloc[i, 0] == 'Romani':
            df.iloc[i, 0] = "drop_this" #edge case for Romani, program was confusing romani and romanian
        for key, value in codes.items():
            if df.iloc[i, 0] in value:                #replace the eths with codes
                df.iloc[i, 0] = key
                keep = True
        if (not keep):
           # print("dropped: " + df.iloc[i, 0])              #print when we delete to make sure we didn't accidentally delete one
            df.iloc[i, 0] = "drop_this"          #replace with 'drop_this otherwise'
    
    dfcopy = df.copy()
    for i in df.index:
        if (df.iloc[i, 0] == "drop_this"):     #markk eths to be dropped with "drop_this" string
            dfcopy.drop(i, inplace = True)
                    
    dfcopy.set_index('Ethnicity', inplace = True)

    return dfcopy
            
#add missing ethnicities to the df with NaN values
def addMissingEths(df):
    for key in codes.keys():
        if key not in df.index:    #if a code is not in our df yet, add it with NaN values
            df = df.append(pd.Series(name=key, dtype='str'))        
    return df

#aggregate function to call all the methods that shape the dfs nicely
def makeNiceDF(df):
    df = fixAlphabet(df)
    df = selectEths(df)
    df = addMissingEths(df)
    df.sort_index(inplace=True)
    return df
 
###################################################################################    
##############                         MAIN                          ##############


# get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/List_of_Latin-script_alphabets"
response=requests.get(wikiurl)

#pull each table
soup = BeautifulSoup(response.text, 'html.parser')

#original 26 latin letters
latin_table = soup.find_all('table', class_="wikitable")[0]  
latin_df= pd.read_html(str(latin_table))
latin_df = pd.DataFrame(latin_df[0])
latin_df = makeNiceDF(latin_df)
latin_df.drop(latin_df.columns[len(latin_df.columns)-1], axis=1, inplace=True) #drop '#' column from latin_df'
latin_df.loc[latin_df.index[latin_df.isnull().all(1)]] = latin_df.columns #if they're all NaN, meaning they use all 26, f
                                                                          #fill with all 26 from columns
    
#special letters table
#manual adjustment due to wikitable[2] grouping langs
special_letters_table = soup.find_all('table', class_="wikitable")[2]  
special_letters_df = pd.read_html(str(special_letters_table))
special_letters_df = pd.DataFrame(special_letters_df[0])
special_letters_df.iloc[7, 0] = 'Danish' #reassign Scandinavian group to just Danish
special_letters_df.iloc[12, 0] = 'Icelandic' #reassign Icelandic/Norn to just Icelandic
special_letters_df.iloc[13, 0] = 'French' #reassign British Isles group to just French
special_letters_df = makeNiceDF(special_letters_df)


#letter-diacritic combos table
#manual adjustment due to wikitable[3] grouping langs
letter_diacritic_table = soup.find_all('table', class_="wikitable")[3]  
letter_diacritic_df = pd.read_html(str(letter_diacritic_table))
letter_diacritic_df = pd.DataFrame(letter_diacritic_df[0])
letter_diacritic_df.iloc[5, 0] = 'Polish' #reassign kashubian/polish to just polish
letter_diacritic_df.iloc[21, 0] = 'Croatian' #reassign Croatian/Sami to just Croatian
letter_diacritic_df.iloc[28, 0] = 'Danish' #reassign Danish/Scandinavian to just Danish
letter_diacritic_df = makeNiceDF(letter_diacritic_df)
letter_diacritic_df.loc[['fre', 'por', 'trk'], 'ç'] = 'Ç' #assign Ç to correct langs
letter_diacritic_df.loc['trk', 'ş'] = 'Ş'                 #assign Ş to turkish


#a through h table
a_h_table = soup.find_all('table', class_="wikitable")[4]  
a_h_df = pd.read_html(str(a_h_table))
a_h_df = pd.DataFrame(a_h_df[0])
a_h_df = makeNiceDF(a_h_df)
#print(a_h_df)


#i through o table
i_o_table = soup.find_all('table', class_="wikitable")[5]  
i_o_df = pd.read_html(str(i_o_table))
i_o_df = pd.DataFrame(i_o_df[0])
i_o_df = makeNiceDF(i_o_df)
#print(i_o_df)


#p through z table
p_z_table = soup.find_all('table', class_="wikitable")[6]  
p_z_df = pd.read_html(str(p_z_table))
p_z_df = pd.DataFrame(p_z_df[0])
p_z_df = makeNiceDF(p_z_df)
#print(p_z_df)


#combine each df into one master df with all chars
dfs = [latin_df, special_letters_df, letter_diacritic_df, a_h_df, i_o_df, p_z_df]
char_df = pd.concat(dfs, axis=1)
char_df = char_df.applymap(lambda s: s.lower() if type(s) == str else s) # convert all values not Nan to lowercase
char_df.dropna(axis=1, how='all', inplace=True)


#char_df.to_csv('charTable.csv')#, index=False)
#commented so we don't recreate chartable


df = pd.read_csv('charTable.csv', index_col = 0)


In the .csv file, I manually removed certain chars from some languages that were excessive. Many languages had chars listed on the Wikipedia tables due to loanwords that would never occur in a native name in that language. For instance, English is listed as having Â, Ä, Û, Ü, etc. These and other symbols like them are included to facilitate integrating loanwords and foreign last names. Obviously a last name bearing one of these symbols in their name would not be of English origin.

In [1]:
#LOCAL COPY OF NAMEBOT8 for testing
import sys
import numpy as np
import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents

#add vowel/consonant ending code

def namebot8(name):
    #input char DF
    char_df = pd.read_csv('charTableEdited.csv', index_col = 0)

    #input suffix DF
    suf_df = pd.read_csv('sufTable.csv', index_col = 0)

    #drop eth from all tables when it fails a criterium
    def dropEth(index):
        char_df.drop(index=i, inplace=True)               
        suf_df.drop(index=i, inplace=True)

    name = name.lower() #make lowercase
    name = ''.join(name.split())  #remove whitespace

    c = 0 #counter for current index in name
    for letter in name:
        if (letter not in name[0:c]):  #skip if we already have seen this letter
            for i in char_df.index:
                if char_df.loc[i][letter] != letter:
                    #print("dropped " + i + " due to: " + letter)  #drop eth if it doesn't have letter,
                    dropEth(i)                                    #print little report each time

        c += 1
        #return if char df len = 1



    for c in suf_df.columns:
        if name.endswith(c):
            for i in suf_df.index:
                if pd.isnull(suf_df.loc[i][c]):
                    dropEth(i)
            break
    if len(suf_df.index) == 0:
        return 'no possibilities'
    return(suf_df.index[0])


In [2]:
#TESTING ZONE
import os
import sys
import numpy as np
import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents

swe_df = pd.read_excel('1k/swe_1k.xlsx', index_col = 0)
count = 0
for name in swe_df.index:
    if namebot8(name) != 'swe':               #in the xl files, the names themselves are the indeces
        count+=1
        print(count)



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


bibliography
https://deepbaltic.com/2016/09/23/why-you-will-almost-definitely-have-to-change-your-name-when-speaking-latvian/