forked from ferhatelmas/sexmachine
-
Notifications
You must be signed in to change notification settings - Fork 30
/
detector.py
101 lines (85 loc) · 4.06 KB
/
detector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os.path
import codecs
class NoCountryError(Exception):
"""Raised when non-supported country is queried"""
pass
class Detector:
"""Get gender by first name"""
COUNTRIES = u"""great_britain ireland usa italy malta portugal spain france
belgium luxembourg the_netherlands east_frisia germany austria
swiss iceland denmark norway sweden finland estonia latvia
lithuania poland czech_republic slovakia hungary romania
bulgaria bosniaand croatia kosovo macedonia montenegro serbia
slovenia albania greece russia belarus moldova ukraine armenia
azerbaijan georgia the_stans turkey arabia israel china india
japan korea vietnam other_countries
""".split()
def __init__(self,
case_sensitive=True):
"""Creates a detector parsing given data file"""
self.case_sensitive = case_sensitive
self._parse(os.path.join(os.path.dirname(__file__), "data/nam_dict.txt"))
def _parse(self, filename):
"""Opens data file and for each line, calls _eat_name_line"""
self.names = {}
with codecs.open(filename, encoding="utf-8") as f:
for line in f:
self._eat_name_line(line.strip())
def _eat_name_line(self, line):
"""Parses one line of data file"""
if line[0] not in "#=":
parts = line.split()
country_values = line[30:-1]
name = parts[1]
if not self.case_sensitive:
name = name.lower()
if parts[0] == "M":
self._set(name, u"male", country_values)
elif parts[0] == "1M" or parts[0] == "?M":
self._set(name, u"mostly_male", country_values)
elif parts[0] == "F":
self._set(name, u"female", country_values)
elif parts[0] == "1F" or parts[0] == "?F":
self._set(name, u"mostly_female", country_values)
elif parts[0] == "?":
self._set(name, u"andy", country_values)
else:
raise "Not sure what to do with a sex of %s" % parts[0]
def _set(self, name, gender, country_values):
"""Sets gender and relevant country values for names dictionary of detector"""
if '+' in name:
for replacement in ['', ' ', '-']:
self._set(name.replace('+', replacement), gender, country_values)
else:
if name not in self.names:
self.names[name] = {}
self.names[name][gender] = country_values
def _most_popular_gender(self, name, counter):
"""Finds the most popular gender for the given name counting by given counter"""
if name not in self.names:
return u"unknown"
max_count, max_tie = (0, 0)
best = list(self.names[name].keys())[0]
for gender, country_values in list(self.names[name].items()):
count, tie = counter(country_values)
if count > max_count or (count == max_count and tie > max_tie):
max_count, max_tie, best = count, tie, gender
return best if max_count > 0 else u"andy"
def get_gender(self, name, country=None):
"""Returns best gender for the given name and country pair"""
if not self.case_sensitive:
name = name.lower()
if name not in self.names:
return u"unknown"
elif not country:
def counter(country_values):
country_values = list(map(ord, country_values.replace(" ", "")))
return (len(country_values),
sum([c > 64 and c-55 or c-48 for c in country_values]))
return self._most_popular_gender(name, counter)
elif country in self.__class__.COUNTRIES:
index = self.__class__.COUNTRIES.index(country)
counter = lambda e: (ord(e[index])-32, 0)
return self._most_popular_gender(name, counter)
else:
raise NoCountryError("No such country: %s" % country)