# 영문 이름으로 성별 예측

In [1]:
from nltk.corpus import names
import nltk
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\root\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [2]:
# Pair every name in the NLTK names corpus with its label: all entries of
# male.txt first, followed by all entries of female.txt.
labeled_names = [
    (name, label)
    for fileid, label in (('male.txt', '남자'), ('female.txt', '여자'))
    for name in names.words(fileid)
]
labeled_names[:10]

[('Aamir', '남자'),
 ('Aaron', '남자'),
 ('Abbey', '남자'),
 ('Abbie', '남자'),
 ('Abbot', '남자'),
 ('Abbott', '남자'),
 ('Abby', '남자'),
 ('Abdel', '남자'),
 ('Abdul', '남자'),
 ('Abdulkarim', '남자')]

In [3]:
import random

# Shuffle in place so the positional train/test slices taken later mix both
# labels (the list is built as all male names followed by all female names).
# A dedicated, seeded generator makes the ordering -- and therefore the split
# and the reported accuracies -- repeatable after Restart & Run All, without
# touching the global random state.
# NOTE(review): outputs saved in this notebook came from an unseeded run and
# will change the next time the notebook is executed.
random.Random(42).shuffle(labeled_names)
labeled_names[0:10]

[('Fayette', '여자'),
 ('Kingsly', '남자'),
 ('Viole', '여자'),
 ('Edy', '여자'),
 ('Wrennie', '여자'),
 ('Olive', '여자'),
 ('Alma', '여자'),
 ('Sheril', '여자'),
 ('Ichabod', '남자'),
 ('Hanford', '남자')]

In [4]:
def gender_features(word):
    """Return the baseline feature dict for a name: its last letter.

    Args:
        word: The name to featurize.

    Returns:
        dict: ``{'last_letter': <final character of word>}``. An empty
        ``word`` yields an empty string instead of raising ``IndexError``.
    """
    # Slice rather than index so that empty input does not raise.
    return {'last_letter': word[-1:]}

gender_features('Sopi')

{'last_letter': 'i'}

In [5]:
# Turn every (name, label) pair into a (feature dict, label) example.
featuresets = [
    (gender_features(name), label)
    for name, label in labeled_names
]
featuresets[:10]

[({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'y'}, '남자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'y'}, '여자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'l'}, '여자'),
 ({'last_letter': 'd'}, '남자'),
 ({'last_letter': 'd'}, '남자')]

In [6]:
# Hold out the first 2000 shuffled examples for testing; train on the rest.
test_set = featuresets[:2000]
train_set = featuresets[2000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [7]:
# Spot-check the trained classifier on a single example name.
classifier.classify(gender_features('Sephipa'))

'여자'

In [8]:
# List the 5 most informative features; each row shows the likelihood ratio
# between the two labels for that feature value.
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'                여자 : 남자     =     36.3 : 1.0
             last_letter = 'k'                남자 : 여자     =     33.9 : 1.0
             last_letter = 'p'                남자 : 여자     =     17.7 : 1.0
             last_letter = 'f'                남자 : 여자     =     13.4 : 1.0
             last_letter = 'v'                남자 : 여자     =      9.2 : 1.0


In [9]:
# Accuracy of the classifier on the held-out test_set.
print(nltk.classify.accuracy(classifier, test_set))

0.7635


In [10]:
def gender_features2(name):
    """Build a richer, case-insensitive feature dict for a name.

    Args:
        name: The name to featurize.

    Returns:
        dict: ``first_letter`` and ``last_letter`` (lower-cased; empty strings
        when ``name`` is empty), ``length`` (``len(name)``), and one
        ``count(<letter>)`` entry for each letter a-z giving how many times
        that letter occurs in the lower-cased name.
    """
    # Lower-case once up front instead of once per alphabet letter.
    lowered = name.lower()
    features = {
        # Slices instead of indexing so an empty name does not raise.
        "first_letter": name[:1].lower(),
        "last_letter": name[-1:].lower(),
        "length": len(name),
    }
    # Only per-letter counts are emitted (no boolean has(<letter>) feature).
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = lowered.count(letter)
    return features

gender_features2('Joshua')

{'first_letter': 'j',
 'last_letter': 'a',
 'length': 6,
 'count(a)': 1,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 0,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 1,
 'count(t)': 0,
 'count(u)': 1,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0}

In [11]:
# Re-display the first 10 shuffled (name, label) pairs for reference.
labeled_names[0:10]

[('Fayette', '여자'),
 ('Kingsly', '남자'),
 ('Viole', '여자'),
 ('Edy', '여자'),
 ('Wrennie', '여자'),
 ('Olive', '여자'),
 ('Alma', '여자'),
 ('Sheril', '여자'),
 ('Ichabod', '남자'),
 ('Hanford', '남자')]

In [12]:
# Rebuild the examples with the richer gender_features2 feature extractor.
featuresets = [
    (gender_features2(name), label)
    for name, label in labeled_names
]
featuresets[:2]

[({'first_letter': 'f',
   'last_letter': 'e',
   'length': 7,
   'count(a)': 1,
   'count(b)': 0,
   'count(c)': 0,
   'count(d)': 0,
   'count(e)': 2,
   'count(f)': 1,
   'count(g)': 0,
   'count(h)': 0,
   'count(i)': 0,
   'count(j)': 0,
   'count(k)': 0,
   'count(l)': 0,
   'count(m)': 0,
   'count(n)': 0,
   'count(o)': 0,
   'count(p)': 0,
   'count(q)': 0,
   'count(r)': 0,
   'count(s)': 0,
   'count(t)': 2,
   'count(u)': 0,
   'count(v)': 0,
   'count(w)': 0,
   'count(x)': 0,
   'count(y)': 1,
   'count(z)': 0},
  '여자'),
 ({'first_letter': 'k',
   'last_letter': 'y',
   'length': 7,
   'count(a)': 0,
   'count(b)': 0,
   'count(c)': 0,
   'count(d)': 0,
   'count(e)': 0,
   'count(f)': 0,
   'count(g)': 1,
   'count(h)': 0,
   'count(i)': 1,
   'count(j)': 0,
   'count(k)': 1,
   'count(l)': 1,
   'count(m)': 0,
   'count(n)': 1,
   'count(o)': 0,
   'count(p)': 0,
   'count(q)': 0,
   'count(r)': 0,
   'count(s)': 1,
   'count(t)': 0,
   'count(u)': 0,
   'count(v)': 0,


In [13]:
# First 2000 examples are held out for testing, the remainder is used for
# training; then report accuracy on the held-out part.
test_set = featuresets[:2000]
train_set = featuresets[2000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.8005


In [14]:
# Request up to 100 of the most informative features of the current classifier.
classifier.show_most_informative_features(100)

Most Informative Features
             last_letter = 'a'                여자 : 남자     =     36.3 : 1.0
             last_letter = 'k'                남자 : 여자     =     33.9 : 1.0
             last_letter = 'p'                남자 : 여자     =     17.7 : 1.0
             last_letter = 'f'                남자 : 여자     =     13.4 : 1.0
             last_letter = 'v'                남자 : 여자     =      9.2 : 1.0
             last_letter = 'o'                남자 : 여자     =      8.8 : 1.0
             last_letter = 'd'                남자 : 여자     =      8.6 : 1.0
             last_letter = 'm'                남자 : 여자     =      7.9 : 1.0
             last_letter = 'r'                남자 : 여자     =      7.3 : 1.0
                count(v) = 2                  여자 : 남자     =      6.4 : 1.0
             last_letter = 'w'                남자 : 여자     =      5.9 : 1.0
            first_letter = 'w'                남자 : 여자     =      5.2 : 1.0
                count(w) = 2                  남자 : 여자     =      5.1 : 1.0

In [15]:
# Three-way split of the shuffled names: the first 500 are the final test set,
# the next 1000 are the dev-test set used for error analysis, and everything
# after that is training data.
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]

In [16]:
# Featurize each split with the current gender_features, fit on the training
# portion, and score on the dev-test portion (test_set is not scored here).
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
test_set = [(gender_features(name), label) for name, label in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.767


In [17]:
# Peek at the first 20 training examples as (feature dict, label) pairs.
train_set[:20]

[({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 's'}, '남자'),
 ({'last_letter': 'm'}, '남자'),
 ({'last_letter': 'l'}, '여자'),
 ({'last_letter': 'y'}, '여자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'y'}, '남자'),
 ({'last_letter': 'y'}, '남자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'o'}, '남자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'i'}, '여자'),
 ({'last_letter': 'd'}, '남자'),
 ({'last_letter': 'd'}, '남자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'o'}, '남자'),
 ({'last_letter': 'd'}, '남자')]

In [18]:
# Collect every dev-test name the classifier gets wrong as
# (correct label, predicted label, name).
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue  # correctly classified; nothing to record
    errors.append((tag, guess, name))

In [19]:
# Print the misclassified names in sorted order, one left-aligned row each.
for tag, guess, name in sorted(errors):
    line = 'correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name)
    print(line)

correct=남자       guess=여자       name=Abbie                         
correct=남자       guess=여자       name=Aditya                        
correct=남자       guess=여자       name=Aguste                        
correct=남자       guess=여자       name=Ajay                          
correct=남자       guess=여자       name=Allie                         
correct=남자       guess=여자       name=Ambrose                       
correct=남자       guess=여자       name=Arne                          
correct=남자       guess=여자       name=Artie                         
correct=남자       guess=여자       name=Bailey                        
correct=남자       guess=여자       name=Barnabe                       
correct=남자       guess=여자       name=Barney                        
correct=남자       guess=여자       name=Barri                         
correct=남자       guess=여자       name=Barth                         
correct=남자       guess=여자       name=Bernie                        
correct=남자       guess=여자       name=Bjorne     

In [20]:
def gender_features(word):
    """Return suffix features for a name.

    Note: this redefines ``gender_features``; cells executed after this one
    use these suffix features rather than the single last letter.

    Args:
        word: The name to featurize.

    Returns:
        dict: ``suffix1`` is the last character of ``word`` and ``suffix2``
        the last two characters (fewer when ``word`` is shorter).
    """
    features = {}
    features['suffix1'] = word[-1:]
    features['suffix2'] = word[-2:]
    return features

In [21]:
# Re-featurize the training and dev-test names with the current
# gender_features, retrain, and report dev-test accuracy.
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.785


In [22]:
# Show the first 20 raw (name, label) pairs of the training split.
train_names[:20]

[('Robbie', '여자'),
 ('Onlea', '여자'),
 ('Sturgis', '남자'),
 ('Liam', '남자'),
 ('Murial', '여자'),
 ('Clemmy', '여자'),
 ('Brynne', '여자'),
 ('Johny', '남자'),
 ('Dewey', '남자'),
 ('Catarina', '여자'),
 ('Alphonso', '남자'),
 ('Elnore', '여자'),
 ('Elisa', '여자'),
 ('Tiphani', '여자'),
 ('Lenard', '남자'),
 ('Ellwood', '남자'),
 ('Selina', '여자'),
 ('Blondie', '여자'),
 ('Gustavo', '남자'),
 ('Alfred', '남자')]