In [1]:
import nltk, random
from nltk.corpus import names
from _collections import defaultdict
from nltk.probability import FreqDist
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + \
[(name, 'female') for name in names.words('female.txt')])
random.seed(55)
random.shuffle(labeled_names)
print(labeled_names[0:15])

[('Ethelyn', 'female'), ('Nance', 'female'), ('Sherwynd', 'male'), ('Diahann', 'female'), ('Colly', 'female'), ('Florentia', 'female'), ('Dmitri', 'male'), ('Noah', 'male'), ('Luise', 'female'), ('Clarke', 'male'), ('Patin', 'male'), ('Rubia', 'female'), ('Dionis', 'female'), ('Shel', 'female'), ('Marysa', 'female')]


In [2]:
def gender_features(word):
    gf_f=['0',word[0]]
    gf_l=['-1',word[-1]]
    gf_m=[]
    gf_m=['m',word[1:3]]
    gf_f.extend(gf_m)
    gf_f.extend(gf_l)
    return gf_f

In [3]:
print("gender_features('Avengers'): ",gender_features('Avengers'))
print("gender_features('Batman'): ",gender_features('Batman'))
print("gender_features('Scooby'): ",gender_features('Scooby'))

gender_features('Avengers'):  ['0', 'A', 'm', 've', '-1', 's']
gender_features('Batman'):  ['0', 'B', 'm', 'at', '-1', 'n']
gender_features('Scooby'):  ['0', 'S', 'm', 'co', '-1', 'y']


In [4]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[1000:], featuresets[:1000]
print(train_set[0:10]) 

[(['0', 'C', 'm', 'el', '-1', 'a'], 'female'), (['0', 'S', 'm', 'ti', '-1', 'g'], 'male'), (['0', 'I', 'm', 'll', '-1', 'a'], 'female'), (['0', 'C', 'm', 'al', '-1', 'a'], 'female'), (['0', 'A', 'm', 'li', '-1', 'e'], 'female'), (['0', 'C', 'm', 'ar', '-1', 'a'], 'female'), (['0', 'Y', 'm', 'eh', '-1', 'i'], 'male'), (['0', 'L', 'm', 'at', '-1', 'a'], 'female'), (['0', 'C', 'm', 'yb', '-1', 'l'], 'female'), (['0', 'B', 'm', 'yr', '-1', 'n'], 'male')]


In [5]:
trSamplesSize = len(train_set)
print('trSamplesSize ', len(train_set))
train_set_m = [x for x in train_set if x[1]=='male']
trainMale = len(train_set_m)
print('trainMale ', len(train_set_m))
P_male = trainMale/trSamplesSize
P_fem= 1-P_male
print('P(y) probabilities, y male, y female: ', P_male,P_fem)

trSamplesSize  6944
trainMale  2571
P(y) probabilities, y male, y female:  0.3702476958525346 0.6297523041474654


In [6]:
featuresetsSquished = [(''.join(gender_features(n)), gender) \
for (n, gender) in labeled_names]
train_sq, test_sq = featuresetsSquished[1000:], featuresetsSquished[:1000]
print(len(train_sq),train_sq[0:10],'\n')


6944 [('0Cmel-1a', 'female'), ('0Smti-1g', 'male'), ('0Imll-1a', 'female'), ('0Cmal-1a', 'female'), ('0Amli-1e', 'female'), ('0Cmar-1a', 'female'), ('0Ymeh-1i', 'male'), ('0Lmat-1a', 'female'), ('0Cmyb-1l', 'female'), ('0Bmyr-1n', 'male')] 



In [7]:
tr_sq_m = [f for f in train_sq if f[1]=='male']
tr_sq_f = [f for f in train_sq if f[1]=='female']
print(len(tr_sq_m),tr_sq_m[0:10])
print()
print(len(tr_sq_f),tr_sq_f[0:10])

2571 [('0Smti-1g', 'male'), ('0Ymeh-1i', 'male'), ('0Bmyr-1n', 'male'), ('0Mmat-1s', 'male'), ('0Smca-1e', 'male'), ('0Pmat-1k', 'male'), ('0Gmus-1v', 'male'), ('0Dmou-1g', 'male'), ('0Rmic-1y', 'male'), ('0Jmoh-1k', 'male')]

4373 [('0Cmel-1a', 'female'), ('0Imll-1a', 'female'), ('0Cmal-1a', 'female'), ('0Amli-1e', 'female'), ('0Cmar-1a', 'female'), ('0Lmat-1a', 'female'), ('0Cmyb-1l', 'female'), ('0Bmli-1y', 'female'), ('0Dmaf-1y', 'female'), ('0Emir-1a', 'female')]


In [8]:
from collections import Counter
tr_counter=Counter(train_sq)
trm_counter=Counter(tr_sq_m)
trf_counter=Counter(tr_sq_f)
print('type(tr_counter)',type(tr_counter))
print('counter: ',list(tr_counter)[0:10])
print('counter keys: ',list(tr_counter.keys())[0:10])

type(tr_counter) <class 'collections.Counter'>
counter:  [('0Nmat-1n', 'male'), ('0Dmes-1i', 'female'), ('0Rmud-1o', 'male'), ('0Lmia-1a', 'female'), ('0Emun-1e', 'female'), ('0Gmol-1e', 'female'), ('0Kmer-1e', 'female'), ('0Amnn-1e', 'female'), ('0Rmoc-1y', 'male'), ('0Xmev-1r', 'male')]
counter keys:  [('0Nmat-1n', 'male'), ('0Dmes-1i', 'female'), ('0Rmud-1o', 'male'), ('0Lmia-1a', 'female'), ('0Emun-1e', 'female'), ('0Gmol-1e', 'female'), ('0Kmer-1e', 'female'), ('0Amnn-1e', 'female'), ('0Rmoc-1y', 'male'), ('0Xmev-1r', 'male')]


In [9]:
f = open('tr_counterdis', 'w')
for word, count in tr_counter.most_common(None):
    wc = word[0]+' '+word[1]+'\t'+str(count)+'\n'
    f.write(wc)
f.close()


f = open('trf_counterdis', 'w')
for word, count in trf_counter.most_common(None):
    wc = word[0]+' '+word[1]+'\t'+str(count)+'\n'
    f.write(wc)
f.close()

f = open('trm_counterdis', 'w')
for word, count in trm_counter.most_common(None):
    wc = word[0]+' '+word[1]+'\t'+str(count)+'\n'
    f.write(wc)
f.close()

In [10]:
for k in list(tr_counter.keys())[0:10]:
#print(k[0],k[1],tr_counter[k])
    if k[1]=='male':
        nsex = 'female'
    else:
        nsex='male'
print(nsex,tr_counter[k])


female 1


In [11]:
nk=(k[0],nsex)
print(nk[0],nk[1],tr_counter[nk])
if tr_counter[nk]>0:
    print(k[0],k[1],tr_counter[k])
    print(nk[0],nk[1],tr_counter[nk])
    print()


0Xmev-1r female 0


In [12]:
fd= FreqDist(train_sq)
fd.plot(40)


In [13]:
total = float(sum(tr_counter.values()))
prob_tr= [(key[0], count / total) for key, count in tr_counter.items()]
f = open('tr_probs', 'w')
for word, count in prob_tr:
    wc = word+'\t '+str(count)+'\n'
    print(wc)
    f.write(wc)
f.close()


0Nmat-1n	 0.00014400921658986175

0Dmes-1i	 0.00014400921658986175

0Rmud-1o	 0.00014400921658986175

0Lmia-1a	 0.0002880184331797235

0Emun-1e	 0.00014400921658986175

0Gmol-1e	 0.00014400921658986175

0Kmer-1e	 0.0002880184331797235

0Amnn-1e	 0.002880184331797235

0Rmoc-1y	 0.00014400921658986175

0Xmev-1r	 0.00014400921658986175

0Amma-1i	 0.00014400921658986175

0Lmeo-1d	 0.0007200460829493088

0Rmup-1a	 0.00014400921658986175

0Jmod-1e	 0.0002880184331797235

0Hman-1h	 0.00014400921658986175

0Fmer-1s	 0.00014400921658986175

0Hmow-1d	 0.00014400921658986175

0Gmlo-1i	 0.00014400921658986175

0Lmaw-1n	 0.00014400921658986175

0Cmle-1s	 0.0002880184331797235

0Mmen-1d	 0.00014400921658986175

0Rmho-1y	 0.00014400921658986175

0Pmat-1o	 0.00014400921658986175

0Smiv-1t	 0.00014400921658986175

0Bmer-1i	 0.0002880184331797235

0Rmup-1o	 0.00014400921658986175

0Mmai-1y	 0.00014400921658986175

0Cmla-1l	 0.00014400921658986175

0Amne-1e	 0.00014400921658986175

0Imsi-1a	 0.0001440092

In [14]:
total = float(sum(trm_counter.values()))
prob_trm= {(key[0], count / total) for key,count in trm_counter.most_common(None)}
f = open('trm_probs', 'w')
for word, count in prob_trm:
    wc = word+'\t '+str(count)+'\n'
    print(wc)
    f.write(wc)
f.close()


0Cmal-1n	 0.00038895371450797355

0Omle-1g	 0.00038895371450797355

0Mmor-1s	 0.00038895371450797355

0Imke-1e	 0.00038895371450797355

0Imvo-1r	 0.00038895371450797355

0Gmuy-1y	 0.00038895371450797355

0Nmee-1l	 0.00038895371450797355

0Kmen-1l	 0.0007779074290159471

0Hmad-1y	 0.00038895371450797355

0Emd-1d	 0.00038895371450797355

0Mmik-1l	 0.0007779074290159471

0Rmon-1y	 0.00038895371450797355

0Nmap-1n	 0.00038895371450797355

0Wmhi-1n	 0.00038895371450797355

0Lmia-1m	 0.00038895371450797355

0Wmol-1m	 0.00038895371450797355

0Pmac-1o	 0.00038895371450797355

0Emv-1v	 0.00038895371450797355

0Zmeu-1s	 0.00038895371450797355

0Hmom-1r	 0.00038895371450797355

0Bmen-1e	 0.00038895371450797355

0Omri-1n	 0.0007779074290159471

0Imgn-1e	 0.00038895371450797355

0Vmic-1r	 0.00038895371450797355

0Cmha-1s	 0.0011668611435239206

0Hmas-1n	 0.00038895371450797355

0Amng-1s	 0.00038895371450797355

0Smhe-1a	 0.00038895371450797355

0Tmed-1e	 0.0007779074290159471

0Dmur-1d	 0.000777907

In [15]:
total = float(sum(trf_counter.values()))
prob_trf= {(key[0],count / total) for key, count in trf_counter.items()}
f = open('trf_probs', 'w')
for word, count in prob_trf:
    wc = word +'\t '+str(count)+'\n'
    print(wc)
    f.write(wc)
f.close()


0Amlb-1a	 0.000686027898467871

0Fmer-1n	 0.000228675966155957

0Bmon-1y	 0.000228675966155957

0Nmor-1h	 0.000228675966155957

0Cmos-1e	 0.000228675966155957

0Fmor-1a	 0.000228675966155957

0Rmho-1y	 0.000228675966155957

0Vmik-1y	 0.000228675966155957

0Bmen-1y	 0.000228675966155957

0Dmyn-1a	 0.000228675966155957

0Cmri-1l	 0.000457351932311914

0Gmig-1i	 0.000228675966155957

0Cmhe-1h	 0.000228675966155957

0Dmie-1a	 0.000228675966155957

0Kmen-1e	 0.000228675966155957

0Smha-1n	 0.002058083695403613

0Wmen-1y	 0.000228675966155957

0Omli-1a	 0.000228675966155957

0Ambi-1l	 0.000457351932311914

0Cmhe-1i	 0.000228675966155957

0Dmar-1b	 0.000228675966155957

0Nmad-1e	 0.000228675966155957

0Emng-1a	 0.000228675966155957

0Fmri-1e	 0.000228675966155957

0Imda-1a	 0.000457351932311914

0Gmue-1e	 0.000228675966155957

0Bmil-1e	 0.000228675966155957

0Gmly-1n	 0.000228675966155957

0Dmaw-1a	 0.000228675966155957

0Lmee-1e	 0.000457351932311914

0Bmli-1s	 0.000228675966155957

0Dmau-1e

In [25]:
examples = ['Aravind','Shashmi','Atchaya', 'Nidheesh', 'Monica', 'Rachell', 'Xhxin']
for ex in examples:
    print("gender_features(",ex, "): ",gender_features(ex))

gender_features( Aravind ):  ['0', 'A', 'm', 'ra', '-1', 'd']
gender_features( Shashmi ):  ['0', 'S', 'm', 'ha', '-1', 'i']
gender_features( Atchaya ):  ['0', 'A', 'm', 'tc', '-1', 'a']
gender_features( Nidheesh ):  ['0', 'N', 'm', 'id', '-1', 'h']
gender_features( Monica ):  ['0', 'M', 'm', 'on', '-1', 'a']
gender_features( Rachell ):  ['0', 'R', 'm', 'ac', '-1', 'l']
gender_features( Xhxin ):  ['0', 'X', 'm', 'hx', '-1', 'n']


In [26]:
exsf_sq = [''.join(gender_features(n)) for n in examples]
for ex in exsf_sq:
    print("sq_features(",ex, "): ",ex)


sq_features( 0Amra-1d ):  0Amra-1d
sq_features( 0Smha-1i ):  0Smha-1i
sq_features( 0Amtc-1a ):  0Amtc-1a
sq_features( 0Nmid-1h ):  0Nmid-1h
sq_features( 0Mmon-1a ):  0Mmon-1a
sq_features( 0Rmac-1l ):  0Rmac-1l
sq_features( 0Xmhx-1n ):  0Xmhx-1n


In [27]:
examples_fsq = [[n,''.join(gender_features(n))] for n in examples]
for ex in examples_fsq:
    print(ex)


['Aravind', '0Amra-1d']
['Shashmi', '0Smha-1i']
['Atchaya', '0Amtc-1a']
['Nidheesh', '0Nmid-1h']
['Monica', '0Mmon-1a']
['Rachell', '0Rmac-1l']
['Xhxin', '0Xmhx-1n']


In [28]:
for [name, f] in examples_fsq:
    print(name,'as female', end= ' ')
    print('feature prob',f, (dict(prob_trf)).get(f))
    print('\n')


Aravind as female feature prob 0Amra-1d None


Shashmi as female feature prob 0Smha-1i 0.000686027898467871


Atchaya as female feature prob 0Amtc-1a None


Nidheesh as female feature prob 0Nmid-1h None


Monica as female feature prob 0Mmon-1a 0.000686027898467871


Rachell as female feature prob 0Rmac-1l 0.000457351932311914


Xhxin as female feature prob 0Xmhx-1n None




In [29]:
for [name, f] in examples_fsq:
    print(name,'as male', end= ' ')
    print('feature prob',f, (dict(prob_trm)).get(f))
    

Aravind as male feature prob 0Amra-1d None
Shashmi as male feature prob 0Smha-1i None
Atchaya as male feature prob 0Amtc-1a None
Nidheesh as male feature prob 0Nmid-1h None
Monica as male feature prob 0Mmon-1a None
Rachell as male feature prob 0Rmac-1l None
Xhxin as male feature prob 0Xmhx-1n None


In [30]:
print('P_fem , P_male ', P_fem, P_male)


P_fem , P_male  0.6297523041474654 0.3702476958525346


In [31]:
type(prob_trf)

set

In [32]:
trf_dict = dict(prob_trf)
for [name, f] in examples_fsq:
    f_prob=trf_dict.get(f)
    if f_prob == None:
        f_prob=0
        print(name,'as female', f_prob, end= ' ')
    if f_prob == None:
        f_prob=0
        print('feature prob',f, f_prob, 'fem_probability ',
        round(P_fem * f_prob,6))

Aravind as female 0 Atchaya as female 0 Nidheesh as female 0 Xhxin as female 0 

In [35]:
trm_dict = dict(prob_trm)
for [name, f] in examples_fsq:
    f_prob=trm_dict.get(f)
    if f_prob == None:
        f_prob=0
        print(name,'as male', f_prob, end= ' ')
        print()
    if f_prob == None:
        f_prob=0
print()
print('feature prob',f, f_prob, 'male_probability ', round(P_male * f_prob,6))

Aravind as male 0 
Shashmi as male 0 
Atchaya as male 0 
Nidheesh as male 0 
Monica as male 0 
Rachell as male 0 
Xhxin as male 0 

feature prob 0Xmhx-1n 0 male_probability  0.0
