In [1]:
import nltk, random
from nltk.corpus import names
from _collections import defaultdict
from nltk.probability import FreqDist
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + \
[(name, 'female') for name in names.words('female.txt')])
random.seed(21)
random.shuffle(labeled_names)
print(labeled_names[0:10])

[('Dorette', 'female'), ('Michaelina', 'female'), ('Derrol', 'male'), ('Rozella', 'female'), ('Van', 'male'), ('Amaleta', 'female'), ('Ephram', 'male'), ('Allis', 'female'), ('Arden', 'female'), ('Jana', 'female')]


In [2]:
def gender_features(word):
    gf_f=['0',word[0]]
    gf_l=['-1',word[-1]]
    gf_m=[]
    gf_m=['m',word[1:2]]
    gf_f.extend(gf_m)
    gf_f.extend(gf_l)
    return gf_f

In [3]:
print("gender_features('Shrek'): ",gender_features('Sriharish'))
print("gender_features('Neo'): ",gender_features('Trisha'))
print("gender_features('Matrix'): ",gender_features('Scooby'))

gender_features('Shrek'):  ['0', 'S', 'm', 'r', '-1', 'h']
gender_features('Neo'):  ['0', 'T', 'm', 'r', '-1', 'a']
gender_features('Matrix'):  ['0', 'S', 'm', 'c', '-1', 'y']


In [4]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[1000:], featuresets[:1000]
print(train_set[0:10]) 

[(['0', 'P', 'm', 'a', '-1', 'e'], 'female'), (['0', 'H', 'm', 'e', '-1', 'e'], 'female'), (['0', 'M', 'm', 'a', '-1', 'o'], 'female'), (['0', 'B', 'm', 'e', '-1', 'e'], 'female'), (['0', 'A', 'm', 'u', '-1', 'a'], 'female'), (['0', 'A', 'm', 'l', '-1', 'h'], 'male'), (['0', 'D', 'm', 'a', '-1', 'y'], 'female'), (['0', 'L', 'm', 'e', '-1', 'i'], 'male'), (['0', 'C', 'm', 'a', '-1', 'a'], 'female'), (['0', 'H', 'm', 'a', '-1', 't'], 'female')]


In [5]:
trSamplesSize = len(train_set)
print('trSamplesSize ', len(train_set))
train_set_m = [x for x in train_set if x[1]=='male']
trainMale = len(train_set_m)
print('trainMale ', len(train_set_m))
P_male = trainMale/trSamplesSize
P_fem= 1-P_male
print('P(y) probabilities, y male, y female: ', P_male,P_fem)

trSamplesSize  6944
trainMale  2578
P(y) probabilities, y male, y female:  0.3712557603686636 0.6287442396313364


In [6]:
featuresetsSquished = [(''.join(gender_features(n)), gender) \
for (n, gender) in labeled_names]
train_sq, test_sq = featuresetsSquished[1000:], featuresetsSquished[:1000]
print(len(train_sq),train_sq[0:10],'\n')


6944 [('0Pma-1e', 'female'), ('0Hme-1e', 'female'), ('0Mma-1o', 'female'), ('0Bme-1e', 'female'), ('0Amu-1a', 'female'), ('0Aml-1h', 'male'), ('0Dma-1y', 'female'), ('0Lme-1i', 'male'), ('0Cma-1a', 'female'), ('0Hma-1t', 'female')] 



In [7]:
tr_sq_m = [f for f in train_sq if f[1]=='male']
tr_sq_f = [f for f in train_sq if f[1]=='female']
print(len(tr_sq_m),tr_sq_m[0:10])
print()
print(len(tr_sq_f),tr_sq_f[0:10])

2578 [('0Aml-1h', 'male'), ('0Lme-1i', 'male'), ('0Amm-1i', 'male'), ('0Lmo-1r', 'male'), ('0Bmo-1s', 'male'), ('0Smi-1e', 'male'), ('0Amr-1o', 'male'), ('0Hma-1d', 'male'), ('0Wmi-1n', 'male'), ('0Fmi-1e', 'male')]

4366 [('0Pma-1e', 'female'), ('0Hme-1e', 'female'), ('0Mma-1o', 'female'), ('0Bme-1e', 'female'), ('0Amu-1a', 'female'), ('0Dma-1y', 'female'), ('0Cma-1a', 'female'), ('0Hma-1t', 'female'), ('0Jme-1a', 'female'), ('0Lma-1y', 'female')]


In [8]:
from collections import Counter
tr_counter=Counter(train_sq)
trm_counter=Counter(tr_sq_m)
trf_counter=Counter(tr_sq_f)
print('type(tr_counter)',type(tr_counter))
print('counter: ',list(tr_counter)[0:10])
print('counter keys: ',list(tr_counter.keys())[0:10])

type(tr_counter) <class 'collections.Counter'>
counter:  [('0Smh-1i', 'female'), ('0Rme-1y', 'male'), ('0Yma-1y', 'male'), ('0Emt-1i', 'female'), ('0Tma-1h', 'female'), ('0Fml-1d', 'male'), ('0Amm-1e', 'female'), ('0Rme-1o', 'male'), ('0Jmo-1o', 'female'), ('0Yme-1t', 'female')]
counter keys:  [('0Smh-1i', 'female'), ('0Rme-1y', 'male'), ('0Yma-1y', 'male'), ('0Emt-1i', 'female'), ('0Tma-1h', 'female'), ('0Fml-1d', 'male'), ('0Amm-1e', 'female'), ('0Rme-1o', 'male'), ('0Jmo-1o', 'female'), ('0Yme-1t', 'female')]


In [9]:
f = open('tr_counterdis', 'w')
for word, count in tr_counter.most_common(None):
    wc = word[0]+' '+word[1]+'\t'+str(count)+'\n'
    f.write(wc)
f.close()


f = open('trf_counterdis', 'w')
for word, count in trf_counter.most_common(None):
    wc = word[0]+' '+word[1]+'\t'+str(count)+'\n'
    f.write(wc)
f.close()

f = open('trm_counterdis', 'w')
for word, count in trm_counter.most_common(None):
    wc = word[0]+' '+word[1]+'\t'+str(count)+'\n'
    f.write(wc)
f.close()

In [10]:
for k in list(tr_counter.keys())[0:10]:
#print(k[0],k[1],tr_counter[k])
    if k[1]=='male':
        nsex = 'female'
    else:
        nsex='male'
print(nsex,tr_counter[k])
nk=(k[0],nsex)
print(nk[0],nk[1],tr_counter[nk])
if tr_counter[nk]>0:
    print(k[0],k[1],tr_counter[k])
    print(nk[0],nk[1],tr_counter[nk])
    print()


male 1
0Yme-1t male 0


In [11]:
fd= FreqDist(train_sq)
fd.plot(40)


In [12]:
total = float(sum(tr_counter.values()))
prob_tr= [(key[0], count / total) for key, count in tr_counter.items()]
f = open('tr_probs', 'w')
for word, count in prob_tr:
    wc = word+'\t '+str(count)+'\n'
    print(wc)
    f.write(wc)
f.close()


0Smh-1i	 0.0007200460829493088

0Rme-1y	 0.0002880184331797235

0Yma-1y	 0.0002880184331797235

0Emt-1i	 0.00014400921658986175

0Tma-1h	 0.0004320276497695853

0Fml-1d	 0.00014400921658986175

0Amm-1e	 0.0012960829493087558

0Rme-1o	 0.0002880184331797235

0Jmo-1o	 0.0002880184331797235

0Yme-1t	 0.00014400921658986175

0Emr-1s	 0.0002880184331797235

0Rmo-1d	 0.0007200460829493088

0Rmu-1s	 0.0002880184331797235

0Jmo-1a	 0.0018721198156682027

0Kmr-1e	 0.0007200460829493088

0Vmi-1i	 0.000576036866359447

0Dmr-1u	 0.00014400921658986175

0Rmi-1c	 0.00014400921658986175

0Bmo-1t	 0.00014400921658986175

0Mmo-1y	 0.0008640552995391706

0Jmo-1e	 0.000576036866359447

0Mma-1u	 0.00014400921658986175

0Rmy-1a	 0.00014400921658986175

0Hme-1k	 0.0002880184331797235

0Wme-1y	 0.00014400921658986175

0Mma-1v	 0.00014400921658986175

0Dma-1y	 0.0012960829493087558

0Bmu-1l	 0.00014400921658986175

0Jma-1e	 0.000576036866359447

0Mmi-1i	 0.0008640552995391706

0Emn-1d	 0.00014400921658986175


In [13]:
total = float(sum(trm_counter.values()))
prob_trm= {(key[0], count / total) for key,count in trm_counter.most_common(None)}
f = open('trm_probs', 'w')
for word, count in prob_trm:
    wc = word+'\t '+str(count)+'\n'
    print(wc)
    f.write(wc)
f.close()


0Jmo-1y	 0.0023273855702094647

0Pmr-1n	 0.0007757951900698216

0Dmo-1g	 0.0003878975950349108

0Dmo-1c	 0.0007757951900698216

0Dmu-1t	 0.0003878975950349108

0Ema-1e	 0.0003878975950349108

0Lmo-1e	 0.001939487975174554

0Pmi-1n	 0.0003878975950349108

0Tma-1t	 0.0015515903801396431

0Vma-1r	 0.0003878975950349108

0Imr-1e	 0.0003878975950349108

0Hmi-1l	 0.0011636927851047323

0Wmo-1g	 0.0007757951900698216

0Hmi-1n	 0.0003878975950349108

0Tmr-1e	 0.0011636927851047323

0Wmi-1y	 0.0007757951900698216

0Jme-1b	 0.0003878975950349108

0Cmy-1s	 0.0007757951900698216

0Sma-1a	 0.0007757951900698216

0Lmi-1y	 0.0011636927851047323

0Gme-1g	 0.0003878975950349108

0Fme-1n	 0.0003878975950349108

0Cmy-1e	 0.0003878975950349108

0Amr-1e	 0.0023273855702094647

0Fml-1t	 0.0003878975950349108

0Eml-1h	 0.0007757951900698216

0Dmi-1h	 0.0003878975950349108

0Cml-1f	 0.0003878975950349108

0Dma-1o	 0.0003878975950349108

0Fme-1s	 0.0003878975950349108

0Dme-1s	 0.0011636927851047323

0Gma-1a	 

In [14]:
total = float(sum(trf_counter.values()))
prob_trf= {(key[0],count / total) for key, count in trf_counter.items()}
f = open('trf_probs', 'w')
for word, count in prob_trf:
    wc = word +'\t '+str(count)+'\n'
    print(wc)
    f.write(wc)
f.close()


0Tma-1i	 0.0009161704076958314

0Wmi-1t	 0.00022904260192395785

0Wmh-1y	 0.00022904260192395785

0Kma-1h	 0.0009161704076958314

0Tmh-1n	 0.00022904260192395785

0Nme-1e	 0.001603298213467705

0Emv-1n	 0.0006871278057718735

0Omp-1l	 0.00022904260192395785

0Vmi-1n	 0.0011452130096197893

0Cmh-1e	 0.0057260650480989465

0Wme-1h	 0.00022904260192395785

0Fmr-1y	 0.0009161704076958314

0Jma-1i	 0.0006871278057718735

0Emf-1e	 0.00022904260192395785

0Omc-1a	 0.00022904260192395785

0Amn-1s	 0.00022904260192395785

0Dmo-1l	 0.00022904260192395785

0Gma-1s	 0.00022904260192395785

0Emd-1a	 0.001374255611543747

0Fmr-1d	 0.00022904260192395785

0Lmo-1n	 0.0011452130096197893

0Kme-1a	 0.0018323408153916628

0Cma-1a	 0.011223087494273936

0Lmu-1e	 0.001603298213467705

0Rmi-1a	 0.001603298213467705

0Mmo-1h	 0.00022904260192395785

0Lmi-1l	 0.0006871278057718735

0Rmy-1n	 0.00022904260192395785

0Umr-1e	 0.00022904260192395785

0Bme-1a	 0.006184150251946862

0Mmu-1n	 0.00022904260192395785


In [23]:
examples = ['Dharshan','Allen','Dicaprio', 'Davinci', 'Matrix', 'Harry', 'Hermonie']
for ex in examples:
    print("gender_features(",ex, "): ",gender_features(ex))

gender_features( Dharshan ):  ['0', 'D', 'm', 'h', '-1', 'n']
gender_features( Allen ):  ['0', 'A', 'm', 'l', '-1', 'n']
gender_features( Dicaprio ):  ['0', 'D', 'm', 'i', '-1', 'o']
gender_features( Davinci ):  ['0', 'D', 'm', 'a', '-1', 'i']
gender_features( Matrix ):  ['0', 'M', 'm', 'a', '-1', 'x']
gender_features( Harry ):  ['0', 'H', 'm', 'a', '-1', 'y']
gender_features( Hermonie ):  ['0', 'H', 'm', 'e', '-1', 'e']


In [24]:
exsf_sq = [''.join(gender_features(n)) for n in examples]
for ex in exsf_sq:
    print("sq_features(",ex, "): ",ex)


sq_features( 0Dmh-1n ):  0Dmh-1n
sq_features( 0Aml-1n ):  0Aml-1n
sq_features( 0Dmi-1o ):  0Dmi-1o
sq_features( 0Dma-1i ):  0Dma-1i
sq_features( 0Mma-1x ):  0Mma-1x
sq_features( 0Hma-1y ):  0Hma-1y
sq_features( 0Hme-1e ):  0Hme-1e


In [25]:
examples_fsq = [[n,''.join(gender_features(n))] for n in examples]
for ex in examples_fsq:
    print(ex)


['Dharshan', '0Dmh-1n']
['Allen', '0Aml-1n']
['Dicaprio', '0Dmi-1o']
['Davinci', '0Dma-1i']
['Matrix', '0Mma-1x']
['Harry', '0Hma-1y']
['Hermonie', '0Hme-1e']


In [28]:
for [name, f] in examples_fsq:
    print(name,'as female', end= ' ')
    print('feature prob',f, (dict(prob_trf)).get(f))


Dharshan as female feature prob 0Dmh-1n None
Allen as female feature prob 0Aml-1n 0.002061383417315621
Dicaprio as female feature prob 0Dmi-1o 0.00022904260192395785
Davinci as female feature prob 0Dma-1i 0.001603298213467705
Matrix as female feature prob 0Mma-1x 0.0004580852038479157
Harry as female feature prob 0Hma-1y 0.0022904260192395786
Hermonie as female feature prob 0Hme-1e 0.002977553825011452


In [29]:
for [name, f] in examples_fsq:
    print(name,'as male', end= ' ')
    print('feature prob',f, (dict(prob_trm)).get(f))
    

Dharshan as male feature prob 0Dmh-1n None
Allen as male feature prob 0Aml-1n 0.00504266873545384
Dicaprio as male feature prob 0Dmi-1o 0.0007757951900698216
Davinci as male feature prob 0Dma-1i 0.0003878975950349108
Matrix as male feature prob 0Mma-1x 0.0003878975950349108
Harry as male feature prob 0Hma-1y 0.0027152831652443757
Hermonie as male feature prob 0Hme-1e 0.0027152831652443757


In [30]:
print('P_fem , P_male ', P_fem, P_male)


P_fem , P_male  0.6287442396313364 0.3712557603686636


In [31]:
type(prob_trf)

set

In [45]:
trf_dict = dict(prob_trf)
for [name, f] in examples_fsq:
    f_prob=trf_dict.get(f)
    if f_prob == None:
        f_prob=0
    print(name,'as female', f_prob, end= ' ')
    print()
    if f_prob == None:
        f_prob=0
print()
print('feature prob',f, f_prob, 'fem_probability ',
round(P_fem * f_prob,6))

Dharshan as female 0 
Allen as female 0.002061383417315621 
Dicaprio as female 0.00022904260192395785 
Davinci as female 0.001603298213467705 
Matrix as female 0.0004580852038479157 
Harry as female 0.0022904260192395786 
Hermonie as female 0.002977553825011452 

feature prob 0Hme-1e 0.002977553825011452 fem_probability  0.001872


In [44]:
trm_dict = dict(prob_trm)
for [name, f] in examples_fsq:
    f_prob=trm_dict.get(f)
    if f_prob == None:
        f_prob=0
    print(name,'as male', f_prob, end= ' ')
    print()
    if f_prob == None:
        f_prob=0
print()
print('feature prob',f, f_prob, 'male_probability ', round(P_male * f_prob,6))

Dharshan as male 0 
Allen as male 0.00504266873545384 
Dicaprio as male 0.0007757951900698216 
Davinci as male 0.0003878975950349108 
Matrix as male 0.0003878975950349108 
Harry as male 0.0027152831652443757 
Hermonie as male 0.0027152831652443757 

feature prob 0Hme-1e 0.0027152831652443757 male_probability  0.001008
