Calculating Entropy

In [11]:
import math
import random

import nltk
def entropy(labels):
    """Return the Shannon entropy, in bits, of a collection of labels.

    The labels are counted with ``nltk.FreqDist`` and the entropy is
    ``sum(-p * log2(p))`` over the relative frequency ``p`` of each
    distinct label.

    Args:
        labels: Iterable of labels to count (they are handed straight to
            ``nltk.FreqDist``, so they must be hashable).

    Returns:
        float: The entropy in bits. ``0.0`` for an empty input or when every
        label is identical.
    """
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(label) for label in freqdist]
    # Negate each term instead of the total: negating a zero sum would give
    # -0.0 (printed as "-0.0") for a single-class input. The 0.0 start value
    # keeps the return type a float even when there are no labels at all.
    return sum((-p * math.log(p, 2) for p in probs), 0.0)

In [12]:
# Two classes at a 1:1 ratio: the most uncertain binary case.
print(entropy([2]+[1]*1)) 

1.0


In [75]:
# Two classes at a 1:2 ratio.
print(entropy([2]+[1]*2)) 

0.9182958340544896


In [14]:
# Two classes at a 1:4 ratio.
print(entropy([2]+[1]*4))

0.7219280948873623


In [15]:
# Two classes at a 1:9 ratio.
print(entropy([2]+[1]*9))

0.4689955935892812


In [16]:
# Two classes at a 1:30 ratio.
print(entropy([2]+[1]*30))

0.20559250818508304


In [17]:
# Two classes at a 1:100 ratio: the more skewed the labels, the lower the entropy.
print(entropy([2]+[1]*100)) 

0.08013604733127526


In [19]:
# Three classes: one 3, one 2 and two 1s.
print(entropy([3,2,1]+[1]*1)) 

1.5


In [20]:
# Three classes: one 3, one 2 and three 1s.
print(entropy([3,2,1]+[1]*2)) 

1.3709505944546687


In [22]:
# Three classes: one 3, one 2 and five 1s.
print(entropy([3,2,1]+[1]*4)) 

1.1488348542809168


In [23]:
# Three classes: one 3, one 2 and ten 1s.
print(entropy([3,2,1]+[1]*9))

0.8166890883150209


In [24]:
# Three classes: one 3, one 2 and thirty-one 1s.
print(entropy([3,2,1]+[1]*30)) 

0.39045213081320695


In [25]:
# Three classes: one 3, one 2 and one hundred and one 1s.
print(entropy([3,2,1]+[1]*100)) 

0.1575747042906818


In [34]:
# Six distinct labels, each seen once: a uniform distribution, i.e. log2(6) bits.
print(entropy(['male','female','high','low','blah','game'])) 

2.584962500721156


Computing Information Gain

In [69]:
def informationGain(myList, split=2):
    """Return the information gain of a random split of ``myList``.

    The labels are copied, shuffled, and cut into ``split`` contiguous
    partitions of (near-)equal size. The gain is the entropy of the whole
    list minus the size-weighted average entropy of the partitions.

    Args:
        myList: Sequence of labels. It is not modified; a shuffled copy is
            partitioned instead.
        split: Number of partitions to cut the shuffled labels into
            (default 2). Every partition contributes to the result.

    Returns:
        float: ``entropy(myList)`` minus the weighted entropy after the
        split. ``0.0`` for an empty input. Because the split is random, the
        value varies between calls unless the ``random`` module is seeded.

    Raises:
        ValueError: If ``split`` is smaller than 1.
    """
    if split < 1:
        raise ValueError("split must be a positive integer")

    total = len(myList)
    if total == 0:
        # Nothing to split; also avoids dividing by zero in the weights below.
        return 0.0

    entropy_before = entropy(myList)

    # Shuffle a copy so the caller's list keeps its order (and so immutable
    # sequences such as tuples are accepted).
    shuffled = list(myList)
    random.shuffle(shuffled)

    # Partition boundaries: split + 1 cut points spread evenly over the list.
    division = total / float(split)
    bounds = [int(round(division * i)) for i in range(split + 1)]
    partitions = [shuffled[start:stop] for start, stop in zip(bounds, bounds[1:])]

    # Combine the per-partition entropies, weighting each by the fraction of
    # instances that fell into that partition.
    entropy_after = sum(
        (len(part) / total) * entropy(part) for part in partitions
    )
    return entropy_before - entropy_after

In [78]:
# 33 labels dominated by class 1: one 3, one 2 and thirty-one 1s.
myList = ([3,2,1]+[1]*30)

# informationGain shuffles the labels before splitting, so this value changes between runs.
infoGain = informationGain(myList)
print(infoGain)


0.06064881665781319


Computing information gain for the feature sets of the Names corpus


In [81]:
import random
from collections import defaultdict  # public module, not the private _collections accelerator

import nltk
from nltk.corpus import names
from nltk.probability import FreqDist

# Pool the male and female first names from the NLTK Names corpus.
# NOTE: despite the variable name, the entries are bare name strings; no
# gender label is attached here.
labeled_names = list(names.words('male.txt')) + list(names.words('female.txt'))
# Fixed seed so the shuffled order (and the preview below) is reproducible.
random.seed(55)
random.shuffle(labeled_names)
print(labeled_names[0:15])


['Ethelyn', 'Nance', 'Sherwynd', 'Diahann', 'Colly', 'Florentia', 'Dmitri', 'Noah', 'Luise', 'Clarke', 'Patin', 'Rubia', 'Dionis', 'Shel', 'Marysa']


In [72]:
def gender_features(word):
    """Build a flat tag/value feature list for a name.

    The result alternates a position tag with the characters found there:
    ``'0'`` + first letter, ``'m'`` + the slice ``word[1:3]``, and
    ``'-1'`` + last letter.

    Args:
        word: The name to extract features from. Must be non-empty, since
            the first and last characters are indexed directly.

    Returns:
        list: ``['0', word[0], 'm', word[1:3], '-1', word[-1]]``.
    """
    first_letter, middle_letters, last_letter = word[0], word[1:3], word[-1]
    return ['0', first_letter, 'm', middle_letters, '-1', last_letter]

In [73]:
# labeled_names holds bare name strings, so it cannot be unpacked into
# (name, gender) pairs. Build the gender-labelled pairs here from the two
# corpus files instead.
gendered_names = (
    [(name, 'male') for name in names.words('male.txt')]
    + [(name, 'female') for name in names.words('female.txt')]
)
# Shuffle so the first 1000 entries (the test set) mix both genders rather
# than being the head of the male list.
random.shuffle(gendered_names)

featuresets = [(gender_features(n), gender) for (n, gender) in gendered_names]
# Hold out the first 1000 examples for testing; train on the remainder.
train_set, test_set = featuresets[1000:], featuresets[:1000]
print(train_set[0:10])

[(['0', 'C', 'm', 'el', '-1', 'a'], 'female'), (['0', 'S', 'm', 'ti', '-1', 'g'], 'male'), (['0', 'I', 'm', 'll', '-1', 'a'], 'female'), (['0', 'C', 'm', 'al', '-1', 'a'], 'female'), (['0', 'A', 'm', 'li', '-1', 'e'], 'female'), (['0', 'C', 'm', 'ar', '-1', 'a'], 'female'), (['0', 'Y', 'm', 'eh', '-1', 'i'], 'male'), (['0', 'L', 'm', 'at', '-1', 'a'], 'female'), (['0', 'C', 'm', 'yb', '-1', 'l'], 'female'), (['0', 'B', 'm', 'yr', '-1', 'n'], 'male')]


In [87]:
# One feature list per name (no gender label), in the same order as labeled_names.
featureList = list(map(gender_features, labeled_names))
featureList

[['0', 'E', 'm', 'th', '-1', 'n'],
 ['0', 'N', 'm', 'an', '-1', 'e'],
 ['0', 'S', 'm', 'he', '-1', 'd'],
 ['0', 'D', 'm', 'ia', '-1', 'n'],
 ['0', 'C', 'm', 'ol', '-1', 'y'],
 ['0', 'F', 'm', 'lo', '-1', 'a'],
 ['0', 'D', 'm', 'mi', '-1', 'i'],
 ['0', 'N', 'm', 'oa', '-1', 'h'],
 ['0', 'L', 'm', 'ui', '-1', 'e'],
 ['0', 'C', 'm', 'la', '-1', 'e'],
 ['0', 'P', 'm', 'at', '-1', 'n'],
 ['0', 'R', 'm', 'ub', '-1', 'a'],
 ['0', 'D', 'm', 'io', '-1', 's'],
 ['0', 'S', 'm', 'he', '-1', 'l'],
 ['0', 'M', 'm', 'ar', '-1', 'a'],
 ['0', 'D', 'm', 'uk', '-1', 'e'],
 ['0', 'A', 'm', 'dr', '-1', 'e'],
 ['0', 'A', 'm', 'id', '-1', 'n'],
 ['0', 'C', 'm', 'li', '-1', 'n'],
 ['0', 'A', 'm', 'bd', '-1', 'l'],
 ['0', 'A', 'm', 'ug', '-1', 'e'],
 ['0', 'S', 'm', 'is', '-1', 'e'],
 ['0', 'N', 'm', 'ik', '-1', 'e'],
 ['0', 'M', 'm', 'uf', '-1', 'n'],
 ['0', 'B', 'm', 'er', '-1', 'a'],
 ['0', 'L', 'm', 'or', '-1', 'i'],
 ['0', 'J', 'm', 'ad', '-1', 'e'],
 ['0', 'L', 'm', 'au', '-1', 'a'],
 ['0', 'I', 'm', 'ng

In [91]:
# Join each name's feature list into a single string (e.g. '0Emth-1n') so that
# every feature set is one hashable label that entropy() can count.
featuresetsSquished = list(map(''.join, map(gender_features, labeled_names)))
featuresetsSquished

['0Emth-1n',
 '0Nman-1e',
 '0Smhe-1d',
 '0Dmia-1n',
 '0Cmol-1y',
 '0Fmlo-1a',
 '0Dmmi-1i',
 '0Nmoa-1h',
 '0Lmui-1e',
 '0Cmla-1e',
 '0Pmat-1n',
 '0Rmub-1a',
 '0Dmio-1s',
 '0Smhe-1l',
 '0Mmar-1a',
 '0Dmuk-1e',
 '0Amdr-1e',
 '0Amid-1n',
 '0Cmli-1n',
 '0Ambd-1l',
 '0Amug-1e',
 '0Smis-1e',
 '0Nmik-1e',
 '0Mmuf-1n',
 '0Bmer-1a',
 '0Lmor-1i',
 '0Jmad-1e',
 '0Lmau-1a',
 '0Imng-1d',
 '0Rmen-1o',
 '0Tmip-1e',
 '0Smha-1e',
 '0Smky-1y',
 '0Mmor-1e',
 '0Tmit-1s',
 '0Cmar-1a',
 '0Amve-1l',
 '0Smam-1a',
 '0Kmay-1e',
 '0Pmam-1a',
 '0Mmar-1e',
 '0Emth-1d',
 '0Dmel-1l',
 '0Bmen-1a',
 '0Ambb-1e',
 '0Rmeg-1e',
 '0Yman-1y',
 '0Gmer-1d',
 '0Smar-1e',
 '0Lmou-1e',
 '0Rmud-1d',
 '0Bmre-1a',
 '0Amnu-1g',
 '0Pmau-1l',
 '0Tmhe-1c',
 '0Amdr-1k',
 '0Pmry-1e',
 '0Iman-1e',
 '0Vmal-1e',
 '0Kmri-1r',
 '0Gmle-1n',
 '0Pmhy-1a',
 '0Dmuf-1e',
 '0Rmus-1l',
 '0Smhe-1n',
 '0Dmim-1s',
 '0Fmor-1r',
 '0Lmil-1a',
 '0Gmar-1y',
 '0Ammb-1e',
 '0Rmod-1e',
 '0Lmor-1y',
 '0Imda-1a',
 '0Kmat-1e',
 '0Pman-1y',
 '0Tmho-1n',
 '0Ombe-1n',

In [92]:
# Entropy of the joined feature strings over the whole name list.
entropy(featuresetsSquished)

11.446631389898391

In [94]:
# informationGain shuffles before splitting, so repeated calls give slightly different values.
informationGain(featuresetsSquished)

0.4796717237290853

In [95]:
# Repeat run: another random split of the same labels.
informationGain(featuresetsSquished)

0.4762976360529674

In [96]:
# Repeat run: another random split of the same labels.
informationGain(featuresetsSquished)

0.4889120260330362

In [97]:
# Repeat run: another random split of the same labels.
informationGain(featuresetsSquished)

0.48704182535621676

Look up the semantic grammar. What is the SEM value assigned to ‘every girl’ based on this little grammar?

In [165]:
from nltk.sem import cooper_storage as cs
# Phrase whose semantic representation (SEM feature) is inspected below.
sentence = 'every girl'
# Parse with the feature grammar file 'test_storage.fcfg'.
# NOTE(review): the path is relative — presumably resolved against the
# notebook's working directory or NLTK's data path; confirm.
trees = cs.parse_with_bindops(sentence, grammar='test_storage.fcfg')
# SEM feature of the root node's label in the first parse tree.
semrep = trees[0].label()['SEM']
# Wrap the SEM value in a CooperStore; its .store is iterated in a later cell.
cs_semrep = cs.CooperStore(semrep)
print(semrep)


[ CORE  = <z35>                                ]
[ STORE = (bo(\P.all x.(girl(x) -> P(x)),z35)) ]


In [158]:
# Show the full list of parse trees returned for the sentence.
trees

[Tree(NP[SEM=[CORE=<z30>, STORE=(bo(\P.all x.(girl(x) -> P(x)),z30))]], [Tree(Det[SEM=[CORE=<\Q P.all x.(Q(x) -> P(x))>, STORE=()]], ['every']), Tree(N[SEM=[CORE=<girl>, STORE=()]], ['girl'])])]

In [160]:
# Show the SEM feature structure of the first parse in its one-line repr form.
semrep

[CORE=<z30>, STORE=(bo(\P.all x.(girl(x) -> P(x)),z30))]

In [159]:
# Print each entry held in the Cooper store, one per line.
for bo in cs_semrep.store:
    print(bo)

bo(\P.all x.(girl(x) -> P(x)),z30)
