-
Notifications
You must be signed in to change notification settings - Fork 0
/
Word_collocations_pmi_mi_chisq.py
139 lines (104 loc) · 4.13 KB
/
Word_collocations_pmi_mi_chisq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# coding: utf-8
# Word Collocations
# ### Tweets are tokenized with NLTK's TweetTokenizer. The `strip_handles` parameter removes Twitter user handles.
# In[135]:
import pandas as pd
import math
import nltk
from nltk.collocations import *
from nltk.tokenize.casual import TweetTokenizer
from collections import Counter
import string
import re
import csv
import math
# In[138]:
# Load the corpus into a one-column DataFrame whose column is named 't'.
# `names` must be list-like: recent pandas releases reject a bare string
# with "Names should be an ordered collection".
# NOTE(review): read_csv splits every line on commas, so a tweet that
# contains a comma may not end up entirely in column 't' -- confirm the
# format of tweets.txt.
tweets = pd.read_csv('tweets.txt', header=None, names=['t'])
# A parenthesised single-argument print runs on both Python 2 and Python 3.
print(tweets.head())
# In[144]:
# Tokenize every tweet (lower-cased, @handles stripped) and accumulate
# corpus-wide unigram and bigram counts.
tknzr = TweetTokenizer(strip_handles=True, preserve_case=False)
unigram_freq = Counter()
bigram_freq = Counter()
for tweet in tweets.t:
    tokens = tknzr.tokenize(str(tweet))
    unigram_freq.update(tokens)
    # Bigrams are built per tweet, so no pair spans two different tweets.
    bigram_freq.update(nltk.bigrams(tokens))

total_unigram_freq = sum(unigram_freq.values())
unique_unigrams = len(unigram_freq)
total_bigram_freq = sum(bigram_freq.values())
unique_bigrams = len(bigram_freq)

# "%s %s" reproduces the output of the Python 2 statement `print label, value`
# (items separated by one space) while also being valid Python 3.
for label, value in (
    ("Size of the corpus : ", total_unigram_freq),
    ("Number of unique words : ", unique_unigrams),
    ("Total number of bigrams : ", total_bigram_freq),
    ("Number of unique bigrams : ", unique_bigrams),
):
    print("%s %s" % (label, value))
# ### Pointwise Mutual Information
# In[145]:
def pmi(unigram_freq, bigram_freq, n):
    """Rank bigrams by a count-based pointwise mutual information score.

    The score of a bigram ``(w1, w2)`` is
    ``log(count(w1, w2) / (count(w1) * count(w2)))``.  It is computed from
    raw counts rather than probabilities; for a fixed corpus that differs
    from probability-based PMI only by an additive constant, so the ordering
    of bigrams is the same.

    Args:
        unigram_freq: mapping from token to its occurrence count.
        bigram_freq: mapping from ``(w1, w2)`` tuple to its occurrence count.
        n: number of top-scoring entries to return.

    Returns:
        A list of at most ``n`` entries
        ``[bigram, score, w1, count(w1), w2, count(w2)]`` ordered by score,
        highest first.  Equal scores keep the iteration order of
        ``bigram_freq``.
    """
    scored = []
    for pair, pair_count in bigram_freq.items():
        first, second = pair[0], pair[1]
        first_count = unigram_freq[first]
        second_count = unigram_freq[second]
        # float() forces true division on Python 2 as well.
        score = math.log(pair_count / float(first_count * second_count))
        scored.append([pair, score, first, first_count, second, second_count])
    # In-place stable sort, best score first.
    scored.sort(key=lambda entry: entry[1], reverse=True)
    return scored[:n]
# Show the 100 highest-scoring bigrams; the parenthesised single-argument
# print runs on both Python 2 and Python 3.
print(pmi(unigram_freq, bigram_freq, 100))
# ###Mutual Information
# In[149]:
def _mi_term(joint, marginal_product):
    """Return ``joint * log(joint / marginal_product)``, or 0.0 if undefined.

    A non-positive ``joint`` or ``marginal_product`` makes the logarithm or
    the division undefined; such cells contribute nothing to the sum.
    """
    if joint <= 0 or marginal_product <= 0:
        return 0.0
    # float() forces true division on Python 2 as well.
    return joint * math.log(joint / float(marginal_product))


def mi(unigram_freq, bigram_freq, n):
    """Rank bigrams by a count-based mutual information score.

    For each bigram ``(w1, w2)`` a 2x2 table of counts is built
    (both words / only w1 / only w2 / neither) and the score is the sum over
    the four cells of ``cell * log(cell / (row_marginal * column_marginal))``.
    All quantities are raw counts, not probabilities.  The "neither" cell is
    derived from the total bigram count, while the complement marginals are
    derived from the total unigram count.

    Cells whose count or marginal product is not positive contribute 0.0.

    Args:
        unigram_freq: mapping from token to its occurrence count.
        bigram_freq: mapping from ``(w1, w2)`` tuple to its occurrence count.
        n: number of top-scoring entries to return.

    Returns:
        A list of at most ``n`` entries
        ``[bigram, score, w1, count(w1), w2, count(w2)]`` ordered by score,
        highest first.  Equal scores keep the iteration order of
        ``bigram_freq``.
    """
    total_bigrams = float(sum(bigram_freq.values()))
    total_unigrams = float(sum(unigram_freq.values()))

    scored = []
    for bigram in bigram_freq:
        first, second = bigram[0], bigram[1]
        first_count = unigram_freq[first]
        second_count = unigram_freq[second]

        # Cells of the 2x2 contingency table.
        both = bigram_freq[bigram]
        first_only = first_count - both
        second_only = second_count - both
        neither = total_bigrams - both - first_only - second_only

        # Complement marginals ("not w1", "not w2").
        not_first = total_unigrams - first_count
        not_second = total_unigrams - second_count

        score = (
            _mi_term(both, first_count * second_count)
            + _mi_term(first_only, first_count * not_second)
            + _mi_term(second_only, not_first * second_count)
            + _mi_term(neither, not_first * not_second)
        )
        scored.append([bigram, score, first, first_count, second, second_count])

    ranked = sorted(scored, key=lambda entry: entry[1], reverse=True)
    return ranked[0:n]
# Show the 100 highest-scoring bigrams; the parenthesised single-argument
# print runs on both Python 2 and Python 3.
print(mi(unigram_freq, bigram_freq, 100))
# ###Chi Square
# In[151]:
def chi_sq(unigram_freq, bigram_freq, n):
    """Rank bigrams by the chi-square statistic of their 2x2 count table.

    For a bigram ``(w1, w2)`` with ``N`` the total bigram count:

        A = count(w1, w2)          B = count(w1) - A
        C = count(w2) - A          D = N - A - B - C

        score = N * (A*D - B*C)**2 / ((A+C) * (B+D) * (A+B) * (C+D))

    When the denominator is zero (a margin of the table is empty, e.g. a
    corpus with a single bigram) the statistic is undefined; such bigrams
    get a score of 0.0 instead of raising ZeroDivisionError.

    Args:
        unigram_freq: mapping from token to its occurrence count.
        bigram_freq: mapping from ``(w1, w2)`` tuple to its occurrence count.
        n: number of top-scoring entries to return.

    Returns:
        A list of at most ``n`` entries
        ``[bigram, score, w1, count(w1), w2, count(w2)]`` ordered by score,
        highest first.  Equal scores keep the iteration order of
        ``bigram_freq``.
    """
    total = float(sum(bigram_freq.values()))

    scored = []
    for bigram in bigram_freq:
        first, second = bigram[0], bigram[1]
        first_count = unigram_freq[first]
        second_count = unigram_freq[second]

        a = bigram_freq[bigram]
        b = first_count - a
        c = second_count - a
        d = total - a - b - c

        denom = (a + c) * (b + d) * (a + b) * (c + d)
        if denom == 0:
            # Degenerate table: no evidence either way.
            score = 0.0
        else:
            score = total * pow(a * d - b * c, 2) / denom
        scored.append([bigram, score, first, first_count, second, second_count])

    ranked = sorted(scored, key=lambda entry: entry[1], reverse=True)
    return ranked[0:n]
# Show the 100 highest-scoring bigrams; the parenthesised single-argument
# print runs on both Python 2 and Python 3.
print(chi_sq(unigram_freq, bigram_freq, 100))
# ### It can be seen that the results from the three measures - PMI, MI and chi-square - are different.
# ### Upon comparison of the results from the three measures, it can be seen that the mutual information measure does a better job of finding the top ten word collocations that actually make sense together and are most often used together. Also, their individual frequencies are higher, and MI considers the weighted sum of probabilities.
# ### By the definition of mutual information, a high value should mean that one feature gives a lot of information about the other; by the definition of chi-square, a high value means that the two features must be dependent.
#