In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')  #If you can not import stopwords, you can download using this command
# nltk.download('punkt')
from nltk.corpus import stopwords
stop = stopwords.words('english')

df = pd.read_csv('dataset.csv', sep='\t', engine = 'python', encoding='WINDOWS-1252')

In [2]:
#filtering the data-set
df = df.replace('[-",?!.\n()]', '', regex=True)
df = df.replace('é', 'e', regex=True)
df['Review'] = df['Review'].str.lower()
df['Review'] = df['Review'].str.strip()

In [3]:
#removing the stopwords from the data-set so that we only get useful associations among words
df['Review'] = df['Review'].apply(lambda words: ' '.join([word for word in words.split() if word not in stop]))

In [17]:
df = df[0:100] #if last cell is taking time slice the reviews

In [18]:
total_reviews = df.size
words_dict = {}

for i in range(total_reviews):
    review = df.iloc[i]['Review']
    tokenList = word_tokenize(''.join(review)) #tokenizes the words in the review and adds their count as well    
    
    for token in tokenList:
        if token in words_dict.keys():#if word is present get it's old count and add 1
            words_dict.update({token : words_dict[token] + 1})
        
        else: #if word is not present in the dict add it and its value=1
            words_dict.update({token : 1})

In [19]:
#   print(words_dict) #now we've got the count of every word
len(words_dict)

706

In [20]:
probs = {}
for key, value in words_dict.items():
     probs.update({key : value / total_reviews}) 
    #divide each value of the word with the total number of words in the doc to get the prob of each word

# print(probs)

In [21]:
import math
nested_dict = {}

for i in range(total_reviews): #take each review
    index = 0
    nextWord = 1
    word = 0
    review = df.iloc[i]['Review']
    tokenList = word_tokenize(''.join(review)) #tokenize the words
    
    for token in tokenList: #for each token
        if(word < len(tokenList)):
            probA = tokenList[word]
            
            for i in range(len(tokenList)):#check token with every other token
                if(nextWord + 1 < len(tokenList)):
                    probB = tokenList[nextWord]
                    if probA in nested_dict.keys():#if it is already present update it's count
                        if probB in nested_dict[token].keys():#if the key in nested dictionary is already present update its val
                            nested_dict[probA][probB] = nested_dict[probA][probB] + 1
                        else:#else add it with value of 1
                            nested_dict[probA][probB] = 1
                    else:#for first time add it with value of 1
                          nested_dict[probA] = {probB : 1}

                    nextWord = nextWord + 1
                    index = index + 1
            word = word + 1
            nextWord = word + 1
            index = 0

In [22]:
len(nested_dict)

682

In [23]:
scores = pd.DataFrame({}, columns = ['A', 'B', 'MI Score'])

#iterate throught the nested dictionary and find the entropy and conditional entropy
#use then the mutual information score using the difference of two
for word in nested_dict.keys(): 
    for nextWord in nested_dict[word]:
        probA = probs[word]
        countA = words_dict[word]
        countB = words_dict[nextWord]
        countAwithBINT = nested_dict[word][nextWord]
        countAwithB = float(countAwithBINT)
        probAwithB = countAwithB / countB
        
        if(probA > probAwithB):
            try:
                entropyA = - 1 * probA * (math.log(probA, 2) + 0.1)
                entropyAtoB = -1 * probAwithB * (math.log(probAwithB, 2))  - (1 - probAwithB) * (math.log(1 - probAwithB, 2))
                mi = -(entropyA - entropyAtoB) 
#                 print(mi)
                scores = scores.append({'A': word, 'B':nextWord, 'MI Score': mi}, ignore_index = True)
            except ValueError:
                continue
              

In [25]:
#will take quite some time to print. You can slice the reviews to test
scores = scores.sort_values(by = 'MI Score', ascending = False)
scores = scores.reset_index(drop = True)
scores = scores
print(scores)

               A         B  MI Score
0          place       use  0.709799
1          place       may  0.709799
2          place     right  0.709799
3          place  congrats  0.709799
4          place      work  0.709799
...          ...       ...       ...
1487       areas     hotel -0.200555
1488       seixo     hotel -0.212116
1489  everything     place -0.219841
1490        food       one -0.250750
1491   fantastic       one -0.257682

[1492 rows x 3 columns]
