In [1]:
import pandas as pd

# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from math import log

In [2]:
df = pd.read_csv('reviews_data.csv', delimiter = '\n', encoding = 'Windows-1252')

In [3]:
# function to sanitize the data (works for only one column, but is faster)
def sanitize_data(df, column_name):
    df[column_name] = df[column_name].str.strip() # trim the text
    df[column_name] = df[column_name].str.lower() # text to lowercase
    
    df = df.replace('[!?";\d\.,()–-]', '', regex = True) # remove non-textual characters
    df = df.replace('\n', ' ', regex = True) # replace \n with a space
    
    # remove stop words
    stop_words = stopwords.words('english')
    df[column_name] = df[column_name].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    
    return df

df = sanitize_data(df, 'Review')
df

Unnamed: 0,Review
0,everything weather staff food property fire pi...
1,hotel fantastic built sea living together natu...
2,one dream cozy comfortable hotel best personal...
3,hotel concept hard grasp communicate environme...
4,wonderful hotel romantic escape every room the...
...,...
396,extraordinary place amazing architecture home ...
397,wonderful place relax enjoy countryside specia...
398,thank best ever best dinner pool view beach pl...
399,know returned home days areias seixo would lik...


In [4]:
# ------ P(A) ------

# get the probabilities of all the words in the reviews
num_of_reviews = df.size
word_probs = {}

for i in range(num_of_reviews):
    review = df.iloc[i]['Review']
    unique_words = set(review.split(' ')) # get the unique words from the review
    
    for item in unique_words:
        # if key exists in dictionary add 1 else create a new key of that word with value 1
        word_probs[item] = word_probs[item] + 1 if item in word_probs.keys() else 1
        
# convert the dictionary to a dataframe
word_probs = pd.DataFrame(word_probs.items(), columns = ['word', 'count'])
word_probs = word_probs.set_index('word')

# divide all the occurances of the word by the total number of reviews to get the probability for each word
word_probs['prob'] = word_probs['count'] / num_of_reviews
word_probs = word_probs.sort_values(by = 'prob', ascending = False)

# calculate entropies for all the words
word_probs['entropy'] = word_probs['prob'].apply(lambda x:  -1 * x * log(x, 2))
word_probs

Unnamed: 0_level_0,count,prob,entropy
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
place,134,0.334165,0.528438
thank,125,0.311721,0.524213
hotel,105,0.261845,0.506203
wonderful,98,0.244389,0.496781
back,86,0.214464,0.476366
...,...,...,...
confused,1,0.002494,0.021565
culminates,1,0.002494,0.021565
greatly,1,0.002494,0.021565
magically,1,0.002494,0.021565


In [5]:
# ------ Conditional probability - P(A|B) ------

# only calculating the conditional probs of top 50 most occurring words (for convenience)
word_top_probs = word_probs[:50]

# dictionary to store the occurance count of each word with other words
occur_counts = {}

for i in range(word_top_probs.index.size):
    B = word_top_probs.index[i]
    
    for j in range(num_of_reviews):
        review = df.iloc[j]['Review']
        unique_words = set(review.split(' '))
        
        # if B is not in the review then ignore this review (do not window this review)
        if not B in unique_words:
            continue
            
        # check if any of the other words than B are part of the review
        for k in range(word_top_probs.index.size):
            if k == i:
                continue
                
            A = word_top_probs.index[k]
            
            # if A is part of the review, than B occured with A, +1 times
            if A in unique_words:
                if B not in occur_counts.keys():
                    occur_counts[B] = {A: 1}
                    continue
                    
                occur_counts[B][A] = occur_counts[B][A] + 1 if A in occur_counts[B].keys() else 1

In [6]:
# ------ entropy and MI score ------
# H(A|B) = -P(A|B) * log(P(A|B)) -P(not A|B) * log(P(not A|B))
# P(A|B) = count(A and B) / count(B)
mi_scores = pd.DataFrame({}, columns = ['X', 'Y', 'Score'])

for i in occur_counts.keys():
    for j in occur_counts[i]:
        # P(A|B)
        b_count = word_top_probs.loc[j, 'count']
        a_b_count = occur_counts[i][j]
        p_a_b = a_b_count / b_count
        
        # H(A|B)
        h_a_b = -1 * p_a_b * log(p_a_b, 2) - (1 - p_a_b) * log(1 - p_a_b, 2)
        word_top_probs.loc[i, j] = h_a_b
        
        # if a combination X and Y is already stored, skip the vice versa
        if(mi_scores[(mi_scores['X'] == j) & (mi_scores['Y'] == i)].size > 0):
            continue
            
        # MI(A|B)
        score = h_a_b - word_top_probs.loc[i, 'entropy']
        mi_scores = mi_scores.append({'X': i, 'Y': j, 'Score': score if score > 0 else 0 }, ignore_index = True)

mi_scores = mi_scores.sort_values(by = 'Score', ascending = False)
mi_scores = mi_scores.reset_index(drop = True)
top_mi_scores = mi_scores[0:50]
top_mi_scores

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,X,Y,Score
0,like,would,0.699805
1,soon,see,0.668006
2,service,well,0.620097
3,seixo,really,0.613092
4,nice,people,0.607745
5,also,really,0.594968
6,also,would,0.593731
7,everything,perfect,0.587237
8,restaurant,perfect,0.583154
9,one,would,0.58305
