# NLP

# Computing PMI

Our task is to discover strong associations between concepts in Airbnb reviews.

### Imports, data loading and helper functions

We first connect our google drive, import pandas, numpy and some useful nltk and collections modules, then load the dataframe and define a function for printing the current time, useful to log our progress in some of the tasks.

In [0]:
import pandas as pd
from nltk.tag import pos_tag
import re
from collections import defaultdict,Counter
from nltk.stem import WordNetLemmatizer
from datetime import datetime
from tqdm import tqdm
from nltk import bigrams
import numpy as np
import itertools
import os
import math
tqdm.pandas()

In [0]:
# NLTK setup. The download calls print progress messages whose exact text may
# differ between Colab and a local Jupyter notebook.
import nltk
# stopword lists, read below via stopwords.words('english')
nltk.download('stopwords')
# 'punkt' tokenizer models -- presumably required by word_tokenize / sent_tokenize
nltk.download('punkt')
# POS tagger model -- presumably required by pos_tag
nltk.download('averaged_perceptron_tagger')
# these imports come after the downloads, matching the original cell order
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

In [0]:
# load the English stopword list as a set (constant-time membership tests)
# NOTE(review): `sw` is not referenced by any function visible in this notebook -- confirm it is still needed
sw = set(stopwords.words('english'))

In [0]:
# Alternative for running outside Databricks: point `p` at the local directory
# that contains reviews.csv and use the two commented lines below instead.
#p = 'some_directory'
#df = pd.read_csv(os.path.join(p,'reviews.csv'))
# load the reviews from the DBFS file store
df = pd.read_csv('/dbfs/FileStore/tables/reviews.csv')
# replace missing review texts with empty strings so every comment is a str
df.comments = df.comments.fillna('')

In [0]:
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2818,1191,2009-03-30,10952,Lam,Daniel is really cool. The place was nice and ...
1,2818,1771,2009-04-24,12798,Alice,Daniel is the most amazing host! His place is ...
2,2818,1989,2009-05-03,11869,Natalja,We had such a great time in Amsterdam. Daniel ...
3,2818,2797,2009-05-18,14064,Enrique,Very professional operation. Room is very clea...
4,2818,3151,2009-05-25,17977,Sherwin,Daniel is highly recommended. He provided all...


In [0]:
df.shape

### Process reviews

In [0]:
def process_reviews(df):
    '''
    adds three derived columns to the reviews dataframe and returns it
    (the dataframe passed in is modified in place):
    'tokenized'    - 'word_tokenize' applied to every entry of the 'comments' column
    'tagged'       - 'pos_tag' applied to every token list, giving (token, tag) tuples
    'lower_tagged' - the tuples of 'tagged' with the token lower-cased and the tag untouched
    '''
    def lower_tokens(tagged_tokens):
        # lower-case the word only; the POS tag keeps its original case
        return [(word.lower(), tag) for word, tag in tagged_tokens]

    df['tokenized'] = df["comments"].apply(word_tokenize)
    df['tagged'] = df['tokenized'].apply(pos_tag)
    df['lower_tagged'] = df['tagged'].apply(lower_tokens)
    return df

In [0]:
df = process_reviews(df)

In [0]:
# printing dataframe head
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,tokenized,tagged,lower_tagged
0,2818,1191,2009-03-30,10952,Lam,Daniel is really cool. The place was nice and ...,"[Daniel, is, really, cool, ., The, place, was,...","[(Daniel, NNP), (is, VBZ), (really, RB), (cool...","[(daniel, NNP), (is, VBZ), (really, RB), (cool..."
1,2818,1771,2009-04-24,12798,Alice,Daniel is the most amazing host! His place is ...,"[Daniel, is, the, most, amazing, host, !, His,...","[(Daniel, NNP), (is, VBZ), (the, DT), (most, R...","[(daniel, NNP), (is, VBZ), (the, DT), (most, R..."
2,2818,1989,2009-05-03,11869,Natalja,We had such a great time in Amsterdam. Daniel ...,"[We, had, such, a, great, time, in, Amsterdam,...","[(We, PRP), (had, VBD), (such, JJ), (a, DT), (...","[(we, PRP), (had, VBD), (such, JJ), (a, DT), (..."
3,2818,2797,2009-05-18,14064,Enrique,Very professional operation. Room is very clea...,"[Very, professional, operation, ., Room, is, v...","[(Very, RB), (professional, JJ), (operation, N...","[(very, RB), (professional, JJ), (operation, N..."
4,2818,3151,2009-05-25,17977,Sherwin,Daniel is highly recommended. He provided all...,"[Daniel, is, highly, recommended, ., He, provi...","[(Daniel, NNP), (is, VBZ), (highly, RB), (reco...","[(daniel, NNP), (is, VBZ), (highly, RB), (reco..."


### Create a vocabulary

A function `get_vocab(df)` which takes as input the DataFrame generated, and returns two lists, one for the 1,000 most frequent center words (nouns) and one for the 1,000 most frequent context words (either verbs or adjectives).

In [0]:
def get_vocab(df, n=1000):
    '''
    this function builds the two vocabularies used for the co-occurrence counts:
    the 'n' most frequent center words (tokens whose POS tag starts with 'N', i.e. nouns)
    and the 'n' most frequent context words (tokens whose POS tag starts with 'J' or 'V',
    i.e. adjectives or verbs).

    df : dataframe with a 'lower_tagged' column holding lists of (token, tag) tuples
    n  : maximum size of each vocabulary (default 1000)

    returns (cent_vocab, cont_vocab), two lists of strings ordered from most to least
    frequent; words with equal counts keep the order in which they were first seen.
    '''
    noun_counts = Counter()
    context_counts = Counter()
    # one pass over the corpus fills both frequency tables
    for tagged_review in df['lower_tagged']:
        for word, tag in tagged_review:
            if tag.startswith('N'):
                noun_counts[word] += 1
            elif tag.startswith(('J', 'V')):
                context_counts[word] += 1
    cent_vocab = [word for word, _ in noun_counts.most_common(n)]
    cont_vocab = [word for word, _ in context_counts.most_common(n)]
    return cent_vocab, cont_vocab

In [0]:
cent_vocab, cont_vocab = get_vocab(df)

In [0]:
len(cent_vocab),len(cont_vocab)

### Count co-occurrences between center and context words

A function `get_coocs(df, center_vocab, context_vocab)` which takes as input the processed DataFrame and the two vocabulary lists generated above, and returns a dictionary of dictionaries mapping each center word to its context words and their co-occurrence counts.

In [0]:
def get_coocs(df, cent_vocab, cont_vocab):
    '''
    this function counts how often each center word appears directly next to
    (immediately before or after) each context word in the tokenized reviews.

    df         : dataframe with a 'tokenized' column holding one list of tokens per review
    cent_vocab : list of center words (lower-case nouns)
    cont_vocab : list of context words (lower-case verbs / adjectives)

    returns a dictionary of dictionaries {center_word: {context_word: count}};
    only pairs that occur at least once are present.

    tokens are lower-cased before matching because both vocabularies are lower-case.
    the whole corpus is scanned once, so the cost is linear in the number of tokens.
    '''
    # sets give constant-time membership tests inside the inner loop
    cent_set = set(cent_vocab)
    cont_set = set(cont_vocab)
    counts = defaultdict(Counter)
    for tokens in df['tokenized']:
        lowered = [str(tok).lower() for tok in tokens]
        # walk over every pair of adjacent tokens (the bigrams of the review)
        for left, right in zip(lowered, lowered[1:]):
            if left in cent_set and right in cont_set:
                counts[left][right] += 1
            # same bigram read the other way round (context word before the noun);
            # skipped for a repeated word so that (w, w) is counted only once
            if left != right and right in cent_set and left in cont_set:
                counts[right][left] += 1
    # hand back plain dictionaries instead of defaultdict / Counter objects
    return {noun: dict(contexts) for noun, contexts in counts.items()}

In [0]:
coocs = get_coocs(df, cent_vocab, cont_vocab)

In [0]:
#below is the dictionary of dictionaries
coocs

### Convert co-occurrence dictionary to 1000x1000 dataframe
A function called `cooc_dict2df(coocs, cent_vocab, cont_vocab)`, which takes as input the dictionary of dictionaries generated above together with the center and context vocabularies, and returns a DataFrame where each row corresponds to one center word, each column corresponds to one context word, and each cell holds their co-occurrence count. Pairs (x, y) that never co-occur get a value of 0.

In [0]:
def cooc_dict2df(coocs,cent_vocab, cont_vocab):
    '''
    this function turns the co-occurrence dictionary into a dataframe where rows are
    the center words and columns are the context words.

    coocs      : dictionary of dictionaries {center_word: {context_word: count}}
    cent_vocab : list of center words, used as the row labels (in this order)
    cont_vocab : list of context words, used as the column labels (in this order)

    returns a dataframe of floats with len(cent_vocab) rows and len(cont_vocab) columns;
    pairs that never co-occur hold 0. words in coocs that are missing from the
    vocabularies are ignored.
    '''
    # position of every word in its vocabulary = its row / column in the matrix
    row_of = {word: pos for pos, word in enumerate(cent_vocab)}
    col_of = {word: pos for pos, word in enumerate(cont_vocab)}
    # start from all zeros so that unseen pairs are already correct
    co_occurrence_matrix = np.zeros((len(cent_vocab), len(cont_vocab)))
    for noun, contexts in coocs.items():
        row = row_of.get(noun)
        if row is None:
            continue
        for context_word, count in contexts.items():
            col = col_of.get(context_word)
            if col is not None:
                co_occurrence_matrix[row, col] = count
    return pd.DataFrame(co_occurrence_matrix, index=list(cent_vocab), columns=list(cont_vocab))

In [0]:
coocdf = cooc_dict2df(coocs,cent_vocab, cont_vocab)
coocdf.shape

In [0]:
#below is the final 1000x1000 dataframe and counts of center words with their context words
coocdf

Unnamed: 0,was,is,great,nice,had,clean,were,recommend,stay,are,good,perfect,i,comfortable,easy,have,be,quiet,helpful,beautiful,amazing,get,super,un,'s,amsterdam,located,wonderful,has,central,est,’,friendly,lovely,staying,need,made,nous,enjoyed,walking,...,+,everyday,heated,compared,expecting,informed,locked,complimentary,genial,center,smart,não,haben,allows,inner,facile,gleich,abbiamo,limited,changed,chat,fair,understand,particular,hoped,respectful,everywhere,tired,tricky,okay,returned,ubicado,tucked,chilled,well-located,gracias,based,avant,silent,cafes
place,19630.0,29356.0,15894.0,7374.0,367.0,1247.0,165.0,22.0,36.0,185.0,2551.0,4757.0,604.0,931.0,106.0,30.0,6.0,1577.0,5.0,1819.0,2155.0,5.0,101.0,4.0,22310.0,21.0,244.0,1749.0,989.0,182.0,11.0,257.0,87.0,3591.0,3.0,5.0,197.0,6.0,3.0,10.0,...,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,5.0,0.0,0.0,5.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,13.0,0.0,0.0,7.0,8.0,15.0,0.0,18.0,1.0,29.0,0.0
apartment,24583.0,40703.0,4635.0,3935.0,937.0,2121.0,420.0,13.0,20.0,649.0,335.0,694.0,184.0,962.0,29.0,52.0,5.0,269.0,0.0,3697.0,1144.0,4.0,111.0,1.0,11937.0,233.0,804.0,1057.0,2171.0,95.0,0.0,143.0,35.0,3602.0,3.0,5.0,231.0,0.0,1.0,15.0,...,9.0,6.0,6.0,1.0,0.0,0.0,2.0,0.0,0.0,7.0,14.0,0.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,7.0,0.0,0.0,2.0,2.0,0.0,4.0,0.0,0.0,6.0,4.0,29.0,0.0,11.0,0.0,2.0,0.0
location,10282.0,23253.0,31499.0,1990.0,17.0,43.0,112.0,1.0,9.0,119.0,8402.0,8691.0,87.0,42.0,187.0,2.0,2.0,1315.0,2.0,526.0,2799.0,2.0,362.0,2.0,389.0,40.0,9.0,1097.0,81.0,2731.0,30.0,8.0,23.0,547.0,0.0,0.0,134.0,5.0,1.0,89.0,...,37.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,17.0,2.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,5.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,3.0,2.0,0.0,0.0,2.0,0.0,5.0,0.0
amsterdam,1703.0,2710.0,209.0,41.0,51.0,4.0,119.0,20.0,237.0,224.0,25.0,189.0,1382.0,20.0,51.0,45.0,10.0,34.0,7.0,387.0,108.0,17.0,30.0,47.0,1206.0,1.0,7.0,129.0,755.0,6677.0,197.0,1072.0,3.0,127.0,197.0,2.0,84.0,52.0,132.0,30.0,...,11.0,17.0,0.0,2.0,0.0,0.0,0.0,0.0,3.0,531.0,0.0,3.0,20.0,2.0,22.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,3.0,1.0,2.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,7.0,11.0,2.0,0.0,1.0
i,8005.0,35.0,23.0,30.0,10259.0,24.0,764.0,4874.0,294.0,114.0,13.0,21.0,10.0,97.0,5.0,4556.0,22.0,12.0,14.0,15.0,16.0,203.0,12.0,12.0,115.0,1382.0,9.0,8.0,18.0,7.0,0.0,3399.0,19.0,16.0,14.0,311.0,223.0,0.0,1807.0,7.0,...,4.0,6.0,0.0,1.0,1.0,16.0,13.0,0.0,1.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,22.0,1.0,2.0,113.0,3.0,31.0,2.0,71.0,5.0,0.0,0.0,69.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
peace,3.0,6.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,24.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
все,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
petits,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
découvrir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0,0.0,379.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Raw co-occurrences to PMI scores
A function `cooc2pmi(df)` that takes as input the DataFrame generated, and returns a new DataFrame with the same rows and columns, but with PMI scores instead of raw co-occurrence counts.

In [0]:
def cooc2pmi(df):
    '''
    this function takes the co-occurrence dataframe and returns a new dataframe with
    the same rows and columns holding PMI scores instead of raw counts:

        PMI(x, y) = log2( p(x, y) / (p(x) * p(y)) )

    where p(x, y) is the cell count divided by the total of all cells, p(x) is the
    row total divided by the grand total and p(y) is the column total divided by
    the grand total.

    df : dataframe of co-occurrence counts (rows = center words, columns = context words)

    the input dataframe is left untouched. pairs that never co-occur get -inf
    (log2 of 0); rows or columns whose total is 0 get NaN.
    '''
    counts = df.to_numpy(dtype=float)
    #total sum of all values in matrix
    all_sum = counts.sum()
    #total sum of each row (center word) and of each column (context word)
    row_totals = counts.sum(axis=1)
    col_totals = counts.sum(axis=0)
    # log2(0) and 0/0 are expected for sparse counts, so silence the numpy warnings
    with np.errstate(divide='ignore', invalid='ignore'):
        # count we would expect for every pair if the two words were independent
        expected = np.outer(row_totals, col_totals) / all_sum
        pmi = np.log2(counts / expected)
    return pd.DataFrame(pmi, index=df.index, columns=df.columns)

In [0]:
pmidf = cooc2pmi(coocdf)
pmidf.shape

### Retrieve top-k context words, given a center word

A function `topk(df, center_word, N=10)` that takes as input: (1) the PMI DataFrame generated above, (2) a `center_word` (a string like `'towels'`), and (3) an optional named argument called `N` with a default value of 10; and returns the `N` context words with the highest PMI scores for `center_word`, in descending order of score, as a one-column DataFrame indexed by context word. You do not need to handle cases for which the word `center_word` is not found in `df`.

In [0]:
def topk(df, center_word, N=10):
    '''
    looks up the row of 'center_word' in the PMI dataframe and returns its N best
    context words as a one-column dataframe: the index holds the context words, the
    single column (named after the center word) holds their PMI scores, sorted from
    highest to lowest.
    '''
    column_name = str(center_word)
    # one row of the PMI table: the score of every context word for this center word
    scores = df.loc[center_word]
    ranked = scores.to_frame(name=column_name).sort_values(by=column_name, ascending=False)
    return ranked[:N]

In [0]:
topk(pmidf, 'coffee')

Unnamed: 0,coffee
great,5.542672
good,4.914641
was,4.262564
nice,4.163554
nespresso,4.048734
provided,3.923979
like,3.766795
had,3.429214
free,3.3621
making,3.107286
