## Text analysis

### Imports, data loading and helper functions

We first connect our google drive, import pandas, numpy and some useful nltk and collections modules, then load the dataframe and define a function for printing the current time, useful to log our progress in some of the tasks.

In [7]:
import pandas as pd
from nltk.tag import pos_tag
import re
from collections import defaultdict,Counter
from nltk.stem import WordNetLemmatizer
from datetime import datetime
from tqdm import tqdm
import numpy as np
import os
import string
import math
import collections
tqdm.pandas()

In [8]:
# nltk imports, note that these outputs may be different if you are using colab or local jupyter notebooks
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
from nltk.corpus import brown
#nltk.download('brown')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

In [9]:
# load stopwords
sw = set(stopwords.words('english') + stopwords.words('french') + stopwords.words('spanish') + stopwords.words('dutch') + stopwords.words('german'))

In [10]:
#input the appropriate file location
p = ''
df = pd.read_csv(os.path.join(p,'reviews.csv'))
# deal with empty reviews
df.comments = df.comments.fillna('')

In [11]:
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2818,1191,2009-03-30,10952,Lam,Daniel is really cool. The place was nice and ...
1,2818,1771,2009-04-24,12798,Alice,Daniel is the most amazing host! His place is ...
2,2818,1989,2009-05-03,11869,Natalja,We had such a great time in Amsterdam. Daniel ...
3,2818,2797,2009-05-18,14064,Enrique,Very professional operation. Room is very clea...
4,2818,3151,2009-05-25,17977,Sherwin,Daniel is highly recommended. He provided all...


In [12]:
df.shape

(452143, 6)

### Process reviews

In [13]:
#set of english words
words = set(nltk.corpus.words.words())

In [14]:
#put a space between punctuation and words
df['comments'] =  [re.sub(r"([,.;\?/!])+\ *", r" \1 ", str(x)) for x in df['comments']]


In [15]:
#keeping only the words that are not stopwords and are english words
df['comments'] = df['comments'].apply(lambda x: " ".join(x for x in x.split() if x not in sw and x in words))

In [16]:
def process_reviews(df):   
    df['tokenized'] = [word_tokenize(str(s)) for s in df['comments']] 
    df['tagged'] = [pos_tag(t) for t in df['tokenized']]
    df['lower_tagged'] = [[([s.lower() for s in item]) for item in df['tagged'][i]] for i in range(len(df['tagged']))] 
    return df

In [17]:
df = process_reviews(df)

In [18]:
df

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,tokenized,tagged,lower_tagged
0,2818,1191,2009-03-30,10952,Lam,Daniel really cool The place nice clean quiet ...,"[Daniel, really, cool, The, place, nice, clean...","[(Daniel, NNP), (really, RB), (cool, VBZ), (Th...","[[daniel, nnp], [really, rb], [cool, vbz], [th..."
1,2818,1771,2009-04-24,12798,Alice,Daniel amazing host place extremely clean ever...,"[Daniel, amazing, host, place, extremely, clea...","[(Daniel, NNP), (amazing, VBG), (host, NN), (p...","[[daniel, nnp], [amazing, vbg], [host, nn], [p..."
2,2818,1989,2009-05-03,11869,Natalja,great time Daniel excellent host friendly help...,"[great, time, Daniel, excellent, host, friendl...","[(great, JJ), (time, NN), (Daniel, NNP), (exce...","[[great, jj], [time, nn], [daniel, nnp], [exce..."
3,2818,2797,2009-05-18,14064,Enrique,professional operation clean comfortable close...,"[professional, operation, clean, comfortable, ...","[(professional, JJ), (operation, NN), (clean, ...","[[professional, jj], [operation, nn], [clean, ..."
4,2818,3151,2009-05-25,17977,Sherwin,Daniel highly provided actually went way beyon...,"[Daniel, highly, provided, actually, went, way...","[(Daniel, NNP), (highly, RB), (provided, VBD),...","[[daniel, nnp], [highly, rb], [provided, vbd],..."
...,...,...,...,...,...,...,...,...,...
452138,46522591,714678579,2020-12-11,67834020,Nikolay,private comfortable,"[private, comfortable]","[(private, JJ), (comfortable, JJ)]","[[private, jj], [comfortable, jj]]"
452139,46522658,710509852,2020-11-22,228202558,Alekos,central,[central],"[(central, JJ)]","[[central, jj]]"
452140,46558182,710474593,2020-11-22,23339864,Khalil,night last minute enjoyable stay bed clean bat...,"[night, last, minute, enjoyable, stay, bed, cl...","[(night, NN), (last, JJ), (minute, NN), (enjoy...","[[night, nn], [last, jj], [minute, nn], [enjoy..."
452141,46591961,711401716,2020-11-26,366346169,Alejandro,,[],[],[]


###  Create a vocabulary

input the DataFrame generated in step 1 and returns two lists, one for the 1,000 most frequent center words (nouns) and one for the 1,000 most frequent context words (either verbs or adjectives). 

In [19]:
def get_vocab(df):
    
    nouns=[]
    verb_adj=[]
    
    #taking each word and checking if it is a noun or verb, adjective by looking into its tag
    #we dont want punctuations to be taken
    #words must be of length 3 atleast to be taken to make sure that most of the words are english and correct
    for i in range(len(df['lower_tagged'])):
        nouns += [token[0] for token in df['lower_tagged'][i] if token[1] in ['nn','nnp','nns'] and token[0] not in string.punctuation and len(token[0])>=3]
        verb_adj += [token[0] for token in df['lower_tagged'][i] if token[1] in ['jj','jjr','jjs','vb','vbd','vbg','vbn','vbp','vbz']and token[0] not in string.punctuation and len(token[0])>=3]
        
    n = Counter(nouns)  
    va = Counter(verb_adj)
    
    cent_vocab = n.most_common(1000)
    cont_vocab = va.most_common(1000)
    
    return cent_vocab, cont_vocab

In [20]:
cent_vocab, cont_vocab = get_vocab(df)

### Count co-occurrences between center and context words

input the DataFrame generated in step 1, and the lists generated in step 2 and returns a dictionary of dictionaries. 

In [21]:
def get_coocs(df, cent_vocab, cont_vocab):
    
    cent = []
    cont = []
    for i in range(1000):
        cont.append(cont_vocab[i][0])
        cent.append(cent_vocab[i][0])
    
    #creating the 1000x1000 dictionary with center words as rows
    #context words in columns
    #the default value as 0
    coocs = {row: {col: 0 for col in cont} for row in cent}    
    
    count=[]
    for x in df['lower_tagged']:
        for y in range(len(x)-2): #going throught each review by word
            if x[y][0] in cent and x[y+1][0] in cont: #if center and context word are next to each other
                count.append((x[y][0],x[y+1][0]))     #example: bed clean
            elif x[y][0] in cent and x[y+2][0] in cont: #if center and context word are seperated by only one word
                count.append((x[y][0],x[y+2][0]))       #example: daniel really cool
    
    #created 'count' which is a list of tuples of each center and context word
    #Counting the occurance of each tuple 
    c = collections.Counter(count)
    lk = list(c.keys())
    lv = list(c.values())

    #adding the center and context words values in the dictionary
    for x in range(len(lk)):
        coocs[lk[x][0]][lk[x][1]] = lv[x]
    
    return coocs  

In [22]:
coocs = get_coocs(df, cent_vocab, cont_vocab)

###  Convert co-occurrence dictionary to 1000x1000 dataframe
input the dictionary of dictionaries generated in step 3 and returns a DataFrame where each row corresponds to one center word, and each column corresponds to one context word, and cells are their corresponding co-occurrence value.

In [23]:
def cooc_dict2df(coocs):
    #convert the dictionary of dictionaries into a dataframe 
    #inverting the rows with the columns to have the center words as rows
    coocdf = pd.DataFrame(coocs).transpose()
    return coocdf

coocdf = cooc_dict2df(coocs)
coocdf.shape

(1000, 1000)

In [24]:
coocdf

Unnamed: 0,great,nice,clean,stay,recommend,good,comfortable,easy,perfect,quiet,...,hassle,higher,needing,pendant,enormous,improve,accommodation,tranquil,sleeper,enthusiastic
place,9676,4635,6461,16690,916,2085,1231,1138,5243,1457,...,7,9,10,2,6,0,24,16,0,3
apartment,10072,6095,9476,1145,732,2543,2063,1219,5580,2244,...,9,4,7,0,7,1,19,19,1,6
location,8730,2812,1984,876,720,2844,801,2845,7122,1966,...,4,0,2,2,5,1,83,10,0,2
host,2862,2262,699,1057,1179,968,227,1280,616,107,...,3,1,0,0,2,5,103,1,0,15
stay,3652,2032,1226,767,3390,1053,1479,1048,1534,630,...,20,4,4,0,2,5,98,9,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
difficulty,1,1,0,6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
choose,11,13,6,137,6,5,3,6,7,4,...,0,0,0,0,0,0,6,0,0,0
destination,8,2,1,1,1,2,0,2,2,1,...,0,0,0,0,0,0,0,0,0,0
fire,10,1,2,4,1,2,2,1,1,0,...,0,0,0,0,0,0,0,0,0,0


### Raw co-occurrences to PMI scores

input the DataFrame generated in step 4, and returns a new DataFrame with the same rows and columns, but with PMI scores instead of raw co-occurrence counts. 

In [25]:
pmidf = coocdf

In [26]:
def cooc2pmi(df):
    
    #total sum of all the values in the dataframe
    total = df.values.sum()

    for i in range(len(df)): 
        for j in range(len(df)):
            
            #checking if numerator and denominator are not equal to zero
            #log can not have zero values
            #P(x)*P(y)
            if ((sum(df.iloc[i])/total) * (sum(df.iloc[:,j])/total))==0:
                 pmidf.iloc[i,j]=0
              
            #P(x,y)
            elif (df.iloc[i,j]/total)==0:
                 pmidf.iloc[i,j]=0
            
            #PMI = Log( P(x,y) / (P(x)*P(y)) )
            else:
                pmidf.iloc[i,j] = math.log( (df.iloc[i,j]/total) / ((sum(df.iloc[i])/total) * (sum(df.iloc[:,j])/total)) )

    return pmidf

In [27]:
pmidf = cooc2pmi(coocdf)
pmidf.shape

(1000, 1000)

In [28]:
pmidf

Unnamed: 0,great,nice,clean,stay,recommend,good,comfortable,easy,perfect,quiet,...,hassle,higher,needing,pendant,enormous,improve,accommodation,tranquil,sleeper,enthusiastic
place,0.806002,0.520597,1.013620,1.851995,0.109731,0.641787,0.094828,0.074603,1.470314,0.549362,...,3.771188,5.032881,4.979205,2.283488,4.326061,0.000000,2.748408,4.470873,0.00000,3.848475
apartment,0.903035,0.826261,1.474287,-0.601971,-0.227799,0.744328,0.494728,0.032272,1.504582,0.877967,...,4.005670,4.229793,4.630655,0.000000,4.461027,2.820815,2.491480,4.640403,1.39941,4.503401
location,1.141747,0.432789,0.342792,-0.634872,-0.002660,1.139171,-0.174269,1.127941,2.123589,1.090173,...,3.377718,0.000000,3.560794,2.416163,4.303169,2.967436,4.132690,4.242814,0.00000,3.616734
host,0.517804,0.608203,-0.307373,-0.087526,0.868368,0.502822,-1.068072,0.743189,0.204853,-1.515015,...,3.027820,2.942142,0.000000,0.000000,3.326800,4.494993,4.310001,1.901763,0.00000,5.560278
stay,0.423386,0.159105,-0.121612,-0.779232,1.569471,0.237639,0.430812,0.210056,0.759605,-0.107864,...,4.801792,4.196999,4.062773,0.000000,3.204606,4.385431,4.179536,3.973186,0.00000,3.498098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
difficulty,1.944185,2.004961,0.000000,3.898341,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
choose,2.954379,3.192904,3.080628,5.666784,2.767166,2.478815,2.590961,2.769656,2.818352,2.775925,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.508002,0.000000,0.00000,0.000000
destination,4.381465,3.079513,3.030206,2.557536,2.564834,3.139833,0.000000,3.240101,3.125290,2.937881,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
fire,4.405387,2.189678,3.525613,3.751261,2.381370,2.958863,3.574695,2.360889,2.246789,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000


### Retrieve top-k context words, given a center word

input: (1) the DataFrame generated in step 5, (2) a `center_word` (a string like `‘towels’`), and (3) an optional named argument called `N` with default value of 10; and returns a list of `N` strings, in order of their PMI score with the `center_word`. 

In [29]:
def topk(df, center_word, N=10):
    
    #getting the series of values related to the center_word
    #sorting them from highest to lowest, output the first N highest values
    top_words = df.loc[center_word].sort_values(ascending=False)[0:N]

    return top_words

In [30]:
topk(pmidf, 'coffee')

shop         6.399179
tea          5.789745
cake         5.718825
kettle       5.443869
electric     5.364879
microwave    4.687970
shampoo      4.328431
bread        4.293461
reading      4.211819
deck         4.135539
Name: coffee, dtype: float64