In [26]:
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import csv
from pprint import pprint

## Keep only alphanumeric characters in review comments

In [27]:
def remove_nonalphanumeric(text):
    """Return ``text`` as a lower-cased string with non-alphanumerics blanked out.

    The argument is first coerced with ``str()``, so non-string values
    (numbers, ``None``, NaN) are accepted. Every character outside
    ``a-z``, ``A-Z`` and ``0-9`` is replaced by a single space; the string
    length is therefore unchanged.
    """
    as_string = str(text)
    spaced = re.sub("[^a-zA-Z0-9]", " ", as_string)
    return spaced.lower()

## Extract the reviews from the dataset and filter out listings with fewer than 100 reviews

In [28]:
# Load the raw reviews table; later cells read its 'listing_id' and 'comments' columns.
# NOTE(review): this is an absolute, machine-specific path — the notebook will not
# run on another machine until it is pointed at a local copy of reviews.csv.
reviews = pd.read_csv('C:/JK/Masters Studies/Spring 2018/Information Retrieval/Project/data/reviews.csv/reviews.csv')

In [29]:
# Keep only the reviews that belong to listings with at least 100 reviews,
# then collect the distinct ids of those listings and show how many remain.
reviews = (
    reviews
    .groupby('listing_id')
    .filter(lambda listing_group: len(listing_group) >= 100)
)
unique_listings = reviews['listing_id'].unique()
len(unique_listings)

290

## Clean the Data by tokenizing and removing stop words.

In [30]:
# Build the cleaned text used for topic modelling, one derived column per step:
#   comments_1         - lower-cased comment with non-alphanumerics replaced by spaces
#   tokenized_comments - list of word tokens
#   stop_comments      - tokens with English stop words / punctuation removed
#   data               - remaining tokens joined back into one string
reviews['comments_1'] = reviews['comments'].apply(remove_nonalphanumeric)
reviews['tokenized_comments'] = reviews['comments_1'].apply(word_tokenize)
stopset = stopwords.words('english') + list(string.punctuation)
# Membership tests against the list above scan it linearly for every token;
# a frozenset gives the same answers with constant-time lookups.
stopset_lookup = frozenset(stopset)
reviews['stop_comments'] = reviews['tokenized_comments'].apply(
    lambda tokens: [token for token in tokens if token not in stopset_lookup]
)
def func(row):
    """Join a list of tokens into a single space-separated string."""
    return " ".join(row)
reviews['data'] = reviews['stop_comments'].apply(func)

## Get top K diverse reviews by performing LDA and calculating the Rao's Diversity coefficient for each review and pick the K reviews with the highest score

In [37]:
def get_best_reviews_listing(listing_id, num_of_LDA_topics=5, K_reviews=5, random_state=None):
    """Return up to ``K_reviews`` of the most topically diverse reviews of a listing.

    An LDA topic model is fitted on the cleaned review text (the ``data``
    column of the module-level ``reviews`` frame) of the given listing. Each
    review ``d`` is then scored with Rao's diversity coefficient

        div(d) = sum_i sum_j P(i|d) * P(j|d) * delta(i, j)

    where ``delta(i, j) = 1 - cosine_similarity(topic_i, topic_j)`` and
    ``P(i|d)`` is estimated as the sum of topic ``i``'s word weights over the
    distinct vocabulary terms present in ``d``, divided by the total number of
    vocabulary-term occurrences in ``d``.

    Parameters
    ----------
    listing_id
        Value matched against ``reviews['listing_id']``.
    num_of_LDA_topics : int, default 5
        Number of LDA topics to fit.
    K_reviews : int, default 5
        Maximum number of reviews to return.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``LatentDirichletAllocation``; pass an int to make the
        selection reproducible between runs.

    Returns
    -------
    list
        Raw ``comments`` values of the selected reviews, ordered from the
        highest to the lowest diversity score. Reviews whose score is zero
        (for example those containing no vocabulary term) are never returned,
        so the list can hold fewer than ``K_reviews`` items.
    """
    listing_reviews = reviews[reviews['listing_id'] == listing_id]
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)
    tf = tf_vectorizer.fit_transform(listing_reviews['data'])

    # LDA: ``n_components`` replaces the removed ``n_topics`` keyword.
    lda = LatentDirichletAllocation(
        n_components=num_of_LDA_topics,
        learning_method='online',
        learning_offset=20.,
        batch_size=16,
        random_state=random_state,
    ).fit(tf)
    lda_H = lda.components_  # topic-by-term weight matrix

    # Per-review topic weights D_T (reviews x topics).
    term_counts = tf.toarray()
    token_totals = term_counts.sum(axis=1, keepdims=True)
    topic_mass = np.matmul((term_counts > 0).astype(lda_H.dtype), lda_H.T)
    # Reviews without any vocabulary term get an all-zero row instead of a
    # division by zero (which previously produced NaN plus a runtime warning).
    D_T = np.divide(
        topic_mass,
        token_totals,
        out=np.zeros_like(topic_mass),
        where=token_totals > 0,
    )

    # Pairwise topic dissimilarity delta(i, j) = 1 - cosine similarity.
    # The original author notes that an inverse-similarity variant (1 / sim)
    # produced similar results.
    sim_i_j = cosine_similarity(lda_H, lda_H)
    dau_i_j = np.subtract(np.ones_like(sim_i_j), sim_i_j)

    # Row-wise quadratic form d^T * delta * d: equals the diagonal of
    # D_T @ delta @ D_T.T without building the reviews x reviews matrix.
    rao_diversity = np.nan_to_num((np.matmul(D_T, dau_i_j) * D_T).sum(axis=1))

    # Rank only the reviews with a non-zero score, but keep their ORIGINAL
    # row positions: filtering the score array first and indexing ``docs``
    # with positions in the filtered array would select the wrong reviews.
    scored_rows = np.flatnonzero(rao_diversity)
    ranking = np.argsort(-rao_diversity[scored_rows])
    div_ind = scored_rows[ranking][:K_reviews]

    docs = list(listing_reviews['comments'])
    top_K_reviews = [docs[i] for i in div_ind]

    return top_K_reviews

## To try only one listing, pick a random listing id with at least 100 reviews.

In [38]:
# Draw one listing id at random from the 'listing_id' column of the reviews
# that belong to listings with at least 100 reviews (one entry per review).
selected_listing = np.random.choice(
    reviews
    .groupby('listing_id')
    .filter(lambda listing_group: len(listing_group) >= 100)["listing_id"]
)
# Example: get_best_reviews_listing(selected_listing, 10, 5)

In [39]:
#listings = pd.read_csv('/home/kavin/Silo/CollegeWork/InfoRet/Project/reviews.csv')

In [42]:
# Write one CSV row per listing: the listing id followed by its selected reviews.
# No header row is written. The file is opened with newline='' as the csv module
# requires, so that the writer's own line terminator and any line breaks embedded
# in review text are written unchanged instead of being translated by the
# platform's text mode.
with open("top_k_reviews_by_listing.csv", 'w', encoding='utf-8', newline='') as resultFile:
    csv_writer = csv.writer(resultFile, dialect='excel', delimiter=',', lineterminator='\n')
    for listing_id in unique_listings:
        top_reviews = get_best_reviews_listing(listing_id)
        csv_writer.writerow([listing_id] + top_reviews)



## Fetch the most dissimilar reviews based on topics generated

In [43]:
# Read the per-listing top-K reviews table and preview its first rows.
# NOTE(review): the writer cell saves "top_k_reviews_by_listing.csv" to the working
# directory without a header row, whereas this reads an absolute path and the
# preview shows named columns plus an 'Unnamed: 0' column — confirm how this file
# was produced from the written one.
reviews_topics = pd.read_csv('C:/JK/Masters Studies/Spring 2018/Information Retrieval/Project/data/top_k_reviews_by_listing.csv')
reviews_topics.head()

Unnamed: 0,Listing Id,Review 1,Review 2,Review 3,Review 4,Review 5
0,7441144,It's a great quiet stay.,The host has been very accommodating and helpf...,The host was extremely welcoming and obliging....,Nice and easy stay - with good accommodations ...,"Pretty nice, quiet, cozy place to stay. Toilet..."
1,12233830,This was such a great place to stay! We came w...,"Bonne communication, lit confortable, appartem...",We had a great time in Boston. There was every...,Estee could not have been more helpful and the...,Estee's place is a lovely space for a family g...
2,14586440,Matthew's place was amazing in every aspect. L...,Great location and cute apartment.,Matthew was a great host and the location was ...,The location is great!,Great location! Walking distance from many tou...
3,15444930,Great place and cheap.,Very practical check in with punch code. Clean...,"I stayed here for a work trip over a few days,...",Sonder's Place was fantastic. Great location a...,Great apartment at awesome location. Apartment...
4,1596470,Das Townhouse von Michael ist fantastisch!\nPe...,Michael's place is beautiful and he is a great...,"Outstanding host, place, and location.",Great Place - Deff recommended!,"It was a great place, Michael was very helpful..."


## Removing newline

In [44]:
# Replace every line break inside string cells with a single space, then
# preview the first rows of the result.
reviews_topics = reviews_topics.replace(to_replace=r'\n', value=' ', regex=True)
reviews_topics.head()

Unnamed: 0,Listing Id,Review 1,Review 2,Review 3,Review 4,Review 5
0,7441144,It's a great quiet stay.,The host has been very accommodating and helpf...,The host was extremely welcoming and obliging....,Nice and easy stay - with good accommodations ...,"Pretty nice, quiet, cozy place to stay. Toilet..."
1,12233830,This was such a great place to stay! We came w...,"Bonne communication, lit confortable, appartem...",We had a great time in Boston. There was every...,Estee could not have been more helpful and the...,Estee's place is a lovely space for a family g...
2,14586440,Matthew's place was amazing in every aspect. L...,Great location and cute apartment.,Matthew was a great host and the location was ...,The location is great!,Great location! Walking distance from many tou...
3,15444930,Great place and cheap.,Very practical check in with punch code. Clean...,"I stayed here for a work trip over a few days,...",Sonder's Place was fantastic. Great location a...,Great apartment at awesome location. Apartment...
4,1596470,Das Townhouse von Michael ist fantastisch! Per...,Michael's place is beautiful and he is a great...,"Outstanding host, place, and location.",Great Place - Deff recommended!,"It was a great place, Michael was very helpful..."


## Fetching the listings data containing name, neighbourhood and other info. per listing

In [45]:
# Load the listings table; a later cell reads its 'id', 'name',
# 'neighbourhood_cleansed', 'room_type' and 'picture_url' columns.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
listings = pd.read_csv('C:/JK/Masters Studies/Spring 2018/Information Retrieval/Project/data/listings.csv/listings.csv')
# listings.head()

In [46]:
## Build a frame restricted to the listings with >=100 reviews, holding only
## the columns needed downstream: id, name, neighbourhood, room type, picture
## url, plus a generated Airbnb page url.

source_columns = ['id', 'name', 'neighbourhood_cleansed', 'room_type', 'picture_url']
result_list = []
for each_unique in unique_listings:
    # Rows of the listings table for this id; .item() below requires exactly one.
    listings_1 = listings[listings['id'] == each_unique]
    listing_values = tuple(listings_1[column].item() for column in source_columns)
    listing_url = "https://www.airbnb.com/rooms/" + str(each_unique) + "?location=Boston%2C%20MA"
    result_list.append(listing_values + (listing_url,))
listings_cropped = pd.DataFrame(
    result_list,
    columns=['Listing Id', 'name', 'neighbourhood', 'room_type', 'picture_url', 'listing_url'],
)

## Combine the listings and reviews

In [47]:
# Join the selected reviews with the listing details on the columns the two
# frames have in common (default inner join).
listings_topics = reviews_topics.merge(listings_cropped)

In [48]:
## Keep only alphanumeric text and punctuation marks in reviews and name
# Characters NOT matched by this class are deleted. Note that `'-.` inside the
# class is a range, so the characters kept are letters, digits, space,
# ' ( ) * + , - . and ; : ! ?
disallowed_characters = "[^ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyz'-.;:!? ]"
# regex=True is passed explicitly: newer pandas versions treat the pattern as
# a literal string by default, in which case nothing would be removed.
for text_column in ['Review 1', 'Review 2', 'Review 3', 'Review 4', 'Review 5', 'name']:
    listings_topics[text_column] = listings_topics[text_column].str.replace(
        disallowed_characters, "", regex=True
    )

## Creating the csv & json to be parsed to the website

In [49]:
# Export the combined table twice: a pipe-delimited text file limited to the
# columns below, and a JSON file with one record per row of the full frame.
export_columns = [
    'Listing Id',
    'Review 1', 'Review 2', 'Review 3', 'Review 4', 'Review 5',
    'name', 'neighbourhood', 'room_type', 'picture_url', 'listing_url',
]
listings_topics.to_csv(
    "listings_topics_edit_execution.txt",
    sep='|',
    columns=export_columns,
    index=False,
)
# Setting and then resetting the index moves 'Listing Id' to the first column
# of the records written to JSON.
listings_topics_json_df = listings_topics.set_index('Listing Id')
listings_topics_json_df.reset_index().to_json('listing_topics_1.json', orient='records')