# Part III: Which listing to select?
### 2. What do ratings and reviews tell us?

In [9]:
import nltk
nltk.download('wordnet') 
import textblob
import langdetect
from textblob import TextBlob, Word
from nltk.tokenize import sent_tokenize
from langdetect import detect
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 


# Pre: read review data and remove non-English comments
reviews = pd.read_csv('reviews.csv')

# Missing comments become the string 'nan' so that every value can be passed
# to the language detector.
reviews['comments'] = reviews['comments'].astype(str)


def _detect_language(text):
    """Return the langdetect language code for *text*, or "non-en" on failure."""
    try:
        return detect(text)
    except langdetect.LangDetectException:
        # langdetect raises when the text has no usable features
        # (e.g. empty strings or punctuation/digits only).
        return "non-en"


# Build the whole column in one assignment: the previous element-wise chained
# assignment (reviews['language'][i] = ...) may write to a temporary copy and
# is rejected under pandas copy-on-write.
# NOTE: langdetect is not deterministic on short/ambiguous texts unless
# langdetect.DetectorFactory.seed is set before the first call.
reviews['language'] = reviews['comments'].apply(_detect_language)

# Keep English reviews only; .copy() makes the result an independent frame so
# later column assignments do not raise SettingWithCopyWarning.
reviews = reviews[reviews['language'] == "en"].copy()


In [None]:
## 1. Scatter plot: review score rating (x-axis) against reviews per month (y-axis)
lists.plot.scatter(x='review_scores_rating', y='reviews_per_month')


In [None]:
## 2.Wordcloud of reviews for each listing
# Load the custom stop-word list (one entry per line). The context manager
# closes the file handle, which the previous bare open() never did.
with open('stopwords.txt', 'r') as file:
    # Drop the trailing newline/whitespace, then any surrounding spaces.
    stopwords = [line.rstrip().strip(' ') for line in file]
    
def comment(id):
    '''
    Generate a dataframe with all the comments of the listing with the id you input

    Parameters:
        id: value compared against the 'listing_id' column of the module-level
            `reviews` dataframe (the name shadows the builtin but is kept so
            existing callers keep working).

    Returns:
        A new dataframe with the columns 'listing_id' and 'comments', holding
        the matching rows under their original index labels. It is empty when
        no review matches.
    '''
    # A vectorised boolean mask replaces the row-by-row iterrows() scan.
    matches = reviews['listing_id'] == id
    # .copy() hands the caller an independent frame, so assigning to its
    # columns neither warns nor touches `reviews`.
    return reviews.loc[matches, ['listing_id', 'comments']].copy()
    
def wordcloud(id, output_path="wordcloud.png"):
    '''
    Generate wordcloud for the listing with the id you input

    Parameters:
        id: listing id passed on to comment().
        output_path: file the rendered figure is saved to
            (defaults to "wordcloud.png", the previously hard-coded name).

    Raises:
        ValueError: if no words are left for the listing after cleaning.
    '''
    df = comment(id)
    # Set membership is O(1); the module-level stop-word list is left untouched.
    stop_words = set(stopwords)

    # text cleaning for listing comments
    cleaned = df['comments'].astype(str).str.lower()   #lower cases
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave punctuation in place.
    cleaned = cleaned.str.replace(r'[^\w\s]', '', regex=True)   #remove punctuation
    cleaned = cleaned.apply(
        lambda text: " ".join(w for w in text.split() if w not in stop_words))  #remove stop words
    cleaned = cleaned.apply(lambda text: str(TextBlob(text).correct()))  #correct spelling
    cleaned = cleaned.apply(
        lambda text: " ".join(Word(w).lemmatize() for w in text.split()))  #lemmatization

    # create wordcloud
    total_comments = " ".join(cleaned)
    if not total_comments.strip():
        raise ValueError("no words left to plot for listing {!r}".format(id))
    # Named `cloud` so the local does not shadow this function's own name.
    cloud = WordCloud(background_color="white").generate(total_comments)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig(output_path)


In [12]:
## 3.Clustering Summarization
def summarization(id):
    '''
    Generate summarization for each listing with the id you input

    The listing's comments are split into sentences, each sentence is embedded
    with the module-level `encoder` (see #Appendix#), the embeddings are
    clustered with KMeans, and the sentence closest to each cluster centre is
    printed as the summary.

    Parameters:
        id: listing id passed on to comment().

    Returns:
        None; the summary is printed.
    '''
    df = comment(id)
    # tokenize English sentences
    eng_comments = " ".join(df['comments'])
    tok_comments = sent_tokenize(eng_comments)
    if not tok_comments:
        # Nothing to embed or cluster (KMeans cannot be fit on zero samples).
        print("No comments to summarize for listing {!r}".format(id))
        return

    # skip thoughts sentence embedding
    enc_com = encoder.encode(tok_comments, verbose=False)   #pre-trained model can be checked in #Appendix#

    # KMeans clustering
    n_clusters = int(np.ceil(len(enc_com) ** 0.2))  #determine number of clusters
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(enc_com)

    # Mean sentence position of each cluster, used to order the summary so it
    # follows the order in which topics appear in the reviews.
    avg = []
    for j in range(n_clusters):
        members = np.where(kmeans.labels_ == j)[0]
        # An empty cluster has no position; sort it last instead of yielding NaN.
        avg.append(members.mean() if members.size else float('inf'))

    # Index of the sentence nearest to each cluster centre.
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, enc_com)

    ordering = sorted(range(n_clusters), key=lambda j: avg[j])
    summary = " ".join(tok_comments[closest[j]] for j in ordering)
    print(summary)

    
## Appendix: Import the pre-trained Skip-Thoughts encoder model
import scipy.spatial.distance as sd
import configuration
import encoder_manager

# Locations of the pre-trained bidirectional skip-thoughts artefacts.
VOCAB_FILE = "../skip_thoughts_bi_2017_02_16/vocab.txt"
EMBEDDING_MATRIX_FILE = "../skip_thoughts_bi_2017_02_16/embeddings.npy"
CHECKPOINT_PATH = "../skip_thoughts_bi_2017_02_16/model.ckpt-500008"

# Create the encoder manager, then load the bidirectional model into it.
encoder = encoder_manager.EncoderManager()
_bidirectional_config = configuration.model_config(bidirectional_encoder=True)
encoder.load_model(
    _bidirectional_config,
    vocabulary_file=VOCAB_FILE,
    embedding_matrix_file=EMBEDDING_MATRIX_FILE,
    checkpoint_path=CHECKPOINT_PATH,
)


In [None]:
## 4.Sentiment Analysis
# Store TextBlob's polarity score (the first field of its sentiment tuple)
# for every review comment.
reviews['sentiment'] = reviews['comments'].map(
    lambda text: TextBlob(text).sentiment.polarity
)
