# Preprocessing

## Libraries

In [None]:
import os
import re
import nltk
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

## Functions

In [None]:
def get_wordnet_pos(tag):
    """Map a POS tag string to the corresponding WordNet POS constant.

    The decision is made on the tag's leading letter:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag (including the empty string)
    yields None.
    """
    # (tag prefix, attribute name on the wordnet corpus reader); the attribute
    # is only looked up once a prefix actually matches.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )

    for prefix, attribute in prefix_table:
        if tag.startswith(prefix):
            return getattr(wordnet, attribute)

    return None

## Data

In [None]:
"""
# Define words in hotel name as stopwords.
hotel_names = [hotel_name[:-4] for hotel_name in os.listdir('crawled_data')]
cleaned_names = [str.lower(re.sub(pattern='[^a-zA-Z]', repl=' ', string=name)) for name in hotel_names]

hotel_name_words = []

for i in range(len(cleaned_names)):
    temp = cleaned_names[i].split()
    
    for j in range(len(temp)):
        hotel_name_words.append(temp[j])
        
hotel_name_words = [word for word in hotel_name_words]
hotel_name_words = list(set(hotel_name_words)) # remove duplicated words

hotel_name_stopwords = pd.DataFrame(data=hotel_name_words, columns=['stopwords'])
hotel_name_stopwords.to_excel('preprocessing/stopwords_hotel.xlsx', index=False)
"""

In [None]:
# Raw review files, one per hotel. os.listdir returns entries in arbitrary
# order, so sort them to keep the processing order (and the progress output)
# reproducible from run to run.
hotel_list = sorted(os.listdir('crawled_data'))

# Stopword tables; downstream cells read the 'stopwords' column of each.
stopwords_hotel = pd.read_excel('preprocessing/stopwords_hotel.xlsx')
stopwords_custom = pd.read_excel('preprocessing/stopwords_custom.xlsx')

## DT Matrix, TF-IDF Matrix

In [None]:
# Per hotel: tokenize the reviews, POS-tag, drop stopwords, lemmatize, then
# write the top-50 term frequencies, the document-term matrix and the TF-IDF
# matrix as CSV files under 'preprocessing/'.

# Loop-invariant setup. The stopword collections are built once as sets; the
# previous version re-read the English stopword list and re-materialized the
# hotel/custom stopword columns for every single token.
general_stopwords = set(stopwords.words('english')) | set(stopwords_hotel['stopwords'])
custom_stopwords = set(stopwords_custom['stopwords'])
content_tag_prefixes = ('J', 'V', 'N', 'R')  # adjective, verb, noun, adverb tags
non_letter = re.compile('[^a-zA-Z]')
lemmatizer = WordNetLemmatizer()

# to_csv does not create missing directories, so make sure the targets exist.
for output_dir in ('preprocessing/term_frequency',
                   'preprocessing/document_term_matrix',
                   'preprocessing/tf_idf_matrix'):
    os.makedirs(output_dir, exist_ok=True)

for i, hotel in enumerate(hotel_list):
    print(str(i + 1) + 'th hotel out of ' + str(len(hotel_list)) + ' hotels.')

    data = pd.read_csv('crawled_data/' + hotel, sep='|', usecols=['review_id', 'review'])

    # tokenizing: replace non-English characters with spaces, lower-case, split.
    # Missing reviews are read as NaN; treat them as empty text instead of
    # letting re.sub fail on a float.
    data['review'] = (
        data['review']
        .fillna('')
        .astype(str)
        .apply(lambda review: word_tokenize(non_letter.sub(' ', review).lower()))
    )

    # pos tagging (pos_tag_sents loads the tagger once for the whole batch)
    documents = list(data['review'])
    pos_documents = nltk.pos_tag_sents(documents)

    # stopwords -> lemmatization -> stopwords_custom
    lemmatized_documents_ = []

    for pos_document in pos_documents:
        # Keep only content words that are not English/hotel-name stopwords.
        # Every surviving tag starts with J/V/N/R, so get_wordnet_pos always
        # resolves to a WordNet POS here.
        lemmas = [
            lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
            for word, tag in pos_document
            if word not in general_stopwords and tag.startswith(content_tag_prefixes)
        ]
        # Custom stopwords are matched against the lemmatized form.
        lemmatized_documents_.append([lemma for lemma in lemmas if lemma not in custom_stopwords])

    # CountVectorizer raises ValueError on an empty vocabulary; skip such a
    # hotel rather than aborting the whole batch.
    if not any(lemmatized_documents_):
        print('No terms left after preprocessing; skipping ' + hotel)
        continue

    # document-term matrix (documents are already token lists, so the
    # tokenizer is the identity and lower-casing is disabled)
    vectorizer = CountVectorizer(tokenizer=lambda x: x, lowercase=False)
    dtm = vectorizer.fit_transform(lemmatized_documents_)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    term_frequency = pd.DataFrame(data={'word': feature_names,
                                        'frequency': np.asarray(dtm.sum(axis=0)).ravel()
                                        })
    tf = term_frequency.sort_values(by='frequency', ascending=False)
    tf_50 = tf.head(50) # top 50 terms
    tf_50.to_csv('preprocessing/term_frequency/tf_' + hotel)

    column_list = list(tf['word'])

    df_tf = pd.DataFrame(data=dtm.todense(), index=data['review_id'], columns=feature_names)
    df_tf = df_tf[column_list] # reordering columns by descending frequency
    df_tf.to_csv('preprocessing/document_term_matrix/dt_' + hotel)

    # tf-idf matrix
    transformer = TfidfTransformer()
    tf_idf = transformer.fit_transform(dtm)

    df_tf_idf = pd.DataFrame(data=tf_idf.todense(), index=data['review_id'], columns=feature_names)
    df_tf_idf = df_tf_idf[column_list] # reordering columns by descending frequency
    df_tf_idf.to_csv('preprocessing/tf_idf_matrix/tf_idf_' + hotel)

## DT Matrix, TF-IDF Matrix for non-partnership reviews

In [None]:
# Per hotel, restricted to reviews with isPartner == 0: tokenize, POS-tag,
# drop stopwords, lemmatize, then write the top-50 term frequencies, the
# document-term matrix and the TF-IDF matrix as CSV files under
# 'preprocessing/isPartner/'.

# Loop-invariant setup. The stopword collections are built once as sets; the
# previous version re-read the English stopword list and re-materialized the
# hotel/custom stopword columns for every single token.
general_stopwords = set(stopwords.words('english')) | set(stopwords_hotel['stopwords'])
custom_stopwords = set(stopwords_custom['stopwords'])
content_tag_prefixes = ('J', 'V', 'N', 'R')  # adjective, verb, noun, adverb tags
non_letter = re.compile('[^a-zA-Z]')
lemmatizer = WordNetLemmatizer()

# to_csv does not create missing directories, so make sure the targets exist.
for output_dir in ('preprocessing/isPartner/term_frequency',
                   'preprocessing/isPartner/document_term_matrix',
                   'preprocessing/isPartner/tf_idf_matrix'):
    os.makedirs(output_dir, exist_ok=True)

for i, hotel in enumerate(hotel_list):
    print(str(i + 1) + 'th hotel out of ' + str(len(hotel_list)) + ' hotels.')

    data = pd.read_csv('crawled_data/' + hotel, sep='|', usecols=['review_id', 'review', 'isPartner'])
    data = data[data['isPartner'] == 0].reset_index(drop=True)

    # tokenizing: replace non-English characters with spaces, lower-case, split.
    # Missing reviews are read as NaN; treat them as empty text instead of
    # letting re.sub fail on a float.
    data['review'] = (
        data['review']
        .fillna('')
        .astype(str)
        .apply(lambda review: word_tokenize(non_letter.sub(' ', review).lower()))
    )

    # pos tagging (pos_tag_sents loads the tagger once for the whole batch)
    documents = list(data['review'])
    pos_documents = nltk.pos_tag_sents(documents)

    # stopwords -> lemmatization -> stopwords_custom
    lemmatized_documents_ = []

    for pos_document in pos_documents:
        # Keep only content words that are not English/hotel-name stopwords.
        # Every surviving tag starts with J/V/N/R, so get_wordnet_pos always
        # resolves to a WordNet POS here.
        lemmas = [
            lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
            for word, tag in pos_document
            if word not in general_stopwords and tag.startswith(content_tag_prefixes)
        ]
        # Custom stopwords are matched against the lemmatized form.
        lemmatized_documents_.append([lemma for lemma in lemmas if lemma not in custom_stopwords])

    # A hotel may have no non-partner reviews (or none with usable terms).
    # CountVectorizer raises ValueError on an empty vocabulary; skip such a
    # hotel rather than aborting the whole batch.
    if not any(lemmatized_documents_):
        print('No terms left after preprocessing; skipping ' + hotel)
        continue

    # document-term matrix (documents are already token lists, so the
    # tokenizer is the identity and lower-casing is disabled)
    vectorizer = CountVectorizer(tokenizer=lambda x: x, lowercase=False)
    dtm = vectorizer.fit_transform(lemmatized_documents_)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    term_frequency = pd.DataFrame(data={'word': feature_names,
                                        'frequency': np.asarray(dtm.sum(axis=0)).ravel()
                                        })
    tf = term_frequency.sort_values(by='frequency', ascending=False)
    tf_50 = tf.head(50) # top 50 terms
    tf_50.to_csv('preprocessing/isPartner/term_frequency/tf_' + hotel)

    column_list = list(tf['word'])

    df_tf = pd.DataFrame(data=dtm.todense(), index=data['review_id'], columns=feature_names)
    df_tf = df_tf[column_list] # reordering columns by descending frequency
    df_tf.to_csv('preprocessing/isPartner/document_term_matrix/dt_' + hotel)

    # tf-idf matrix
    transformer = TfidfTransformer()
    tf_idf = transformer.fit_transform(dtm)

    df_tf_idf = pd.DataFrame(data=tf_idf.todense(), index=data['review_id'], columns=feature_names)
    df_tf_idf = df_tf_idf[column_list] # reordering columns by descending frequency
    df_tf_idf.to_csv('preprocessing/isPartner/tf_idf_matrix/tf_idf_' + hotel)