In [None]:
import pandas as pd
import nltk
import numpy as np
import wikipedia
import math

import requests
from bs4 import BeautifulSoup  
from urllib.request import urlopen

import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import datetime
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import gensim

%matplotlib inline

import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk
import hdbscan

import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns
sns.set()
sns.set_style("whitegrid")
sns.set_color_codes()

from sklearn import cluster, datasets, mixture
from sklearn.cluster import KMeans

In [None]:
# Installs gensim into the environment through the shell.
# NOTE(review): the first cell already executes `import gensim`, so on a fresh
# kernel that import runs before this install; consider moving this cell to the top.
!pip install gensim

In [None]:
# Read the CSV (decoded as cp949) and replace its default column labels
# ("Unnamed: 0", "0" ... "7") with descriptive names in one chained expression.
df = pd.read_csv("travel_wiki_en_nlp1.csv", encoding="cp949").rename(
    columns={
        "Unnamed: 0": "place",
        "0": "nature",
        "1": "city",
        "2": "star",
        "3": "review",
        "4": "short_info",
        "5": "long_info",
        "6": "f_wiki",
        "7": "s_wiki",
    }
)
df.head()

In [None]:
class SentenceTokenizer() :
    """Split a text into sentences, keep each sentence's noun tokens and count them.

    Intended call order: making_sentence() -> making_noun() -> word_counter().
    """

    def __init__(self, text) :
        self.text = text        # raw text; overwritten by making_sentence()
        self.sentence = []      # sentences returned by nltk.sent_tokenize
        self.noun = []          # one space-joined string of noun tokens per sentence
        self.word_count = []    # (token, count) pairs, highest count first

    def making_sentence(self) :
        """Remove "?" and "=" from the text, then split it into sentences."""
        cleaned = self.text.replace("?", "")
        cleaned = cleaned.replace("=", "")
        self.text = cleaned
        self.sentence = nltk.sent_tokenize(cleaned)

    def making_noun(self) :
        """Append to self.noun the tokens of each sentence whose POS tag contains "NN"."""
        for raw_sentence in self.sentence :
            if raw_sentence in [" ", ""] :
                continue
            # Apostrophes are dropped before tokenising.
            tokens = nltk.word_tokenize(raw_sentence.replace("'", ""))
            nouns = [token for token, tag in nltk.pos_tag(tokens) if "NN" in tag]
            self.noun.append(" ".join(nouns))

    def word_counter(self) :
        """Count every token in self.noun and store the pairs sorted by count, descending."""
        tally = {}
        for noun_line in self.noun :
            for token in noun_line.split() :
                tally[token] = tally.get(token, 0) + 1

        # sorted() is stable, so equally frequent tokens keep first-seen order.
        self.word_count = sorted(tally.items(), key = lambda pair : pair[1], reverse = True)

    def get_sentence(self) :
        """Return the list of sentences."""
        return self.sentence

    def get_noun(self) :
        """Return the per-sentence noun strings."""
        return self.noun

    def get_word_count(self) :
        """Return the (token, count) list built by word_counter()."""
        return self.word_count
    

In [None]:
class GraphMatrix() :
    """Build sentence-to-sentence and word-to-word weight matrices from a list of strings."""

    def __init__(self):
        self.tfidf = TfidfVectorizer()
        self.cnt_vec = CountVectorizer()
        self.graph_sentence = []   # last matrix produced by build_sent_graph()

    def build_sent_graph(self, sentence):
        """Return (and store) the product of the TF-IDF matrix with its transpose.

        `sentence` is an iterable of strings; the result has one row and one
        column per input string.
        """
        weights = self.tfidf.fit_transform(sentence).toarray()
        self.graph_sentence = weights.dot(weights.T)
        return self.graph_sentence

    def build_words_graph(self, sentence):
        """Return (word-by-word matrix, {column index: word}).

        The count matrix is passed through `normalize(..., axis=0)` and then
        multiplied as X^T X, giving one row and one column per vocabulary word.
        """
        counts = self.cnt_vec.fit_transform(sentence).toarray().astype(float)
        col_normed = normalize(counts, axis=0)

        # Invert the vectorizer's word -> column mapping.
        index_to_word = {}
        for word, column in self.cnt_vec.vocabulary_.items():
            index_to_word[column] = word

        return np.dot(col_normed.T, col_normed), index_to_word

In [None]:
class Rank(object):
    """PageRank-style scoring of the nodes of a weighted graph."""

    def get_ranks(self, graph, d=0.85): # d = damping factor
        """Solve (I - d * M) x = (1 - d) * 1, where M is `graph` with its diagonal
        zeroed and every non-empty column scaled to sum to 1.

        Args:
            graph: square 2-D array-like of link weights. It is copied, so the
                caller's matrix is left untouched; integer input is accepted.
            d: damping factor.

        Returns:
            dict mapping node index -> rank value. A node whose row and column
            contain no links receives exactly (1 - d).
        """
        A = np.array(graph, dtype=float)  # float working copy; never write into the argument
        matrix_size = A.shape[0]

        np.fill_diagonal(A, 0)            # self-links do not count
        link_sums = A.sum(axis=0)
        linked = link_sums != 0
        A[:, linked] /= link_sums[linked] # column-normalise only nodes that have links
        A *= -d
        # Every node gets a unit diagonal, including nodes without links. This keeps
        # the system solvable without adding random noise, so results are reproducible.
        np.fill_diagonal(A, 1)

        B = (1 - d) * np.ones((matrix_size, 1))
        ranks = np.linalg.solve(A, B)     # linear system A x = B

        return {idx: r[0] for idx, r in enumerate(ranks)}



In [None]:
class TextRank(object):
    """Rank the sentences and noun keywords of one text.

    All tokenising, graph building and ranking happens in the constructor;
    summarize() and keywords() only read the stored rankings.
    """

    def __init__(self, text):
        # Tokenise: sentences -> per-sentence noun strings -> noun counts.
        self.text = SentenceTokenizer(text)
        self.text.making_sentence()
        self.text.making_noun()
        self.text.word_counter()

        self.sentences = self.text.get_sentence()
        self.nouns = self.text.get_noun()
        self.counts = self.text.get_word_count()

        # Both graphs are built from the noun strings, one per sentence.
        self.graph_matrix = GraphMatrix()
        self.sent_graph = self.graph_matrix.build_sent_graph(self.nouns)
        self.words_graph, self.idx2word = self.graph_matrix.build_words_graph(self.nouns)

        # Rank values plus the indices ordered from highest to lowest rank.
        self.rank = Rank()
        self.sent_rank_idx = self.rank.get_ranks(self.sent_graph)
        self.sorted_sent_rank_idx = sorted(self.sent_rank_idx, key=lambda k: self.sent_rank_idx[k], reverse=True)

        self.word_rank_idx = self.rank.get_ranks(self.words_graph)
        self.sorted_word_rank_idx = sorted(self.word_rank_idx, key=lambda k: self.word_rank_idx[k], reverse=True)

    def summarize(self, sent_num=3):
        """Return up to `sent_num` top-ranked sentences, in their original text order."""
        # Slicing already caps the count at the number of ranked sentences.
        chosen = sorted(self.sorted_sent_rank_idx[:sent_num])
        return [self.sentences[idx] for idx in chosen]

    def keywords(self, word_num=20):
        """Return up to `word_num` (word, rank) pairs, highest rank first.

        Uses the word ranking computed in the constructor instead of solving
        the rank system again on every call.
        """
        top = self.sorted_word_rank_idx[:word_num]
        return [(self.idx2word[idx], self.word_rank_idx[idx]) for idx in top]



In [None]:
 def making_sentence(text) :
     """Split `text` into sentences and each sentence into lower-cased tokens.

     "?" and "=" are removed first. Each sentence is split on the characters
     & ~ " ' ( . ; , ) and space; the punctuation delimiters are kept as
     tokens, while spaces and any piece that is empty after stripping
     (e.g. a lone tab or newline) are dropped.

     Returns a list of token lists, or [] when `text` is not a non-empty str.
     """
     if not isinstance(text, str) or not text :
         return []

     text = text.replace("?", "").replace("=", "")

     result = []
     for sent in nltk.sent_tokenize(text) :
         pieces = re.split(r'([&~"\'\(.;,\) ])', sent)
         # Strip before filtering so whitespace-only pieces do not survive as "" tokens.
         tokens = [piece.strip().lower() for piece in pieces]
         result.append([token for token in tokens if token])

     return result

In [None]:
def w2v(place, s_idx, w_idx) :
    """Train a Word2Vec model on one record's sentences and look up its keywords.

    Args:
        place: (name, fields) pair.
        s_idx: position in `fields` of the tokenised sentences (list of token lists).
        w_idx: position in `fields` of the keywords; the first item of each
            entry is the word.

    Returns:
        List of (word, vector) for every keyword the model knows. [] when the
        record lacks the fields or the model cannot be trained on its sentences.
    """
    result = []

    try :
        sentences = place[1][s_idx]
        words = place[1][w_idx]
        # The notebook only does `import gensim`; the bare name `word2vec` is
        # never defined, so the class is reached through the gensim package.
        model = gensim.models.Word2Vec(sentences)

        for entry in words :
            try :
                word = entry[0]
                result.append((word, model.wv[word]))
            except (KeyError, IndexError, TypeError) :
                # Keyword missing from the trained vocabulary, or malformed entry.
                continue

        return result

    except (IndexError, KeyError, TypeError, ValueError, RuntimeError) :
        # Data problems only (missing field, non-iterable sentences, vocabulary
        # too small to train). Programming errors such as NameError now surface
        # instead of silently producing [].
        return []

In [None]:
def w2v2(place, s_idx, w_idx, model) :
    """Look up one record's keywords in an already trained model.

    Args:
        place: (name, fields) pair.
        s_idx: position in `fields` of the sentences. The sentences are not used
            for the lookup, but a record lacking this field yields [].
        w_idx: position in `fields` of the keywords; the first item of each
            entry is the word.
        model: object exposing `model.wv[word]`.

    Returns:
        List of (word, vector) for every keyword found in `model.wv`; [] when
        the record lacks the requested fields.
    """
    result = []

    try :
        place[1][s_idx]  # presence check only
        words = place[1][w_idx]

        for entry in words :
            try :
                word = entry[0]
                result.append((word, model.wv[word]))
            except (KeyError, IndexError, TypeError) :
                # Keyword unknown to the model, or malformed entry: skip it.
                continue

        return result

    except (IndexError, KeyError, TypeError) :
        # Missing or non-iterable fields. KeyboardInterrupt and programming
        # errors are no longer swallowed.
        return []

In [None]:
def making_vector(idx, vector) :
    """Return the item of `vector` stored at key/position `idx`."""
    selected = vector[idx]
    return selected

In [None]:
def dist (A,B):
    """Return (B[0], norm of A[1] - B[1]); for 1-D vectors this is the Euclidean distance.

    A and B are (label, vector) pairs; only B's label is reported.
    """
    vec_a = np.asarray(A[1])
    vec_b = np.asarray(B[1])
    gap = np.linalg.norm(vec_a - vec_b)
    return (B[0], gap)

In [None]:
def dist2 (A,B):
    """Return the cosine of the angle between A[1] and B[1].

    A and B are (label, vector) pairs. Despite the name, this is a similarity:
    1 for parallel vectors, 0 for orthogonal ones.
    """
    vec_a, vec_b = A[1], B[1]
    norm_a = math.sqrt(np.dot(vec_a, vec_a))
    norm_b = math.sqrt(np.dot(vec_b, vec_b))
    return np.dot(vec_a, vec_b) / (norm_a * norm_b)

In [None]:
# Tokenise both wiki text columns into lists of token lists.
df["f_split"] = df["f_wiki"].map(making_sentence)
df["s_split"] = df["s_wiki"].map(making_sentence)

df.head()

In [None]:
# Flatten every row of df into (first column, (the next ten columns)).
# itertuples yields plain positional tuples, which avoids integer keys on a
# label-indexed Series (deprecated positional fallback) and avoids building
# one Series per row.
result = [
    (row[0], tuple(row[1:11]))
    for row in df.itertuples(index=False, name=None)
]

In [None]:
# For every record whose two wiki texts are non-blank, run TextRank on both
# texts and append the keyword lists to the record's ten fields.
# Records that fail are collected in err_lst as (place, (field 0, field 1)).
err_lst = []
result2 = []

for place in result :
    fields = place[1]

    if fields[6] in ("", " ") or fields[7] in ("", " ") :
        continue

    try :
        f_textrank = TextRank(fields[6])
        s_textrank = TextRank(fields[7])

        f_keyword = f_textrank.keywords()
        s_keyword = s_textrank.keywords()

    except Exception :
        # Exception (not a bare except) so that interrupting the kernel still
        # stops this long-running loop.
        err_lst.append((place[0], (fields[0], fields[1])))
        print(place[0],"err")

    else :
        result2.append((place[0], tuple(fields[:10]) + (f_keyword, s_keyword)))
    
    

In [None]:
# One row per record, indexed by place name, columns 0..11.
# DataFrame.from_items no longer exists in pandas >= 1.0; building the rows
# directly also keeps records that share the same place name.
wiki_w2v = pd.DataFrame(
    [fields for name, fields in result2],
    index=[name for name, fields in result2],
)

print(len(wiki_w2v))
wiki_w2v.head()

In [None]:
# Per record: the first four fields, word vectors for the keywords of each
# text (fields 8/9 hold the tokenised sentences, 10/11 the keyword lists),
# and the two keyword lists themselves.
result3 = []

for place in result2 :
    fields = place[1]
    result3.append((place[0], (fields[0], fields[1], fields[2], fields[3], w2v(place, 8, 10), w2v(place, 9, 11), fields[10], fields[11])))

# DataFrame.from_items no longer exists in pandas >= 1.0, so the frame is built
# row-wise with its final column names.
wiki_w2v_tfidf = pd.DataFrame(
    [fields for name, fields in result3],
    index=[name for name, fields in result3],
    columns=["nature", "city", "star", "review", "f_w2v", "s_w2v", "f_tfidf", "s_tfidf"],
)

print(len(wiki_w2v_tfidf))

# Move the place names from the index into a regular "place" column.
wiki_w2v_tfidf = wiki_w2v_tfidf.reset_index().rename(columns = {"index" : "place"})
wiki_w2v_tfidf.head(10).f_w2v[9]

In [None]:
# Display the first record of wiki_w2v_tfidf for inspection.
wiki_w2v_tfidf.iloc[0]

In [None]:
# One 100-dimensional vector per record: the sum of the first text's keyword
# vectors, each weighted by that keyword's score.
# NOTE(review): 100 must equal the word-vector length; w2v() builds Word2Vec
# with default settings, whose vector size is presumably 100 (confirm).
vector = []

# Columns are read by name, and the function name `w2v` is not reused as a
# loop variable, so the cell that calls w2v() can be re-run after this one.
for word_vectors, keyword_scores in zip(wiki_w2v_tfidf["f_w2v"], wiki_w2v_tfidf["f_tfidf"]) :
    V = np.zeros(100)
    tfidf = dict(keyword_scores)

    for word, vec in word_vectors :
        weight = tfidf[word]
        if weight > 0 :
            V += weight * np.asarray(vec)

    vector.append(V)

In [None]:
# `vector` holds one array per row in row order, so it is assigned positionally.
# This removes the placeholder column of zeros and does not depend on the
# index labels being usable as list positions.
wiki_w2v_tfidf["vector"] = vector
wiki_w2v_tfidf.head()

In [None]:
# Copy of the frame without the second text's word vectors and keyword scores.
wiki_w2v_tfidf2 = wiki_w2v_tfidf.drop(columns=["s_w2v", "s_tfidf"])
print(len(wiki_w2v_tfidf2))
wiki_w2v_tfidf2.head()

In [None]:
# (0, components) pairs, one per record.
vector2 = [(0, vec.tolist()) for vec in vector]

# One row per record and one column per component (0..99), followed by an
# explicit "index" column holding the row number.
# DataFrame.from_items no longer exists in pandas >= 1.0, so the rows are
# passed to the constructor directly.
vector_df = pd.DataFrame([values for key, values in vector2])
vector_df.index.name = "index"
vector_df = vector_df.reset_index()

print(len(vector_df.columns))
vector_df.head(30)

In [None]:
# Column roles in vector_df: "index" is the row-number column, 0..99 are the vector components.
target = ["index"]
feature = list(range(100))

# NOTE(review): this rebinds the name `cluster`, which the first cell imported
# from sklearn (`from sklearn import cluster, ...`); the sklearn module is no
# longer reachable under that name after this cell runs.
cluster = hdbscan.HDBSCAN()
# NOTE(review): the row-number column is passed as the second argument;
# presumably it is ignored by the fit — confirm against the hdbscan docs.
cluster.fit(vector_df[feature], vector_df[target])

In [None]:
# Tally how many rows received each cluster label, then list the labels from
# most to least frequent.
label = {}

for idx in list(cluster.labels_) :
    label[idx] = label.get(idx, 0) + 1

label_lst = sorted(label.items(), key = lambda item : item[1], reverse = True)
label_lst

In [None]:
# Print the labels_ array of the fitted clusterer.
print(cluster.labels_)