In [745]:
import pandas as pd
import nltk
import numpy as np
import wikipedia
import math

import requests
from bs4 import BeautifulSoup  
from urllib.request import urlopen

import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import datetime
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import gensim

%matplotlib inline

import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk
import hdbscan

import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns
sns.set()
sns.set_style("whitegrid")
sns.set_color_codes()

from sklearn import cluster, datasets, mixture
from sklearn.cluster import KMeans

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [3]:
!pip install gensim

Collecting gensim
  Downloading gensim-3.2.0-cp36-cp36m-win_amd64.whl (15.5MB)
Collecting smart-open>=1.2.1 (from gensim)
  Downloading smart_open-1.5.6.tar.gz
Collecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading bz2file-0.98.tar.gz
Collecting boto3 (from smart-open>=1.2.1->gensim)
  Downloading boto3-1.5.6-py2.py3-none-any.whl (128kB)
Collecting jmespath<1.0.0,>=0.7.1 (from boto3->smart-open>=1.2.1->gensim)
  Downloading jmespath-0.9.3-py2.py3-none-any.whl
Collecting botocore<1.9.0,>=1.8.20 (from boto3->smart-open>=1.2.1->gensim)
  Downloading botocore-1.8.20-py2.py3-none-any.whl (4.0MB)
Collecting s3transfer<0.2.0,>=0.1.10 (from boto3->smart-open>=1.2.1->gensim)
  Downloading s3transfer-0.1.12-py2.py3-none-any.whl (59kB)
Building wheels for collected packages: smart-open, bz2file
  Running setup.py bdist_wheel for smart-open: started
  Running setup.py bdist_wheel for smart-open: finished with status 'done'
  Stored in directory: C:\Users\LG_\AppData\Local\pip\Cache\whee

In [602]:
df = pd.read_csv("travel_wiki_en_nlp1.csv", encoding = "cp949")
df = df.rename(columns = {"Unnamed: 0" : "place", "0" : "nature", "1" : "city", "2" : "star", "3" : "review", "4" : "short_info", "5": "long_info", "6" : "f_wiki", "7" : "s_wiki"})
df.head()

Unnamed: 0,place,nature,city,star,review,short_info,long_info,f_wiki,s_wiki
0,Buen Retiro Park,스페인,Madrid,4.6,24568,Vast 19th-century park with fountains,"Expansive, 19th-century park with boating lake...",The Buen Retiro Park (Spanish: Parque del Buen...,The Buen Retiro Park (Spanish: Parque del Buen...
1,Royal Palace of Madrid,스페인,Madrid,4.4,6566,"Regal 2,000-room palace, armory &amp; garden","18th-century, ridge-top palace for state occas...",The Royal Palace of Madrid (Spanish: Palacio R...,The Royal Palace of Madrid (Spanish: Palacio R...
2,"Plaza Mayor, Madrid",스페인,Madrid,4.3,17375,Madrid's vibrant main square,Cafes &amp; restaurants line the arches of thi...,The Plaza Mayor (English Main Square) was buil...,The Plaza Mayor (English Main Square) was buil...
3,Museo Nacional Del Prado,스페인,Madrid,4.6,13871,World-class European art collection,"Art museum with Velazquez, Goya and El Greco m...",The Prado Museum (Spanish pronunciation: [mu?s...,The Prado Museum (Spanish pronunciation: [mu?s...
4,Puerta del Sol,스페인,Madrid,4.3,26932,Vast pedestrianized public square,Public square with an equine statue of King Ca...,"The Puerta del Sol (Spanish for ""Gate of the S...","The Puerta del Sol (Spanish for ""Gate of the S..."


In [603]:
class SentenceTokenizer() :
    def __init__(self, text) :
        self.text = text
        self.sentence = []
        self.noun = []
        self.word_count = []
        
    def making_sentence(self) :
        self.text = self.text.replace("?", "").replace("=", "")
        self.sentence = nltk.sent_tokenize(self.text)
    
    def making_noun(self) :
        for sentence in self.sentence :
            if sentence not in [" ", ""] :
                sentence = sentence.replace("'", "")
                tokens = nltk.word_tokenize(sentence)
                tagged = nltk.pos_tag(tokens)
                nn_tagged = list(filter(lambda x: "NN" in x[1], tagged))
                nn_tagged = list(map(lambda x: x[0], nn_tagged))
                self.noun.append(" ".join(nn_tagged))
                
    def word_counter(self) :
        dic = {}
        
        for noun in self.noun :
            noun_lst = noun.split()
            
            for n in noun_lst :
                if n in dic :
                    dic[n] += 1
                else :
                    dic[n] = 1
        
        self.word_count = sorted(dic.items(), reverse = True, key = lambda x : x[1])
        
    def get_sentence(self) :
        return self.sentence

    def get_noun(self) :
        return self.noun
    
    def get_word_count(self) :
        return self.word_count
    

In [604]:
class GraphMatrix() :
    def __init__(self):
        self.tfidf = TfidfVectorizer()
        self.cnt_vec = CountVectorizer()
        self.graph_sentence = []
    
    def build_sent_graph(self, sentence):
        tfidf_mat = self.tfidf.fit_transform(sentence).toarray()
        self.graph_sentence = np.dot(tfidf_mat, tfidf_mat.T)
        
        return self.graph_sentence
    
    def build_words_graph(self, sentence):
        cnt_vec_mat = normalize(self.cnt_vec.fit_transform(sentence).toarray().astype(float), axis=0)
        vocab = self.cnt_vec.vocabulary_
        
        return np.dot(cnt_vec_mat.T, cnt_vec_mat), {vocab[word] : word for word in vocab}

In [605]:
class Rank(object):
    def get_ranks(self, graph, d=0.85): # d = damping factor
        A = graph
        matrix_size = A.shape[0]
        
        for id in range(matrix_size):
            A[id, id] = 0 # diagonal 부분을 0으로
            link_sum = np.sum(A[:,id]) # A[:, id] = A[:][id]
            
            if link_sum != 0:
                A[:, id] /= link_sum
                A[:, id] *= -d
                A[id, id] = 1
        B = (1-d) * np.ones((matrix_size, 1))
        
        new_A = A+0.00001*np.random.rand(len(A), len(A)) # noise 발생
        ranks = np.linalg.solve(new_A, B) # 연립방정식 Ax = b
        
        return {idx: r[0] for idx, r in enumerate(ranks)}



In [606]:
class TextRank(object):
    def __init__(self, text):
        self.text = SentenceTokenizer(text)
        self.text.making_sentence()
        self.text.making_noun()
        self.text.word_counter()
            
        self.sentences = self.text.get_sentence()
        self.nouns = self.text.get_noun()
        self.counts = self.text.get_word_count()
        
        self.graph_matrix = GraphMatrix()
        self.sent_graph = self.graph_matrix.build_sent_graph(self.nouns)
        self.words_graph, self.idx2word = self.graph_matrix.build_words_graph(self.nouns)
        
        self.rank = Rank()
        self.sent_rank_idx = self.rank.get_ranks(self.sent_graph)
        self.sorted_sent_rank_idx = sorted(self.sent_rank_idx, key=lambda k: self.sent_rank_idx[k], reverse=True)
        
        self.word_rank_idx = self.rank.get_ranks(self.words_graph)
        self.sorted_word_rank_idx = sorted(self.word_rank_idx, key=lambda k: self.word_rank_idx[k], reverse=True)
        
    def summarize(self, sent_num=3):
        summary = []
        index=[]
        
        if sent_num > len(self.sorted_sent_rank_idx) :
            sent_num = len(self.sorted_sent_rank_idx)
        
        for idx in self.sorted_sent_rank_idx[:sent_num]:
            index.append(idx)
        index.sort()
        
        for idx in index:
            summary.append(self.sentences[idx])
            
        return summary
    
    def keywords(self, word_num=20):
        
        if word_num > len(self.sorted_word_rank_idx) :
            word_num = len(self.sorted_word_rank_idx)
        
        rank = Rank()
        rank_idx = rank.get_ranks(self.words_graph)
        sorted_rank_idx = sorted(rank_idx, key=lambda k: rank_idx[k], reverse=True)
        keywords = []
        index=[]
        
        for idx in sorted_rank_idx[:word_num]:
            index.append(idx)
            
        #index.sort()
        for idx in index:
            keywords.append((self.idx2word[idx], rank_idx[idx]))
            
        return keywords



In [607]:
 def making_sentence(text) :
        result = []
        
        if type(text) != str or len(text) == 0 :
            return []
        
        text = text.replace("?", "").replace("=", "")
        sentence = nltk.sent_tokenize(text)
        
        for sent in sentence :
            sent = [x.strip().lower() for x in re.split(r'([&~"\'\(.;,\) ])', sent) if x not in [" ", ""]] 
            result.append(sent)
               
        return result

In [608]:
def w2v(place, s_idx, w_idx) :
    result = []
    
    try :
        sentences = place[1][s_idx]
        words =  place[1][w_idx]
        model = word2vec.Word2Vec(sentences)

        for idx in words :
            try :
                vec = model.wv[idx[0]]
                result.append((idx[0], vec))
            except :
                continue

        return result
    
    except :
        return []

In [609]:
def w2v2(place, s_idx, w_idx, model) :
    result = []
    
    try :
        sentences = place[1][s_idx]
        words =  place[1][w_idx]

        for idx in words :
            try :
                vec = model.wv[idx[0]]
                result.append((idx[0], vec))
            except :
                continue

        return result
    
    except :
        return []

In [610]:
def making_vector(idx, vector) :
    return vector[idx]

In [709]:
def dist (A,B):
    return (B[0], np.linalg.norm(np.asarray(A[1])-np.asarray(B[1])))

In [746]:
def dist2 (A,B):
    return np.dot(A[1], B[1]) / (math.sqrt(np.dot(A[1],A[1])) * math.sqrt(np.dot(B[1],B[1]))) 

In [611]:
df["f_split"] = df.f_wiki.map(lambda x : making_sentence(x))
df["s_split"] = df.s_wiki.map(lambda x : making_sentence(x))

df.head()

Unnamed: 0,place,nature,city,star,review,short_info,long_info,f_wiki,s_wiki,f_split,s_split
0,Buen Retiro Park,스페인,Madrid,4.6,24568,Vast 19th-century park with fountains,"Expansive, 19th-century park with boating lake...",The Buen Retiro Park (Spanish: Parque del Buen...,The Buen Retiro Park (Spanish: Parque del Buen...,"[[the, buen, retiro, park, (, spanish:, parque...","[[the, buen, retiro, park, (, spanish:, parque..."
1,Royal Palace of Madrid,스페인,Madrid,4.4,6566,"Regal 2,000-room palace, armory &amp; garden","18th-century, ridge-top palace for state occas...",The Royal Palace of Madrid (Spanish: Palacio R...,The Royal Palace of Madrid (Spanish: Palacio R...,"[[the, royal, palace, of, madrid, (, spanish:,...","[[the, royal, palace, of, madrid, (, spanish:,..."
2,"Plaza Mayor, Madrid",스페인,Madrid,4.3,17375,Madrid's vibrant main square,Cafes &amp; restaurants line the arches of thi...,The Plaza Mayor (English Main Square) was buil...,The Plaza Mayor (English Main Square) was buil...,"[[the, plaza, mayor, (, english, main, square,...","[[the, plaza, mayor, (, english, main, square,..."
3,Museo Nacional Del Prado,스페인,Madrid,4.6,13871,World-class European art collection,"Art museum with Velazquez, Goya and El Greco m...",The Prado Museum (Spanish pronunciation: [mu?s...,The Prado Museum (Spanish pronunciation: [mu?s...,"[[the, prado, museum, (, spanish, pronunciatio...","[[the, prado, museum, (, spanish, pronunciatio..."
4,Puerta del Sol,스페인,Madrid,4.3,26932,Vast pedestrianized public square,Public square with an equine statue of King Ca...,"The Puerta del Sol (Spanish for ""Gate of the S...","The Puerta del Sol (Spanish for ""Gate of the S...","[[the, puerta, del, sol, (, spanish, for, "", g...","[[the, puerta, del, sol, (, spanish, for, "", g..."


In [612]:
result = []

for idx in range(len(df)):
    temp = df.iloc[idx]
    result.append((temp[0],(temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], temp[7], temp[8], temp[9], temp[10])))

In [613]:
err_lst = []
result2 = []

for idx, place in enumerate(result) :
    if place[1][6] in ("", " ")  : 
        continue
    
    try :
        if place[1][7] in ("", " ") :
            continue
            
        else :
            f_textrank = TextRank(place[1][6])
            s_textrank = TextRank(place[1][7])   

            f_keyword = f_textrank.keywords()
            s_keyword = s_textrank.keywords()

            result2.append((place[0], (place[1][0], place[1][1], place[1][2], place[1][3], place[1][4], place[1][5], place[1][6], place[1][7], place[1][8], place[1][9], f_keyword, s_keyword)))    

    except :
        err_lst.append((place[0], (place[1][0], place[1][1])))
        print(place[0],"err")
    
    

Barrio de La Latina err
Museum of the Americas err
Fundaci? Joan Mir? err
Arc de Triomf err
Gaudi House Museum err
Pla?a d'Espanya, Barcelona err
Parc del Laberint d'Horta err
Magic Fountain of Montju?c err
Casa Vicens err
Barcelona Museum of Contemporary Art err
Pla?a Reial err
Port of Barcelona err
FC Barcelona Museum err
CosmoCaixa Barcelona err
CaixaForum Barcelona err
Casa Amatller err
Barcelona Pavilion err
Palau Nacional err
Avinguda Diagonal err
Centre de Cultura Contempor?nia de Barcelona err
Monastery of Pedralbes err
Barcelona City History Museum err
Museu Mar?tim de Barcelona err
Museu Frederic Mar?s err
Design Museum of Barcelona err
Museum of the History of Catalonia err
Casa Batll? err
Parc de la Ciutadella err
Barcelona Cathedral err
Santa Maria del Mar, Barcelona err
Passeig de Gr?cia, Barcelona err
Barcelona Zoo err
El Ba?uelo err
House of Shots err
Casa del Chapiz err
Centro Jos? Guerrero err
Peinador de la Reina err
Aljibe De Trillo err
Mirador San Miguel Alto err
P

CAAM - Atlantic Center of Modern Art err
Santa Catalina Park err
Doramas Park err
El Confital err
La Puntilla (Playa de Las Canteras) err
Bah?a del Confital err
Cicca Cultural Center err
Fundaci?n de Arte y Pensamiento Mart?n Chirino - Castillo de la Luz err
Palacete Rodriguez Quegles err
Castle Of San Critobal err
Mar?timo Santa Catalina Park err
Columbus Square err
Farrujia Exposici?n Las Palmas err
Monumento a las actividades primitivas canarias, de Luis Alem?n Montull err
WAYOUT Room Escape Las Palmas err
Albaola Factor?a Mar?tima Vasca err
Eureka! Zientzia Museoa err
Aiete Park err
Chillida-Leku err
Torreon de igueldo err
Alderdi Eder Parkea err
Zurriola badia / Bah?a Zurriola err
San Vicente err
Mota Castle err
Izurun escape room err
MATER Museoa err
GUARDETXEA err
LA CONCHA err
Museo de la Sidra Vasca err
Mirador del monte Igueldo err
Puerto Pasajes err
Galer?a Kur err
Bateria De Las Damas err
Paseo de los curas. Mirador err
Vigozoo err
Verbum - Casa das Palabras err
Villa Roman

Sewing kit. Museum of M?rida err
Zona Arqueol?gica de Morer?a err
Presa Romana de Proserpina err
Temple of Diana err
Arch of Trajan err
Casa Mitreo err
Cornalvo Natural Park err
Portico of the Municipal Forum of Augusta Emerita err
Club de Senderismo Em?rita Augusta err
Centro de Interpretacion los Columbarios err
Centro De Interpretacion Del Circo Romano err
Iglesia de Santa Luc?a del Trampal err
Roman Wall and Albarrana Islamic Tower UNESCO WHS err
Convento De Las Freylas err
Termas Romanas err
Torre Romana De Agua De Decantacion Del Acueducto De Proserpina err
Obelisco De Santa Eulalia err
Parque de los Centollos err
Praemerita y Geomerita err
Cripta Santa Eulalia err
Gimnasio y Termas romanas err
Palacio polentinos err
Convento de San Jos? err
Capilla de Mos?n Rub? err
Church of San Juan Bautista err
Palacio de los Superunda err
Hornos postmedievales err
Church of San Andr?s err
Puerta de San Isidro o de la Malaventura en la Muralla de ?vila err
Santo Tom? (Almac?n Visible Del Muse

Los Toru?os Natural Park err
Sala Museo Hospitalito err
PICOBARRO err
Grant err
Underwater Archaeology Center Headquarters (C.A.S.) err
Playa de la Costilla err
Callej?n del Obispo err
Playa De Las Redes err
Illa das Esculturas err
Mosteiro de Santa Mar?a da Armenteira err
Parroquia de San Bartolom? err
Praza Da Le?a err
Loro Ravachol err
Xard?ns de Vicenti err
Estatua Vendedora de Gallinas err
Monumento Al Violinista Manuel Quiroga err
Praia Da Pinela err
CITA - Centro de Interpretaci?n das Torres Arcebispais err
pazo mugartegui err
Puerto Deportivo de Pontevedra err
Plaza de La Peregrina err
Recinto Ferial de Pontevedra err
BRAINBOX Pontevedra err
Monumento a los H?roes de Ponte Sampaio err
Maranmar Totos Park S L U err
LIC R?o L?rez err
Area Da Covicha err
Ruta da pedra e da auga err
Salinas de Porto Santo err
Praia Da Canteira err
Praia fluvial do L?rez err
Praia de Fontemaior err
Monte Castrove err
Praia do La?o err
Praia de Chancelas err
Praia do Covelo err
Praia do Polvor?n err


Sociedade Martins Sarmento err
Plataforma das Artes e da Criatividade err
Museu de Arte Primitiva Moderna err
Igreja de Nossa Senhora da Consola??o e Santos Passos err
Largo do Toural err
Parque Aqu?tico de Fafe err
Live Science Center of Guimar?es err
Piscinas Scorpio err
Zona de Couros err
Pal?cio da Justi?a err
Largo das Tangerinas err
Homenagem a S. Francisco err
Pra?a da Rep?blica err
Vim?gua, EIM, SA err
Tanques P?blicos err
Torre dos Almadas err
Parque das Termas de Vizela err
Parque de Lazer err
Fornos Olaria Da Cruz De Pedra err
Padr?o de Aljubarrota err
Fafe Theater Film err
Centro de Estudos Camilianos err
Largo Rep?blica do Brasil err
Santu?rio de Santa Luzia err
Santuario de Nossa Senhora da Peneda err
Arnado Park err
Granaries of Soajo err
Cit?nia Santa Luzia err
Museu do Brinquedo Portugu?s err
Museum of Art and Archeology of Viana do Castelo err
Casa dos Nichos err
Museu Municipal err
Canto Marinho beach err
Mezio Door err
Funiculaire de Santa Luzia err
Serra de Arga er

Barranco das Canas Beach err
Prainha err
Phare de Ponta do Altar err
Convento Nossa Senhora De Desterro err
Marina Portim?o err
Alternativtour, Lda err
Alcalar Megalithic Site err
Father Vicente Beach err
Tren tur?stico portimao err
Praia do Torrado err
AQUAFUN PRODUCTS | Water Parks err
MONUMENTO DAS MULHERES err
Passadi?os de Alvor err
Three Castles beach in Portim?o err
Parque Do Gato Preto-parques De Recreio Lda err
Museu Cargaleiro err
Museu Francisco Tavares Proen?a J?nior err
Miradouro de S?o Gens err
Informais do CAACB err
Solar dos Bejas err
Loki Park err
Museu do Canteiro err
Solar dos Motas err
Museu da Seda err
Barragem de Santa ?gueda err
Miradouro do Suberco err
North Beach err
Norpark - Aquatic Amusement Nazar? err
Museu Dr. Joaquim Manso err
Salgado beach err
Canh?o da Nazar? err
Mirador err
Victory Walls Beach err
Sitio Clasificado del Monte de S?o Bartolomeu err
L?gua the beach err
Praia da Gralha err
Praia de Vale Furado err
Parque de Merendas Pinhal da Casa de Nossa

In [614]:
wiki_w2v = pd.DataFrame.from_items(result2).T

print(len(wiki_w2v))
wiki_w2v.head()

2040


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Buen Retiro Park,스페인,Madrid,4.6,24568,Vast 19th-century park with fountains,"Expansive, 19th-century park with boating lake...",The Buen Retiro Park (Spanish: Parque del Buen...,The Buen Retiro Park (Spanish: Parque del Buen...,"[[the, buen, retiro, park, (, spanish:, parque...","[[the, buen, retiro, park, (, spanish:, parque...","[(gardens, 2.74041635524), (retiro, 2.62102849...","[(park, 1.15077210271), (madrid, 1.0921542155)..."
Royal Palace of Madrid,스페인,Madrid,4.4,6566,"Regal 2,000-room palace, armory &amp; garden","18th-century, ridge-top palace for state occas...",The Royal Palace of Madrid (Spanish: Palacio R...,The Royal Palace of Madrid (Spanish: Palacio R...,"[[the, royal, palace, of, madrid, (, spanish:,...","[[the, royal, palace, of, madrid, (, spanish:,...","[(palace, 3.53693471549), (royal, 2.8566952040...","[(days, 38223.0232702), (ii, 0.131119025127), ..."
"Plaza Mayor, Madrid",스페인,Madrid,4.3,17375,Madrid's vibrant main square,Cafes &amp; restaurants line the arches of thi...,The Plaza Mayor (English Main Square) was buil...,The Plaza Mayor (English Main Square) was buil...,"[[the, plaza, mayor, (, english, main, square,...","[[the, plaza, mayor, (, english, main, square,...","[(plaza, 3.08842559224), (mayor, 2.46901197894...","[(plaza, 1.71915441689), (mayor, 1.60124506815..."
Museo Nacional Del Prado,스페인,Madrid,4.6,13871,World-class European art collection,"Art museum with Velazquez, Goya and El Greco m...",The Prado Museum (Spanish pronunciation: [mu?s...,The Prado Museum (Spanish pronunciation: [mu?s...,"[[the, prado, museum, (, spanish, pronunciatio...","[[the, prado, museum, (, spanish, pronunciatio...","[(historians, 26781.0014629), (clsicas, 8266.5...","[(museum, 1.81501657445), (collection, 1.69899..."
Puerta del Sol,스페인,Madrid,4.3,26932,Vast pedestrianized public square,Public square with an equine statue of King Ca...,"The Puerta del Sol (Spanish for ""Gate of the S...","The Puerta del Sol (Spanish for ""Gate of the S...","[[the, puerta, del, sol, (, spanish, for, "", g...","[[the, puerta, del, sol, (, spanish, for, "", g...","[(square, 2.4363239799), (city, 1.93191707588)...","[(square, 1.36724962855), (new, 0.999176127687..."


In [615]:
result3 = []

for place in result2 :
    result3.append((place[0], (place[1][0],place[1][1], place[1][2], place[1][3], w2v(place, 8, 10), w2v(place, 9, 11), place[1][10], place[1][11])))
    

In [616]:
wiki_w2v_tfidf = pd.DataFrame.from_items(result3).T

print(len(wiki_w2v_tfidf))
wiki_w2v_tfidf = wiki_w2v_tfidf.rename(columns = {0 : "nature", 1 : "city", 2 : "star", 3 : "review", 4 : "f_w2v", 5 : "s_w2v", 6 : "f_tfidf", 7 : "s_tfidf"})
wiki_w2v_tfidf.head(30)

2040


Unnamed: 0,nature,city,star,review,f_w2v,s_w2v,f_tfidf,s_tfidf
Buen Retiro Park,스페인,Madrid,4.6,24568,"[(gardens, [2.56225e-05, 0.00369755, -0.000636...",[],"[(gardens, 2.74041635524), (retiro, 2.62102849...","[(park, 1.15077210271), (madrid, 1.0921542155)..."
Royal Palace of Madrid,스페인,Madrid,4.4,6566,"[(palace, [-0.0255477, 0.000696889, 0.0213513,...",[],"[(palace, 3.53693471549), (royal, 2.8566952040...","[(days, 38223.0232702), (ii, 0.131119025127), ..."
"Plaza Mayor, Madrid",스페인,Madrid,4.3,17375,"[(plaza, [-0.00331517, -0.0046148, 0.00355219,...","[(plaza, [-0.00319899, -0.00460728, 0.00340466...","[(plaza, 3.08842559224), (mayor, 2.46901197894...","[(plaza, 1.71915441689), (mayor, 1.60124506815..."
Museo Nacional Del Prado,스페인,Madrid,4.6,13871,[],"[(museum, [-0.00181003, -0.00478166, 0.0040637...","[(historians, 26781.0014629), (clsicas, 8266.5...","[(museum, 1.81501657445), (collection, 1.69899..."
Puerta del Sol,스페인,Madrid,4.3,26932,"[(square, [-0.00351785, -3.81661e-05, 0.001037...",[],"[(square, 2.4363239799), (city, 1.93191707588)...","[(square, 1.36724962855), (new, 0.999176127687..."
Museo Nacional Centro de Arte Reina Sof?a,스페인,Madrid,4.3,5500,"[(museum, [-0.00186638, -0.0048302, 0.00423442...","[(art, [0.00485215, 0.00136403, 0.00144055, 0....","[(museum, 2.02431245209), (collection, 1.62662...","[(collection, 1.59193627516), (lucio, 1.406124..."
Temple of Debod,스페인,Madrid,4.5,5189,"[(en, [-0.0025698, -0.000160637, -0.00052282, ...","[(temple, [-0.00225771, 0.00449442, -0.0031738...","[(pp, 31765.1521912), (july, 15294.7842843), (...","[(temple, 1.84474327867), (construction, 1.428..."
El Escorial,스페인,Madrid,4.5,1072,"[(el, [-0.0494335, 0.0680808, -0.00638605, 0.0...",[],"[(el, 3.50536361597), (escorial, 3.30009921883...","[(school, 38336.585844), (century, 0.080419256..."
Thyssen-Bornemisza Museum,스페인,Madrid,4.4,4382,[],"[(museum, [-0.00181172, -0.00478293, 0.0040500...","[(loan, 79060.4444489), (visitors, 0.008160612...","[(thyssen, 1.49474104508), (bornemisza, 1.3627..."
Gran V?a,스페인,Madrid,0.0,0,"[(gran, [0.000606109, 0.00400535, -0.000989276...","[(street, [0.00400025, -0.00343774, 0.00024387...","[(gran, 2.57478768056), (va, 2.53058220227), (...","[(street, 1.64910484668), (shopping, 1.4249973..."


In [839]:
wiki_w2v_tfidf = wiki_w2v_tfidf.reset_index().rename(columns = {"index" : "place"})
wiki_w2v_tfidf.head(10).f_w2v[9]

[('gran', array([  6.06109446e-04,   4.00535436e-03,  -9.89276450e-04,
          -4.18361014e-04,   4.05710330e-03,  -2.49189953e-03,
          -3.81678366e-03,   3.35630332e-03,   2.86242203e-03,
           4.58239840e-04,  -4.15466959e-03,  -2.36535864e-03,
           2.86282599e-03,  -5.39950840e-03,  -5.24647813e-03,
           2.76123756e-04,  -2.97647412e-03,  -3.30485171e-03,
           1.87940558e-03,  -4.52602981e-03,  -4.83016111e-03,
          -6.40220521e-03,  -1.54085469e-03,   3.06912727e-04,
           2.82704481e-03,  -2.64249765e-03,  -3.45905870e-03,
           1.91733870e-03,   3.92225059e-03,   2.95994151e-03,
           3.21496325e-03,   3.19459150e-03,   4.14034585e-03,
           3.26633191e-04,  -2.27039587e-03,  -3.77334352e-03,
          -1.60584623e-05,  -4.11748979e-03,  -8.21251306e-04,
          -5.11269365e-03,   1.86774007e-03,  -5.61997993e-04,
          -4.73591127e-03,   7.52889377e-04,  -2.84779374e-03,
          -4.36374638e-03,  -3.40603292e-03,  -

In [618]:
wiki_w2v_tfidf.iloc[0]

place                                       Buen Retiro Park
nature                                                   스페인
city                                                  Madrid
star                                                     4.6
review                                                 24568
f_w2v      [(gardens, [2.56225e-05, 0.00369755, -0.000636...
s_w2v                                                     []
f_tfidf    [(gardens, 2.74041635524), (retiro, 2.62102849...
s_tfidf    [(park, 1.15077210271), (madrid, 1.0921542155)...
Name: 0, dtype: object

In [619]:
vector = []


for idx in range(len(wiki_w2v_tfidf)) :
    V = np.asarray([float(0)]*100)
    
    w2v = wiki_w2v_tfidf.iloc[idx][5]
    tfidf = dict(wiki_w2v_tfidf.iloc[idx][7])
    
    for w in w2v :
        if  tfidf[w[0]] > 0 :
            #constant = wiki_w2v.iloc[idx][6].count(w[0])
            constant = 1
            v = constant * tfidf[w[0]] * np.asarray(w[1])
            V += v
    
    vector.append(V)

In [620]:
wiki_w2v_tfidf["vector"] = 0
wiki_w2v_tfidf["vector"] = wiki_w2v_tfidf.index.map(lambda x : making_vector(x, vector))
wiki_w2v_tfidf.head()

Unnamed: 0,place,nature,city,star,review,f_w2v,s_w2v,f_tfidf,s_tfidf,vector
0,Buen Retiro Park,스페인,Madrid,4.6,24568,"[(gardens, [2.56225e-05, 0.00369755, -0.000636...",[],"[(gardens, 2.74041635524), (retiro, 2.62102849...","[(park, 1.15077210271), (madrid, 1.0921542155)...","[-0.061719946294, 0.0268895732879, 0.000591166..."
1,Royal Palace of Madrid,스페인,Madrid,4.4,6566,"[(palace, [-0.0255477, 0.000696889, 0.0213513,...",[],"[(palace, 3.53693471549), (royal, 2.8566952040...","[(days, 38223.0232702), (ii, 0.131119025127), ...","[-0.705642409623, -0.0428558629937, 0.80886895..."
2,"Plaza Mayor, Madrid",스페인,Madrid,4.3,17375,"[(plaza, [-0.00331517, -0.0046148, 0.00355219,...","[(plaza, [-0.00319899, -0.00460728, 0.00340466...","[(plaza, 3.08842559224), (mayor, 2.46901197894...","[(plaza, 1.71915441689), (mayor, 1.60124506815...","[-0.0305745969526, 0.00276253220272, 0.0229227..."
3,Museo Nacional Del Prado,스페인,Madrid,4.6,13871,[],"[(museum, [-0.00181003, -0.00478166, 0.0040637...","[(historians, 26781.0014629), (clsicas, 8266.5...","[(museum, 1.81501657445), (collection, 1.69899...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Puerta del Sol,스페인,Madrid,4.3,26932,"[(square, [-0.00351785, -3.81661e-05, 0.001037...",[],"[(square, 2.4363239799), (city, 1.93191707588)...","[(square, 1.36724962855), (new, 0.999176127687...","[-0.0303156118898, -0.00380954073626, 0.011569..."


In [621]:
wiki_w2v_tfidf2 = wiki_w2v_tfidf.drop(["s_w2v", "s_tfidf"], axis = 1)
print(len(wiki_w2v_tfidf2))
wiki_w2v_tfidf2.head()

2040


Unnamed: 0,place,nature,city,star,review,f_w2v,f_tfidf,vector
0,Buen Retiro Park,스페인,Madrid,4.6,24568,"[(gardens, [2.56225e-05, 0.00369755, -0.000636...","[(gardens, 2.74041635524), (retiro, 2.62102849...","[-0.061719946294, 0.0268895732879, 0.000591166..."
1,Royal Palace of Madrid,스페인,Madrid,4.4,6566,"[(palace, [-0.0255477, 0.000696889, 0.0213513,...","[(palace, 3.53693471549), (royal, 2.8566952040...","[-0.705642409623, -0.0428558629937, 0.80886895..."
2,"Plaza Mayor, Madrid",스페인,Madrid,4.3,17375,"[(plaza, [-0.00331517, -0.0046148, 0.00355219,...","[(plaza, 3.08842559224), (mayor, 2.46901197894...","[-0.0305745969526, 0.00276253220272, 0.0229227..."
3,Museo Nacional Del Prado,스페인,Madrid,4.6,13871,[],"[(historians, 26781.0014629), (clsicas, 8266.5...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Puerta del Sol,스페인,Madrid,4.3,26932,"[(square, [-0.00351785, -3.81661e-05, 0.001037...","[(square, 2.4363239799), (city, 1.93191707588)...","[-0.0303156118898, -0.00380954073626, 0.011569..."


In [622]:
vector2 = list(map(lambda x: (0, x.tolist()), vector))

vector_df = pd.DataFrame.from_items(vector2).T
vector_df = vector_df.reset_index().drop("index", axis =1)
vector_df.index.names = ["index"]
vector_df = vector_df.reset_index()

print(len(vector_df.columns))
vector_df.head(30)

101


Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,0,-0.06172,0.02689,0.000591,0.024331,-0.071546,-0.063647,0.002086,-0.023568,0.00731,...,0.046319,0.03803,0.02695,0.089011,0.08475,0.107621,-0.016921,0.024135,0.04732,0.030627
1,1,-0.705642,-0.042856,0.808869,-0.207935,0.499195,-1.679368,-1.008534,-0.82247,0.755285,...,1.476892,2.096998,-0.615106,0.864907,1.069866,1.502155,0.133394,1.275935,0.867267,1.282705
2,2,-0.030575,0.002763,0.022923,-0.022205,0.014999,0.001847,-0.010259,0.023433,0.012934,...,0.014598,0.002821,-0.003264,-0.019943,-0.002694,0.007512,0.00237,0.012325,-0.015821,0.015184
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,-0.030316,-0.00381,0.01157,0.020908,-0.014236,-0.041524,-0.017729,0.0149,0.000894,...,0.01309,0.025467,0.016436,0.046832,0.020377,0.042822,0.025549,-0.007886,0.029003,-0.004963
5,5,0.021252,-0.016863,0.012896,0.001287,0.006267,-0.004923,-0.016963,0.014903,-0.023038,...,0.005173,-0.030247,-0.007084,-0.006501,-0.006374,0.007085,-0.006198,0.01871,0.008118,0.008812
6,6,-0.00172,-0.000107,-0.00035,-0.001296,0.001128,-0.002502,0.000968,0.001076,-0.001699,...,-0.002387,0.003108,0.000353,0.003066,0.001505,6.2e-05,-0.0024,6.3e-05,-0.001259,-0.000387
7,7,-1.303948,1.693123,-0.044307,0.385268,1.084179,-3.342061,-2.978087,-2.109627,4.003357,...,2.010747,3.228021,0.574354,3.269741,1.887131,2.676504,1.496499,3.391541,1.174889,2.638389
8,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9,0.022242,-0.03478,-0.018129,0.004928,0.013913,-0.048099,-0.035196,0.01305,0.003032,...,0.001653,0.03431,0.000798,0.01752,-0.00713,0.058072,0.003919,0.002573,0.009679,0.031597


In [623]:
target = ["index"]
feature = list(range(100))

In [624]:
cluster = hdbscan.HDBSCAN()

In [625]:
cluster.fit(vector_df[feature], vector_df[target])

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(cachedir=None),
    metric='euclidean', min_cluster_size=5, min_samples=None, p=None,
    prediction_data=False)

In [626]:
label = {}

for idx in list(cluster.labels_) :
    if idx in label :
        label[idx] += 1
    else :
        label[idx] = 1

label_lst = sorted(label.items(), reverse = True, key = lambda x : x[1])
label_lst

[(-1, 1291),
 (7, 689),
 (3, 12),
 (1, 10),
 (5, 7),
 (8, 7),
 (2, 7),
 (6, 7),
 (0, 5),
 (4, 5)]

In [627]:
list(cluster.labels_)

[-1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 3,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 -1,
 7,
 7,
 -1,
 7,
 3,
 7,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 7,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 5,
 -1,
 7,
 7,
 -1,
 7,
 7,
 7,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 7,
 7,
 7,
 7,
 -1,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 0,
 7,
 -1,
 7,
 7,
 -1,
 7,
 -1,
 7,
 -1,
 7,
 7,
 7,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 -1,
 7,
 7,
 7,
 -1,
 -1,
 7,
 -1,
 7,
 7,
 -1,
 7,
 7,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 7,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 7,
 7,
 -1,
 7,
 -1,
 5,
 -1,
 5,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 7,
 -1,
 7,
 -1,
 7,
 -1,
 7,
 -1,
 -1,
 -1,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 7,
 -1,
 -1,
 -1,
 

 # 모든 데이터를 한 번에 word2vec에 학습

In [628]:
extended_lst = []

for row in result3 :
    if type(row[1][6]) == list :
        extended_lst.extend(row[1][6])
    
print(len(extended_lst))

39971


In [629]:
model = word2vec.Word2Vec(extended_lst)

In [630]:
result4 = []

for place in result2 :
    result4.append((place[0], (place[1][0],place[1][1], place[1][2], place[1][3], w2v2(place, 8, 10, model), w2v2(place, 9, 11, model), place[1][10], place[1][11])))
    

In [631]:
wiki_w2v_tfidf_exd = pd.DataFrame.from_items(result4).T

print(len(wiki_w2v_tfidf_exd))
wiki_w2v_tfidf_exd = wiki_w2v_tfidf_exd.rename(columns = {0 : "nature", 1 : "city", 2 : "star", 3 : "review", 4 : "f_w2v", 5 : "s_w2v", 6 : "f_tfidf", 7 : "s_tfidf"})
wiki_w2v_tfidf_exd.head(30)

2040


Unnamed: 0,nature,city,star,review,f_w2v,s_w2v,f_tfidf,s_tfidf
Buen Retiro Park,스페인,Madrid,4.6,24568,"[(gardens, [0.00267356, 0.00339824, -0.0011210...","[(park, [0.00235135, 0.00398113, -0.00222354, ...","[(gardens, 2.74041635524), (retiro, 2.62102849...","[(park, 1.15077210271), (madrid, 1.0921542155)..."
Royal Palace of Madrid,스페인,Madrid,4.4,6566,"[(palace, [-0.00450117, 0.00233519, -0.0028386...","[(days, [-0.00482194, -0.00441774, 0.00288038,...","[(palace, 3.53693471549), (royal, 2.8566952040...","[(days, 38223.0232702), (ii, 0.131119025127), ..."
"Plaza Mayor, Madrid",스페인,Madrid,4.3,17375,"[(plaza, [-0.00319899, -0.00460728, 0.00340466...","[(plaza, [-0.00319899, -0.00460728, 0.00340466...","[(plaza, 3.08842559224), (mayor, 2.46901197894...","[(plaza, 1.71915441689), (mayor, 1.60124506815..."
Museo Nacional Del Prado,스페인,Madrid,4.6,13871,"[(historians, [-0.00176146, -0.00446614, 0.004...","[(museum, [-0.00181172, -0.00478293, 0.0040500...","[(historians, 26781.0014629), (clsicas, 8266.5...","[(museum, 1.81501657445), (collection, 1.69899..."
Puerta del Sol,스페인,Madrid,4.3,26932,"[(square, [-0.00258644, 0.000350998, 0.0008284...","[(square, [-0.00258644, 0.000350998, 0.0008284...","[(square, 2.4363239799), (city, 1.93191707588)...","[(square, 1.36724962855), (new, 0.999176127687..."
Museo Nacional Centro de Arte Reina Sof?a,스페인,Madrid,4.3,5500,"[(museum, [-0.00181172, -0.00478293, 0.0040500...","[(collection, [-0.00366707, 0.000573131, -0.00...","[(museum, 2.02431245209), (collection, 1.62662...","[(collection, 1.59193627516), (lucio, 1.406124..."
Temple of Debod,스페인,Madrid,4.5,5189,"[(pp, [-0.0010845, 0.00284758, -0.00461103, 0....","[(temple, [-0.00221131, 0.00443357, -0.0032344...","[(pp, 31765.1521912), (july, 15294.7842843), (...","[(temple, 1.84474327867), (construction, 1.428..."
El Escorial,스페인,Madrid,4.5,1072,"[(el, [0.00323184, -0.000361601, -0.00452492, ...","[(school, [-0.00105183, -0.00134332, -0.004839...","[(el, 3.50536361597), (escorial, 3.30009921883...","[(school, 38336.585844), (century, 0.080419256..."
Thyssen-Bornemisza Museum,스페인,Madrid,4.4,4382,"[(visitors, [0.00242139, -0.000872068, 0.00172...","[(museum, [-0.00181172, -0.00478293, 0.0040500...","[(loan, 79060.4444489), (visitors, 0.008160612...","[(thyssen, 1.49474104508), (bornemisza, 1.3627..."
Gran V?a,스페인,Madrid,0.0,0,"[(gran, [0.000868466, 0.00443478, -0.00113002,...","[(street, [0.00400025, -0.00343774, 0.00024387...","[(gran, 2.57478768056), (va, 2.53058220227), (...","[(street, 1.64910484668), (shopping, 1.4249973..."


In [632]:
wiki_w2v_tfidf_exd = wiki_w2v_tfidf_exd.reset_index().rename(columns = {"index" : "place"})
wiki_w2v_tfidf_exd.head()

Unnamed: 0,place,nature,city,star,review,f_w2v,s_w2v,f_tfidf,s_tfidf
0,Buen Retiro Park,스페인,Madrid,4.6,24568,"[(gardens, [0.00267356, 0.00339824, -0.0011210...","[(park, [0.00235135, 0.00398113, -0.00222354, ...","[(gardens, 2.74041635524), (retiro, 2.62102849...","[(park, 1.15077210271), (madrid, 1.0921542155)..."
1,Royal Palace of Madrid,스페인,Madrid,4.4,6566,"[(palace, [-0.00450117, 0.00233519, -0.0028386...","[(days, [-0.00482194, -0.00441774, 0.00288038,...","[(palace, 3.53693471549), (royal, 2.8566952040...","[(days, 38223.0232702), (ii, 0.131119025127), ..."
2,"Plaza Mayor, Madrid",스페인,Madrid,4.3,17375,"[(plaza, [-0.00319899, -0.00460728, 0.00340466...","[(plaza, [-0.00319899, -0.00460728, 0.00340466...","[(plaza, 3.08842559224), (mayor, 2.46901197894...","[(plaza, 1.71915441689), (mayor, 1.60124506815..."
3,Museo Nacional Del Prado,스페인,Madrid,4.6,13871,"[(historians, [-0.00176146, -0.00446614, 0.004...","[(museum, [-0.00181172, -0.00478293, 0.0040500...","[(historians, 26781.0014629), (clsicas, 8266.5...","[(museum, 1.81501657445), (collection, 1.69899..."
4,Puerta del Sol,스페인,Madrid,4.3,26932,"[(square, [-0.00258644, 0.000350998, 0.0008284...","[(square, [-0.00258644, 0.000350998, 0.0008284...","[(square, 2.4363239799), (city, 1.93191707588)...","[(square, 1.36724962855), (new, 0.999176127687..."


In [633]:
vector = []

for idx in range(len(wiki_w2v_tfidf_exd)) :
    V = np.asarray([float(0)]*100)
    
    w2v = wiki_w2v_tfidf_exd.iloc[idx][5]
    tfidf = dict(wiki_w2v_tfidf_exd.iloc[idx][7])
    
    for w in w2v :
        if  tfidf[w[0]] > 0 :
           # constant = wiki_w2v.iloc[idx][6].count(w[0])
            constnat = 1
            v = constant* tfidf[w[0]] * np.asarray(w[1])
            V += v
    
    vector.append(V)

In [634]:
wiki_w2v_tfidf_exd["vector"] = 0
wiki_w2v_tfidf_exd["vector"] = wiki_w2v_tfidf_exd.index.map(lambda x : making_vector(x, vector))
wiki_w2v_tfidf_exd.head()

Unnamed: 0,place,nature,city,star,review,f_w2v,s_w2v,f_tfidf,s_tfidf,vector
0,Buen Retiro Park,스페인,Madrid,4.6,24568,"[(gardens, [0.00267356, 0.00339824, -0.0011210...","[(park, [0.00235135, 0.00398113, -0.00222354, ...","[(gardens, 2.74041635524), (retiro, 2.62102849...","[(park, 1.15077210271), (madrid, 1.0921542155)...","[0.00607410329394, 0.0150155065166, -0.0194225..."
1,Royal Palace of Madrid,스페인,Madrid,4.4,6566,"[(palace, [-0.00450117, 0.00233519, -0.0028386...","[(days, [-0.00482194, -0.00441774, 0.00288038,...","[(palace, 3.53693471549), (royal, 2.8566952040...","[(days, 38223.0232702), (ii, 0.131119025127), ...","[-0.00857532175723, 0.0110720564408, 0.0218960..."
2,"Plaza Mayor, Madrid",스페인,Madrid,4.3,17375,"[(plaza, [-0.00319899, -0.00460728, 0.00340466...","[(plaza, [-0.00319899, -0.00460728, 0.00340466...","[(plaza, 3.08842559224), (mayor, 2.46901197894...","[(plaza, 1.71915441689), (mayor, 1.60124506815...","[-0.0595356550948, 0.0038067680689, 0.02655557..."
3,Museo Nacional Del Prado,스페인,Madrid,4.6,13871,"[(historians, [-0.00176146, -0.00446614, 0.004...","[(museum, [-0.00181172, -0.00478293, 0.0040500...","[(historians, 26781.0014629), (clsicas, 8266.5...","[(museum, 1.81501657445), (collection, 1.69899...","[-47.1749865339, -119.608532804, 129.087027752..."
4,Puerta del Sol,스페인,Madrid,4.3,26932,"[(square, [-0.00258644, 0.000350998, 0.0008284...","[(square, [-0.00258644, 0.000350998, 0.0008284...","[(square, 2.4363239799), (city, 1.93191707588)...","[(square, 1.36724962855), (new, 0.999176127687...","[-0.0155740795235, -0.00558547343826, 0.023721..."


In [635]:
wiki_w2v_tfidf2_exd = wiki_w2v_tfidf_exd.drop(["s_w2v", "s_tfidf"], axis = 1)
print(len(wiki_w2v_tfidf2_exd))
wiki_w2v_tfidf2_exd.head()

2040


Unnamed: 0,place,nature,city,star,review,f_w2v,f_tfidf,vector
0,Buen Retiro Park,스페인,Madrid,4.6,24568,"[(gardens, [0.00267356, 0.00339824, -0.0011210...","[(gardens, 2.74041635524), (retiro, 2.62102849...","[0.00607410329394, 0.0150155065166, -0.0194225..."
1,Royal Palace of Madrid,스페인,Madrid,4.4,6566,"[(palace, [-0.00450117, 0.00233519, -0.0028386...","[(palace, 3.53693471549), (royal, 2.8566952040...","[-0.00857532175723, 0.0110720564408, 0.0218960..."
2,"Plaza Mayor, Madrid",스페인,Madrid,4.3,17375,"[(plaza, [-0.00319899, -0.00460728, 0.00340466...","[(plaza, 3.08842559224), (mayor, 2.46901197894...","[-0.0595356550948, 0.0038067680689, 0.02655557..."
3,Museo Nacional Del Prado,스페인,Madrid,4.6,13871,"[(historians, [-0.00176146, -0.00446614, 0.004...","[(historians, 26781.0014629), (clsicas, 8266.5...","[-47.1749865339, -119.608532804, 129.087027752..."
4,Puerta del Sol,스페인,Madrid,4.3,26932,"[(square, [-0.00258644, 0.000350998, 0.0008284...","[(square, 2.4363239799), (city, 1.93191707588)...","[-0.0155740795235, -0.00558547343826, 0.023721..."


In [836]:
wiki_w2v_tfidf.f_tfidf[4]

[('square', 2.4363239798987277),
 ('city', 1.9319170758830588),
 ('post', 1.8348521485878446),
 ('madrid', 1.8179051486896216),
 ('puerta', 1.8124707274633427),
 ('side', 1.7707308986926704),
 ('spain', 1.7681972678585047),
 ('office', 1.7600494665931403),
 ('plaza', 1.6918131515310677),
 ('sol', 1.6710636649226653),
 ('corte', 1.6445873928757013),
 ('san', 1.566677739518648),
 ('la', 1.5262610639194467),
 ('commuter', 1.5003438249761327),
 ('el', 1.4647222104453506),
 ('church', 1.4623387040497129),
 ('media', 1.4331064596208674),
 ('mayor', 1.4203632691773609),
 ('system', 1.406436704882075),
 ('spains', 1.3965502505712535)]

In [636]:
vector2 = list(map(lambda x: (0, x.tolist()), vector))

vector_df = pd.DataFrame.from_items(vector2).T
vector_df = vector_df.reset_index().drop("index", axis =1)
vector_df.index.names = ["index"]
vector_df = vector_df.reset_index()

print(len(vector_df.columns))
vector_df.head(30)

101


Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,0,0.006074,0.015016,-0.019423,0.018564,-0.009759,-0.006886,-0.018433,-0.013909,-0.017161,...,0.034489,0.021697,-0.00865,-0.006833697,-0.02695,0.027968,-0.008397,-0.025122,0.019847,0.017991
1,1,-0.008575,0.011072,0.021896,0.037498,-0.007062,-0.015904,-0.032011,-0.022889,0.013547,...,-0.015113,0.036001,0.009787,0.01471277,0.012928,0.05961,0.005517,-0.011633,0.039154,0.022247
2,2,-0.059536,0.003807,0.026556,-0.021477,0.013773,0.000883,-0.025927,0.016453,0.000892,...,-0.000753,0.008342,-0.016771,-0.005196823,0.001791,0.001192,0.014492,0.004249,-0.014706,0.040941
3,3,-47.174987,-119.608533,129.087028,94.379929,14.554172,-107.823051,-32.864552,-121.54817,-20.67376,...,125.951244,5.299161,124.593445,-99.3283,18.251329,118.156372,-67.321282,-125.160456,92.10276,-16.275768
4,4,-0.015574,-0.005585,0.023721,0.016333,-0.020572,-0.008376,-0.034828,0.026511,-0.009717,...,0.014649,0.012851,-0.003937,-0.003371122,0.009427,0.048953,0.016913,0.004462,0.025587,0.007159
5,5,0.025814,-0.016574,0.0022,0.00569,0.002874,0.015884,-0.00649,0.015049,-0.037642,...,-0.003952,-0.026903,-0.033882,0.002546558,0.026544,0.029005,-0.019853,0.001688,0.010759,0.011404
6,6,-56.244276,148.679965,-177.320332,-63.96833,-213.113823,-147.442903,-58.729581,-45.715382,-165.566385,...,48.296362,-23.842668,142.742769,-50.80567,-221.850012,80.919827,165.07753,-111.064309,-73.320836,-104.466886
7,7,-0.044378,0.029091,-0.005436,0.012611,-0.001314,-0.015415,-0.058387,-0.002025,-0.012189,...,-0.047647,-0.018377,-0.013185,-0.01001434,-0.021399,0.040469,-0.016658,0.028137,0.044468,0.044538
8,8,2e-05,-7e-06,1.4e-05,1e-05,7e-06,2.5e-05,-3.1e-05,3.6e-05,2.3e-05,...,3.6e-05,-3.7e-05,-3e-06,7.246992e-07,-3.7e-05,2.9e-05,2e-06,1.2e-05,-2.4e-05,-1.8e-05
9,9,0.030244,-0.025798,-0.014483,0.021034,-0.013641,-0.000798,-0.02166,0.004259,-0.026376,...,-0.003495,-0.004922,-0.004027,0.01581086,-0.005928,0.022653,-0.02498,0.016514,-0.00143,0.015321


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,Buen Retiro Park,0.006074,0.015016,-0.019423,0.018564,-0.009759,-0.006886,-0.018433,-0.013909,-0.017161,...,0.034489,0.021697,-0.00865,-0.006834,-0.02695,0.027968,-0.008397,-0.025122,0.019847,0.017991
1,Royal Palace of Madrid,-0.008575,0.011072,0.021896,0.037498,-0.007062,-0.015904,-0.032011,-0.022889,0.013547,...,-0.015113,0.036001,0.009787,0.014713,0.012928,0.05961,0.005517,-0.011633,0.039154,0.022247
2,"Plaza Mayor, Madrid",-0.059536,0.003807,0.026556,-0.021477,0.013773,0.000883,-0.025927,0.016453,0.000892,...,-0.000753,0.008342,-0.016771,-0.005197,0.001791,0.001192,0.014492,0.004249,-0.014706,0.040941
3,Museo Nacional Del Prado,-47.174987,-119.608533,129.087028,94.379929,14.554172,-107.823051,-32.864552,-121.54817,-20.67376,...,125.951244,5.299161,124.593445,-99.328298,18.251329,118.156372,-67.321282,-125.160456,92.10276,-16.275768
4,Puerta del Sol,-0.015574,-0.005585,0.023721,0.016333,-0.020572,-0.008376,-0.034828,0.026511,-0.009717,...,0.014649,0.012851,-0.003937,-0.003371,0.009427,0.048953,0.016913,0.004462,0.025587,0.007159


In [788]:
vector_df2.to_csv("vector.csv")

In [637]:
target = ["index"]
feature = list(range(100))

In [638]:
cluster = hdbscan.HDBSCAN()

In [639]:
cluster.fit(vector_df[feature], vector_df[target])

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(cachedir=None),
    metric='euclidean', min_cluster_size=5, min_samples=None, p=None,
    prediction_data=False)

In [640]:
label = {}

for idx in list(cluster.labels_) :
    if idx in label :
        label[idx] += 1
    else :
        label[idx] = 1

label_lst = sorted(label.items(), reverse = True, key = lambda x : x[1])
label_lst

[(-1, 1552),
 (15, 325),
 (5, 27),
 (10, 16),
 (0, 16),
 (6, 14),
 (13, 10),
 (4, 9),
 (1, 9),
 (16, 8),
 (7, 8),
 (9, 8),
 (8, 8),
 (11, 7),
 (14, 7),
 (2, 6),
 (12, 5),
 (3, 5)]

In [641]:
print(cluster.labels_)

[-1 -1 -1 ..., -1 15 -1]


# 거리 확인

In [645]:
vector_df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,0,0.006074,0.015016,-0.019423,0.018564,-0.009759,-0.006886,-0.018433,-0.013909,-0.017161,...,0.034489,0.021697,-0.00865,-0.006834,-0.02695,0.027968,-0.008397,-0.025122,0.019847,0.017991
1,1,-0.008575,0.011072,0.021896,0.037498,-0.007062,-0.015904,-0.032011,-0.022889,0.013547,...,-0.015113,0.036001,0.009787,0.014713,0.012928,0.05961,0.005517,-0.011633,0.039154,0.022247
2,2,-0.059536,0.003807,0.026556,-0.021477,0.013773,0.000883,-0.025927,0.016453,0.000892,...,-0.000753,0.008342,-0.016771,-0.005197,0.001791,0.001192,0.014492,0.004249,-0.014706,0.040941
3,3,-47.174987,-119.608533,129.087028,94.379929,14.554172,-107.823051,-32.864552,-121.54817,-20.67376,...,125.951244,5.299161,124.593445,-99.328298,18.251329,118.156372,-67.321282,-125.160456,92.10276,-16.275768
4,4,-0.015574,-0.005585,0.023721,0.016333,-0.020572,-0.008376,-0.034828,0.026511,-0.009717,...,0.014649,0.012851,-0.003937,-0.003371,0.009427,0.048953,0.016913,0.004462,0.025587,0.007159


In [695]:
lst = vector_df.values.tolist()
lst2 = []

for idx in lst :
    lst2.append((idx[0], idx[1:]))

result = []

for idx in range(0, len(lst2)) :
    result.append(dist(lst2[48], lst[idx]))
    
b = sorted(result, key = lambda x : x[1])[:10]
a = list(map(lambda x : wiki_w2v_tfidf_exd.iloc[int(x[0])][0], result))[:10]
print(b)
print(" ")
print(a)

[(939.0, 0.02423675853191453), (427.0, 0.024238743947967902), (1643.0, 0.024239472220987289), (1456.0, 0.024240460420239265), (296.0, 0.024240528367574732), (1478.0, 0.024240547986623048), (53.0, 0.024242259097407987), (19.0, 0.024244634291031058), (1535.0, 0.024245507346544671), (78.0, 0.024245711713650874)]
 
['Buen Retiro Park', 'Royal Palace of Madrid', 'Plaza Mayor, Madrid', 'Museo Nacional Del Prado', 'Puerta del Sol', 'Museo Nacional Centro de Arte Reina Sof?a', 'Temple of Debod', 'El Escorial', 'Thyssen-Bornemisza Museum', 'Gran V?a']


# 거리 계산

### 유클리디안

In [813]:
distance_lst = []

for idx in vector2 :
    temp = []
    
    for idx2 in vector2 :
        d = dist(idx, idx2)
        temp.append(d[1])
        
    distance_lst.append(temp)
     
        

In [814]:
d_df = pd.DataFrame.from_records(distance_lst)
d_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039
0,0.0,0.296614,0.249949,810.833919,0.276204,0.226086,1155.540746,0.314078,0.185187,0.248037,...,0.185288,0.215921,431.076214,0.193917,0.241728,0.188179,0.210462,0.207171,0.182896,1055.599671
1,0.296614,0.0,0.313546,810.848358,0.293839,0.335474,1155.539676,0.358205,0.290938,0.338258,...,0.291126,0.308431,431.079015,0.326096,0.337636,0.302698,0.299601,0.313121,0.292046,1055.627712
2,0.249949,0.313546,0.0,810.834313,0.236274,0.265153,1155.556424,0.296477,0.198718,0.266672,...,0.197696,0.217921,431.060456,0.22144,0.231203,0.204087,0.223432,0.234354,0.200305,1055.603687
3,810.833919,810.848358,810.834313,0.0,810.808726,810.8323,1418.584387,810.838545,810.836367,810.80275,...,810.835178,810.821645,925.867556,810.835128,810.855911,810.827562,810.827672,810.84408,810.834017,1386.517299
4,0.276204,0.293839,0.236274,810.808726,0.0,0.267238,1155.529514,0.317419,0.212145,0.263799,...,0.213065,0.23068,431.089213,0.249465,0.265949,0.21559,0.239015,0.249308,0.211273,1055.615963


In [716]:
d_df.to_csv("distance.csv")

### 코사인 유사도

In [826]:
distance_lst = []

for idx in vector2 :
    temp = []
    
    for idx2 in vector2 :
        d = dist2(idx, idx2)
        temp.append(d)
        
    distance_lst.append(temp)

  


In [827]:
d_df = pd.DataFrame.from_records(distance_lst)
d_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039
0,1.0,0.287199,0.153494,0.013278,0.038184,0.185124,-0.017868,0.198381,-0.132231,0.106176,...,0.022549,0.053263,-0.034519,0.251474,0.022516,0.236556,0.102847,0.061221,0.15571,-0.008262
1,0.287199,1.0,0.223263,-0.041075,0.350809,0.004869,-0.007617,0.247589,-0.045811,0.04409,...,0.005047,0.060064,-0.031396,-0.101753,-0.041761,0.048847,0.134965,-0.039126,0.010701,-0.101559
2,0.153494,0.223263,1.0,0.010405,0.340017,-0.036402,-0.095533,0.321794,-0.016638,0.039555,...,0.113253,0.139535,0.047163,0.104456,0.181483,0.194099,0.087033,-0.105885,0.017229,-0.027895
3,0.013278,-0.041075,0.010405,1.0,0.130372,0.02418,-0.010491,-0.007287,-0.042941,0.180915,...,0.095399,0.121272,-0.019961,0.010257,-0.122408,0.088888,0.071908,-0.073517,0.081092,-0.088027
4,0.038184,0.350809,0.340017,0.130372,1.0,0.027578,0.037372,0.242411,0.005642,0.126351,...,-0.045457,0.126721,-0.091347,-0.052159,-0.003568,0.198267,0.048626,-0.137671,0.098201,-0.083981


In [792]:
d_df = d_df.fillna(0)
d_df.to_csv("distance2.csv")

In [828]:
m = 0

l = d_df.iloc[m].tolist()
ll = []
for idx, length in enumerate(l) :
    ll.append((idx, length))
    
#코사인 유사도일 경우 True
ll.sort(key = lambda x : x[1], reverse = True)
print(wiki_w2v_tfidf_exd.iloc[m][0])
print('')
for idx in wiki_w2v_tfidf_exd.iloc[m][5] :
    print(idx[0])
print("")
for idx in wiki_w2v_tfidf_exd.iloc[m][7] :
    print(idx[0])
print("")
print(ll[:10])

Buen Retiro Park

gardens
park
palace
madrid
garden
philip
ricardo
velzquez
location
building
time
palacio
king
architect

gardens
retiro
park
buen
palace
madrid
garden
pond
olivares
philip
ricardo
velzquez
location
layout
building
time
palacio
king
architect
cristal

[(0, 1.0), (28, 0.46313844006755767), (30, 0.3883465459769478), (41, 0.3146051532870574), (242, 0.3122339250338579), (20, 0.3121825125473383), (43, 0.3009648950718212), (49, 0.28900144699597463), (1, 0.28719928333457445), (143, 0.2771278405605799)]


In [831]:
n = 41

print(wiki_w2v_tfidf_exd.iloc[n][0])
print('')
for idx in wiki_w2v_tfidf_exd.iloc[n][5] :
    print(idx[0])
print("")
for idx in wiki_w2v_tfidf_exd.iloc[n][7] :
    print(idx[0])
print("")

Plaza de Espa?a

plaza
espaa
half
parque
mara
building
luisa
edge
location
centre
gardens
styles
park
beds
benches
orange

plaza
espaa
half
parque
mara
building
luisa
edge
location
centre
gardens
styles
park
pines
bowers
palms
beds
benches
ponds
orange

