In [46]:
import os
import re
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Load Files

In [47]:
# Load the neighbour-business reviews; lines=True reads the file as JSON Lines
# (one JSON object per line).
reviews_path = "../data/businesses_neighbours_reviews.json"
kn_reviews_df = pd.read_json(reviews_path, lines=True)
kn_reviews_df.head()

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,5e9664963b612dc2f0e488c0,-86eJuYugapSTs5dvCZKtw,qjX40bTubCboIQ6S4mu54A,5FVBaD1poRZQeqUDxbWWQA,1,3,0,0,Checked in with my family of five for two room...,2013-12-30 14:52:18
1,5e9664973b612dc2f0e4b13d,j57XFIDuzcpBrcptbrJSwA,USljZmbtpya0BEKocSLbUw,5FVBaD1poRZQeqUDxbWWQA,1,1,0,0,AT first it seemed like a good hotel. I'm not ...,2013-01-28 19:03:36
2,5e9664973b612dc2f0e4c04b,Hb_kJxRWevasbqHfD_YBQw,mEXLSpAc0A5zdW5iZEosSg,5FVBaD1poRZQeqUDxbWWQA,1,1,0,0,I have stayed here 7 times in the past few mon...,2017-03-15 22:17:31
3,5e9664983b612dc2f0e511eb,qpdvJX2M17LZrrtamrpgjA,YcdgoZzc9yuGN1pBBxevRQ,5FVBaD1poRZQeqUDxbWWQA,5,0,0,0,Check-in war sehr zügig und der Mitarbeiter ex...,2013-10-06 15:22:27
4,5e9664993b612dc2f0e5448a,ivzJHezSj4594qkg6DbS9g,cpOqABn6YiktZqMrJUB8Aw,5FVBaD1poRZQeqUDxbWWQA,4,2,0,0,"the waffle factor:\n\n i don't work, haven't f...",2012-11-08 15:29:18


# Build TF-IDF for reviews

In [48]:
# Raw review bodies, one entry per review row.
reviews_texts = kn_reviews_df.loc[:, 'text']

### Clean special characters

In [49]:
# Lower-case every review and delete each character matched by the class below.
# NOTE(review): inside the class, `!-?` is a character RANGE ('!' through '?'),
# so digits, apostrophes, colons, parentheses, etc. are removed too (the saved
# output shows "after  pm" and "didnt"). Confirm that dropping digits is intended.
clear_text_list = [
    re.sub(r'[,.!-?¿¡"&$%#\n\t]', '', review.lower())
    for review in reviews_texts
]
clear_text_list[:5]

['checked in with my family of five for two rooms a little after  pm my wife made the reservation with the desk agent directly with the hotel we had a pet and confirmed they accepted pets which they did we also confirmed that we could get two adjoining rooms confirmed this on two different calls directly with the female desk agent at the hotel when we arrived at the hotel at a little after  pm i was told by the male desk agent there were no adjoining rooms i indicated we had confirmed this twice and he told me they were full and he put us in two rooms across the hall from each other not optimal with a family with kids although unhappy we went to our rooms and unpacked once the kids were in bed we noticed that the bathroom smelled of urine and then noticed the pull out pocket door had not been cleaned for sometime assumed this was urine and the bathroom light did not work we then noticed what looked like vomit on the inside wall of the room next to the entrance door to the room since de

### Se crea una matriz de conteo (textos x palabras); se ignoran las stop words en inglés y las palabras que aparecen en más del 85% de los documentos (no relevantes)

In [50]:
# Bag-of-words term counts (documents x vocabulary terms). English stop words and
# terms whose document frequency exceeds 0.85 are left out of the vocabulary.
cv = CountVectorizer(stop_words="english", max_df=0.85)
word_count_vector = cv.fit_transform(clear_text_list)

### Se convierte la matriz dispersa a dataframe

In [51]:
# Densify the sparse count matrix so it can be inspected as a DataFrame
# (columns are the integer term indices, not the words themselves).
dense_counts = word_count_vector.toarray()
keywords_df = pd.DataFrame(dense_counts)

In [52]:
# Inspect the dense term-count frame (rows: reviews, columns: term indices).
keywords_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3458,3459,3460,3461,3462,3463,3464,3465,3466,3467
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,1,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
239,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
# Total number of occurrences of every vocabulary term across all reviews.
# `CountVectorizer.get_feature_names` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out` when the installed release has it and fall back to the
# old method otherwise (the `or` short-circuits, so the old name is only looked
# up when the new one is missing).
_get_vocabulary = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
lista_de_tuplas = list(zip(_get_vocabulary(), keywords_df.sum().to_list()))
# Most frequent terms first.
lista_de_tuplas.sort(key=lambda tup: tup[1], reverse=True)
# Same data as [term, count] lists instead of tuples.
list_of_lists = [list(elem) for elem in lista_de_tuplas]
list_of_lists[:100]

[['room', 302],
 ['hotel', 274],
 ['stay', 173],
 ['clean', 144],
 ['breakfast', 133],
 ['rooms', 116],
 ['staff', 111],
 ['nice', 107],
 ['just', 99],
 ['place', 94],
 ['good', 91],
 ['night', 89],
 ['great', 83],
 ['like', 76],
 ['area', 71],
 ['bed', 71],
 ['desk', 71],
 ['time', 71],
 ['free', 68],
 ['friendly', 67],
 ['didnt', 66],
 ['stayed', 60],
 ['day', 56],
 ['comfortable', 54],
 ['really', 50],
 ['service', 48],
 ['check', 47],
 ['did', 47],
 ['dont', 47],
 ['told', 45],
 ['checked', 44],
 ['location', 44],
 ['im', 43],
 ['inn', 43],
 ['price', 43],
 ['hotels', 42],
 ['said', 40],
 ['got', 39],
 ['went', 39],
 ['morning', 38],
 ['bathroom', 36],
 ['floor', 35],
 ['hot', 35],
 ['little', 35],
 ['hampton', 34],
 ['pool', 34],
 ['water', 34],
 ['coffee', 33],
 ['door', 33],
 ['small', 33],
 ['super', 33],
 ['called', 32],
 ['food', 32],
 ['way', 32],
 ['bad', 31],
 ['manager', 31],
 ['parking', 31],
 ['shower', 31],
 ['airport', 30],
 ['bit', 30],
 ['booked', 30],
 ['new', 30],

In [54]:
# Vocabulary terms found by the vectorizer (the words themselves, not their
# indices). `get_feature_names` was removed in scikit-learn 1.2; use
# `get_feature_names_out` when available and fall back on older releases.
# The result is materialised as a plain list so it stays indexable by position.
_get_vocabulary = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
feature_names = list(_get_vocabulary())
feature_names[:5]

['aaa', 'abandoned', 'abby', 'able', 'abrupt']

In [55]:
# Fit the TF-IDF weighting on the sparse count matrix.
# - use_idf=True: apply inverse-document-frequency weighting.
# - smooth_idf=False: changes the IDF formula (no smoothing). The original author's
#   stated intent was to avoid completely ignoring terms that appear in every text
#   -- TODO confirm against the scikit-learn IDF formula.
# - Rows are L2-normalised (the default norm='l2'), i.e. cosine normalisation.
Tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=False).fit(word_count_vector)
Tfidf_transformer

TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False, use_idf=True)

In [56]:
# TF-IDF weights for every (review, term) pair as a dense DataFrame.
tfidf_matrix = Tfidf_transformer.transform(word_count_vector)
keyword_tf_idf_df = pd.DataFrame(tfidf_matrix.toarray())
keyword_tf_idf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3458,3459,3460,3461,3462,3463,3464,3465,3466,3467
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.07565,0.0,0.0,0.0,0.07565,0.07565,0.0,0.07565,0.07565
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000


In [57]:
def sort_coo(coo_matrix):
    """Return the matrix's (column index, value) pairs, largest value first.

    `coo_matrix` only needs `.col` and `.data` attributes of equal length.
    Pairs with equal values are ordered by column index, larger index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    # Reversing each (col, value) pair gives the (value, col) sort key.
    pairs.sort(key=lambda pair: pair[::-1], reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Pick the `topn` most relevant keywords (by TF-IDF score) from a sorted list.

    Parameters
    ----------
    feature_names : sequence mapping a term index to the term itself.
    sorted_items : sequence of (term index, score) pairs, already sorted so the
        most relevant pairs come first; only the first `topn` are used.
    topn : maximum number of keywords to keep.

    Returns
    -------
    dict mapping keyword -> score rounded to 3 decimals, in the order of
    `sorted_items`.
    """
    return {
        feature_names[term_idx]: round(term_score, 3)
        for term_idx, term_score in sorted_items[:topn]
    }

In [58]:
# For every cleaned neighbour review: vectorise it, apply the fitted TF-IDF
# weighting, and keep its 10 highest-scoring keywords as a {keyword: score} dict.
keywordsArray = [
    extract_topn_from_vector(
        feature_names,
        sort_coo(Tfidf_transformer.transform(cv.transform([review])).tocoo()),
        10,
    )
    for review in clear_text_list
]
keywordsArray[:5]

[{'agent': 0.476,
  'confirmed': 0.381,
  'adjoining': 0.253,
  'rooms': 0.243,
  'problems': 0.18,
  'noticed': 0.172,
  'male': 0.152,
  'desk': 0.146,
  'told': 0.141,
  'urine': 0.136},
 {'rules': 0.49,
  'explain': 0.325,
  'started': 0.231,
  'just': 0.147,
  'clerk': 0.13,
  'extra': 0.116,
  'charge': 0.116,
  'doesnt': 0.114,
  'desk': 0.112,
  'sure': 0.103},
 {'received': 0.305,
  'verbal': 0.203,
  'topped': 0.203,
  'survey': 0.203,
  'statements': 0.203,
  'reconsider': 0.203,
  'legal': 0.203,
  'forbidding': 0.203,
  'foot': 0.203,
  'downhill': 0.203},
 {'und': 0.681,
  'war': 0.227,
  'sehr': 0.151,
  'mit': 0.151,
  'hier': 0.151,
  'frühstück': 0.151,
  'der': 0.151,
  'alles': 0.151,
  'äpfel': 0.076,
  'zügig': 0.076},
 {'cheapest': 0.3,
  'waffle': 0.21,
  'getting': 0.21,
  'hotels': 0.207,
  'breakfast': 0.177,
  'door': 0.159,
  'hotel': 0.153,
  'worksit': 0.15,
  'troubles': 0.15,
  'tier': 0.15}]

In [59]:
# Keyword names only, per review. The per-review top-10 dictionaries were already
# computed into `keywordsArray`, so reuse them instead of re-running the
# vectorizer and TF-IDF transform for every review a second time. Dict order is
# insertion order, so keywords stay sorted from highest to lowest score.
reviews_keywords = [list(top_keywords.keys()) for top_keywords in keywordsArray]
reviews_keywords[:5]

[['agent',
  'confirmed',
  'adjoining',
  'rooms',
  'problems',
  'noticed',
  'male',
  'desk',
  'told',
  'urine'],
 ['rules',
  'explain',
  'started',
  'just',
  'clerk',
  'extra',
  'charge',
  'doesnt',
  'desk',
  'sure'],
 ['received',
  'verbal',
  'topped',
  'survey',
  'statements',
  'reconsider',
  'legal',
  'forbidding',
  'foot',
  'downhill'],
 ['und',
  'war',
  'sehr',
  'mit',
  'hier',
  'frühstück',
  'der',
  'alles',
  'äpfel',
  'zügig'],
 ['cheapest',
  'waffle',
  'getting',
  'hotels',
  'breakfast',
  'door',
  'hotel',
  'worksit',
  'troubles',
  'tier']]

In [60]:
# One row per review, one column per keyword rank; reviews with fewer than
# 10 keywords are padded with missing values.
df_reviews_keywords = pd.DataFrame(data=reviews_keywords)
df_reviews_keywords

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,agent,confirmed,adjoining,rooms,problems,noticed,male,desk,told,urine
1,rules,explain,started,just,clerk,extra,charge,doesnt,desk,sure
2,received,verbal,topped,survey,statements,reconsider,legal,forbidding,foot,downhill
3,und,war,sehr,mit,hier,frühstück,der,alles,äpfel,zügig
4,cheapest,waffle,getting,hotels,breakfast,door,hotel,worksit,troubles,tier
...,...,...,...,...,...,...,...,...,...,...
237,aid,seat,modern,toilet,recently,didnt,large,minutes,time,took
238,updated,beds,really,comfortable,friendly,great,rooms,clean,stay,
239,fully,night,refund,paid,away,booked,watched,vacancies,timing,roomed
240,incredible,vegas,strongly,youre,reasonable,staff,coming,rate,looking,recommend


In [61]:
# Keyword scores only, per review. Reuse the top-10 dictionaries already stored
# in `keywordsArray` rather than recomputing TF-IDF for every review again; the
# values come out in the same (descending-score) order as the keyword names.
reviews_keywords_scores = [list(top_keywords.values()) for top_keywords in keywordsArray]
reviews_keywords_scores[:5]

[[0.476, 0.381, 0.253, 0.243, 0.18, 0.172, 0.152, 0.146, 0.141, 0.136],
 [0.49, 0.325, 0.231, 0.147, 0.13, 0.116, 0.116, 0.114, 0.112, 0.103],
 [0.305, 0.203, 0.203, 0.203, 0.203, 0.203, 0.203, 0.203, 0.203, 0.203],
 [0.681, 0.227, 0.151, 0.151, 0.151, 0.151, 0.151, 0.151, 0.076, 0.076],
 [0.3, 0.21, 0.21, 0.207, 0.177, 0.159, 0.153, 0.15, 0.15, 0.15]]

In [62]:
# Label a review as useful when it has strictly more "useful" votes than the
# average review in this dataset; the mean is stored as a constant column.
mean_useful_votes = kn_reviews_df['useful'].mean()
kn_reviews_df['useful_mean'] = mean_useful_votes
kn_reviews_df['is_useful'] = kn_reviews_df['useful'].gt(kn_reviews_df['useful_mean'])

In [63]:
# Inspect the reviews with the new `useful_mean` and `is_useful` columns.
kn_reviews_df

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date,useful_mean,is_useful
0,5e9664963b612dc2f0e488c0,-86eJuYugapSTs5dvCZKtw,qjX40bTubCboIQ6S4mu54A,5FVBaD1poRZQeqUDxbWWQA,1,3,0,0,Checked in with my family of five for two room...,2013-12-30 14:52:18,1.024793,True
1,5e9664973b612dc2f0e4b13d,j57XFIDuzcpBrcptbrJSwA,USljZmbtpya0BEKocSLbUw,5FVBaD1poRZQeqUDxbWWQA,1,1,0,0,AT first it seemed like a good hotel. I'm not ...,2013-01-28 19:03:36,1.024793,False
2,5e9664973b612dc2f0e4c04b,Hb_kJxRWevasbqHfD_YBQw,mEXLSpAc0A5zdW5iZEosSg,5FVBaD1poRZQeqUDxbWWQA,1,1,0,0,I have stayed here 7 times in the past few mon...,2017-03-15 22:17:31,1.024793,False
3,5e9664983b612dc2f0e511eb,qpdvJX2M17LZrrtamrpgjA,YcdgoZzc9yuGN1pBBxevRQ,5FVBaD1poRZQeqUDxbWWQA,5,0,0,0,Check-in war sehr zügig und der Mitarbeiter ex...,2013-10-06 15:22:27,1.024793,False
4,5e9664993b612dc2f0e5448a,ivzJHezSj4594qkg6DbS9g,cpOqABn6YiktZqMrJUB8Aw,5FVBaD1poRZQeqUDxbWWQA,4,2,0,0,"the waffle factor:\n\n i don't work, haven't f...",2012-11-08 15:29:18,1.024793,True
...,...,...,...,...,...,...,...,...,...,...,...,...
237,5e9665cd3b612dc2f0519561,mfC8GXG3j38ip_KErwe1dg,fCExFIeu2BT35FSx2Yz2aA,gjR5bIyL4ETB9CtWgNL64A,5,4,0,2,So most everyone from 'round these parts are v...,2018-02-10 05:36:37,1.024793,True
238,5e9665cd3b612dc2f051c269,LPNViYqzq4N7iwMEFvFN_g,A2P6QP1T0An81jDnB9OnzQ,gjR5bIyL4ETB9CtWgNL64A,5,0,0,1,Great stay. Clean updated rooms. Comfortable b...,2019-03-20 17:42:25,1.024793,False
239,5e9665ce3b612dc2f0520816,-J6g1v3wGtT8VDxlhoPk8w,Z5_2D0duy6vBnMVHooEwEQ,gjR5bIyL4ETB9CtWgNL64A,1,3,0,0,DO NOT BOOK HERE. At least not if you are plan...,2018-09-02 11:12:09,1.024793,True
240,5e9665ce3b612dc2f0521b0b,TcAUyiFCd_oAxNjTEKEL_A,qOlt6hHXxUijPJ8iwlo6hA,gjR5bIyL4ETB9CtWgNL64A,5,0,0,0,Staff was incredible and the room was at a ver...,2019-06-29 03:47:43,1.024793,False


In [64]:
# Distribution of the raw "useful" vote counts.
kn_reviews_df['useful'].describe()

count    242.000000
mean       1.024793
std        1.716229
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max       15.000000
Name: useful, dtype: float64

In [65]:
# Class balance of the derived boolean label (most frequent value and its count).
kn_reviews_df['is_useful'].describe()

count       242
unique        2
top       False
freq        186
Name: is_useful, dtype: object

In [66]:
# Review id and usefulness label side by side with each review's top keywords.
reviews_keywords_df = pd.concat(
    [kn_reviews_df[['review_id', 'is_useful']], pd.DataFrame(reviews_keywords)],
    axis=1,
)
reviews_keywords_df

Unnamed: 0,review_id,is_useful,0,1,2,3,4,5,6,7,8,9
0,-86eJuYugapSTs5dvCZKtw,True,agent,confirmed,adjoining,rooms,problems,noticed,male,desk,told,urine
1,j57XFIDuzcpBrcptbrJSwA,False,rules,explain,started,just,clerk,extra,charge,doesnt,desk,sure
2,Hb_kJxRWevasbqHfD_YBQw,False,received,verbal,topped,survey,statements,reconsider,legal,forbidding,foot,downhill
3,qpdvJX2M17LZrrtamrpgjA,False,und,war,sehr,mit,hier,frühstück,der,alles,äpfel,zügig
4,ivzJHezSj4594qkg6DbS9g,True,cheapest,waffle,getting,hotels,breakfast,door,hotel,worksit,troubles,tier
...,...,...,...,...,...,...,...,...,...,...,...,...
237,mfC8GXG3j38ip_KErwe1dg,True,aid,seat,modern,toilet,recently,didnt,large,minutes,time,took
238,LPNViYqzq4N7iwMEFvFN_g,False,updated,beds,really,comfortable,friendly,great,rooms,clean,stay,
239,-J6g1v3wGtT8VDxlhoPk8w,True,fully,night,refund,paid,away,booked,watched,vacancies,timing,roomed
240,TcAUyiFCd_oAxNjTEKEL_A,False,incredible,vegas,strongly,youre,reasonable,staff,coming,rate,looking,recommend


In [67]:
# Top-keyword scores per review; reviews with fewer than 10 keywords get 0 in the
# missing positions. Scores only: the `is_useful` label is not joined to this frame.
scores_frame = pd.DataFrame(reviews_keywords_scores)
reviews_keywords_scores_df = scores_frame.fillna(0)
reviews_keywords_scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.476,0.381,0.253,0.243,0.180,0.172,0.152,0.146,0.141,0.136
1,0.490,0.325,0.231,0.147,0.130,0.116,0.116,0.114,0.112,0.103
2,0.305,0.203,0.203,0.203,0.203,0.203,0.203,0.203,0.203,0.203
3,0.681,0.227,0.151,0.151,0.151,0.151,0.151,0.151,0.076,0.076
4,0.300,0.210,0.210,0.207,0.177,0.159,0.153,0.150,0.150,0.150
...,...,...,...,...,...,...,...,...,...,...
237,0.213,0.190,0.168,0.149,0.137,0.132,0.131,0.129,0.120,0.118
238,0.519,0.401,0.344,0.317,0.310,0.295,0.257,0.229,0.223,0.000
239,0.242,0.193,0.184,0.167,0.155,0.140,0.135,0.135,0.135,0.135
240,0.627,0.404,0.280,0.213,0.213,0.195,0.193,0.186,0.174,0.171


## Find reviews for a business

In [68]:
# Full TF-IDF feature matrix prefixed with the review id and the target label.
keywords_df_with_class = pd.concat(
    [kn_reviews_df[['review_id', 'is_useful']], keyword_tf_idf_df],
    axis=1,
)
keywords_df_with_class

Unnamed: 0,review_id,is_useful,0,1,2,3,4,5,6,7,...,3458,3459,3460,3461,3462,3463,3464,3465,3466,3467
0,-86eJuYugapSTs5dvCZKtw,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
1,j57XFIDuzcpBrcptbrJSwA,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
2,Hb_kJxRWevasbqHfD_YBQw,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
3,qpdvJX2M17LZrrtamrpgjA,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.07565,0.0,0.0,0.0,0.07565,0.07565,0.0,0.07565,0.07565
4,ivzJHezSj4594qkg6DbS9g,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,mfC8GXG3j38ip_KErwe1dg,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
238,LPNViYqzq4N7iwMEFvFN_g,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
239,-J6g1v3wGtT8VDxlhoPk8w,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000
240,TcAUyiFCd_oAxNjTEKEL_A,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000


In [69]:
# k-NN classifier using the 3 nearest neighbours; the remaining defaults
# (minkowski metric with p=2) amount to Euclidean distance.
knn_clasif = KNeighborsClassifier(n_neighbors=3)

In [70]:
# fit() takes the training feature matrix and the target class.
train_features = keyword_tf_idf_df
train_labels = keywords_df_with_class['is_useful']
knn_clasif.fit(train_features, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [71]:
# Run predict and store the result in a new 'predict' column.
# NOTE(review): the original comment said this runs on the test set, but the
# matrix passed here is the same `keyword_tf_idf_df` the classifier was fitted on,
# so these are in-sample predictions. `train_test_split` is imported but never
# used; a held-out split is needed for the metrics further down to be meaningful.
keywords_df_with_class['predict']=knn_clasif.predict(keyword_tf_idf_df)
keywords_df_with_class

Unnamed: 0,review_id,is_useful,0,1,2,3,4,5,6,7,...,3459,3460,3461,3462,3463,3464,3465,3466,3467,predict
0,-86eJuYugapSTs5dvCZKtw,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,True
1,j57XFIDuzcpBrcptbrJSwA,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,False
2,Hb_kJxRWevasbqHfD_YBQw,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,False
3,qpdvJX2M17LZrrtamrpgjA,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.07565,0.0,0.0,0.0,0.07565,0.07565,0.0,0.07565,0.07565,False
4,ivzJHezSj4594qkg6DbS9g,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,mfC8GXG3j38ip_KErwe1dg,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,True
238,LPNViYqzq4N7iwMEFvFN_g,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,False
239,-J6g1v3wGtT8VDxlhoPk8w,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,True
240,TcAUyiFCd_oAxNjTEKEL_A,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,False


In [72]:
# Attach each review's list of top keywords as a single list-valued column.
keywords_df_with_class['keywords'] = reviews_keywords

In [76]:
# Review ids next to their top keywords, one keyword per column.
relevant_keywords_df = pd.DataFrame(data=reviews_keywords)
pd.concat([keywords_df_with_class['review_id'], relevant_keywords_df], axis='columns')

Unnamed: 0,review_id,0,1,2,3,4,5,6,7,8,9
0,-86eJuYugapSTs5dvCZKtw,agent,confirmed,adjoining,rooms,problems,noticed,male,desk,told,urine
1,j57XFIDuzcpBrcptbrJSwA,rules,explain,started,just,clerk,extra,charge,doesnt,desk,sure
2,Hb_kJxRWevasbqHfD_YBQw,received,verbal,topped,survey,statements,reconsider,legal,forbidding,foot,downhill
3,qpdvJX2M17LZrrtamrpgjA,und,war,sehr,mit,hier,frühstück,der,alles,äpfel,zügig
4,ivzJHezSj4594qkg6DbS9g,cheapest,waffle,getting,hotels,breakfast,door,hotel,worksit,troubles,tier
...,...,...,...,...,...,...,...,...,...,...,...
237,mfC8GXG3j38ip_KErwe1dg,aid,seat,modern,toilet,recently,didnt,large,minutes,time,took
238,LPNViYqzq4N7iwMEFvFN_g,updated,beds,really,comfortable,friendly,great,rooms,clean,stay,
239,-J6g1v3wGtT8VDxlhoPk8w,fully,night,refund,paid,away,booked,watched,vacancies,timing,roomed
240,TcAUyiFCd_oAxNjTEKEL_A,incredible,vegas,strongly,youre,reasonable,staff,coming,rate,looking,recommend


In [73]:
# Inspect the feature frame with the added `predict` and `keywords` columns.
keywords_df_with_class

Unnamed: 0,review_id,is_useful,0,1,2,3,4,5,6,7,...,3460,3461,3462,3463,3464,3465,3466,3467,predict,keywords
0,-86eJuYugapSTs5dvCZKtw,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,True,"[agent, confirmed, adjoining, rooms, problems,..."
1,j57XFIDuzcpBrcptbrJSwA,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,False,"[rules, explain, started, just, clerk, extra, ..."
2,Hb_kJxRWevasbqHfD_YBQw,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,False,"[received, verbal, topped, survey, statements,..."
3,qpdvJX2M17LZrrtamrpgjA,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07565,0.07565,0.0,0.07565,0.07565,False,"[und, war, sehr, mit, hier, frühstück, der, al..."
4,ivzJHezSj4594qkg6DbS9g,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,True,"[cheapest, waffle, getting, hotels, breakfast,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,mfC8GXG3j38ip_KErwe1dg,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,True,"[aid, seat, modern, toilet, recently, didnt, l..."
238,LPNViYqzq4N7iwMEFvFN_g,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,False,"[updated, beds, really, comfortable, friendly,..."
239,-J6g1v3wGtT8VDxlhoPk8w,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,True,"[fully, night, refund, paid, away, booked, wat..."
240,TcAUyiFCd_oAxNjTEKEL_A,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.00000,0.0,0.00000,0.00000,False,"[incredible, vegas, strongly, youre, reasonabl..."


In [29]:
# Ids of the reviews the classifier predicted as useful.
predicted_useful = keywords_df_with_class['predict'].eq(True)
reviews_ids = keywords_df_with_class[predicted_useful]
reviews_ids['review_id'].tolist()

['-86eJuYugapSTs5dvCZKtw',
 'ivzJHezSj4594qkg6DbS9g',
 'YK3MYDXrn0_Vodpi3nimBA',
 '4kjyktlG2hyM4YsEqAOmXA',
 'wogUf7CqmRbJ9uIO8urDkg',
 'WwrJGDo7Vv9eSwdeFKPrmQ',
 'niGE-8FPXPJf5EWxon3E-Q',
 '0moAlxjixwZK8rjTXMQmpg',
 'IhfpgO63Irw9lxLEqYMWfg',
 'Lp5Abn8H7DCMlSWCIbbkIA',
 'B-hZP-x4VVdxg5C7vNXeWw',
 'LsNQbGrwWsK_KqU6b2Jivw',
 'qj5oCkr1UdVO587Fico4ww',
 'rij66bi-Am6b4X6fZfJAog',
 'ohegoRSpqEgygkGDHThsFA',
 'Bv802uJfg8jj8PMm82KQVA',
 'r9ScChJMSaf8o87690gLXw',
 'B02Sgz3gcGJUSBrACczI6A',
 'daQt3ldxLn5KykOiyXGnww',
 '-dYcH2NvdmrPQcJ_oK1RpQ',
 'aw0L5K_9Y51QL5PYoAqw3w',
 '5TOcLPE0QrIARWUbY4HCmQ',
 '1JrOpWAd4IJkCWuT9GxwYw',
 'm1lemSEXYnZXXpxeOkwPXw',
 'CusazvEAf-hRyGWvY_FezQ',
 'CHS98KBMYOkeX1KabqhBDA',
 'LSiG-uvfp7mzwJpI_oSCvw',
 'vmNNbQ7cMwchpVz7IJ5KwA',
 '61EU0jV072LwHKYMKrr6-A',
 'SGfzkcIqpYt5o9-9SfoP5g',
 'zWzPj4iyCegJ6PESsEAaFw',
 'l0S3FVCdu4Tl3hlKUX6bmg',
 'TrELqbKWzJVt3FhtILDeZQ',
 'tpjr844-0zAZVsX1aXyqkw',
 '0PKx5rpUmN3Hqc7yz8FsSQ',
 'arxcGFxtCzwNW3GzVg09KQ',
 '6Yjx40PpFzX4uauz4Zcvmw',
 

In [143]:
# Confusion-matrix cells with False as the negative and True as the positive class.
# NOTE(review): the saved counts printed in the next cell add up to 59 samples,
# while the frame evaluated here has one row per review (242 per describe());
# the stored outputs look stale -- re-run the notebook top to bottom to confirm.
true_labels = keywords_df_with_class['is_useful']
predicted_labels = keywords_df_with_class['predict']
tn, fp, fn, tp = confusion_matrix(true_labels, predicted_labels, labels=[False, True]).ravel()

In [28]:
# One count per line, in the order: true negatives, false positives,
# false negatives, true positives.
print(tn, fp, fn, tp, sep="\n")

37
4
3
15


In [29]:
# Precision, recall and F-score for the positive (True = useful) class;
# with average='binary' the support entry of the result is None.
precision_recall_fscore_support(
    keywords_df_with_class['is_useful'],
    keywords_df_with_class['predict'],
    average='binary',
    pos_label=True,
)

(0.7894736842105263, 0.8333333333333334, 0.8108108108108109, None)