In [2]:
import os
import re
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Load Files

In [4]:
kn_reviews_df = pd.read_json("../data/businesses_neighbours_reviews.json",lines=True)
kn_reviews_df.head()

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,5e9664b23b612dc2f0f009d3,KJ7I6unUOtO4Af8DRGBvCw,8vFdPfwvSrRriEzuRHLQDg,luOWGdtAfsbrZ5kC_cjuYA,3,1,1,0,Nice clean atmosphere set in s newly remodeled...,2014-10-27 01:51:18
1,5e9664b33b612dc2f0f00b87,YI8yUhoTTGbHyyhqtOry7w,NMQnFpYlj1I2z8niP0Td2w,lJh661dhjD5BDz5QrBEUeQ,1,0,0,0,I have lived in this area for 20 years and hav...,2014-08-10 20:34:33
2,5e9664b33b612dc2f0f03b58,q9DUuox-jmfAr80_tAjeTQ,C1uo5b6avxRDnppB1oDlzg,luOWGdtAfsbrZ5kC_cjuYA,5,2,0,1,"Oh my gosh, the chimi w/ baja is the best ever...",2012-12-05 20:16:53
3,5e9664b53b612dc2f0f0f453,pyhQXENH7i05M2_PdAN-LA,qibGLHABNReGeJr2w4_8yQ,lJh661dhjD5BDz5QrBEUeQ,3,3,2,3,"Not the best filbertos, but not the worst. I c...",2011-10-29 22:42:23
4,5e9664b53b612dc2f0f11232,QgCNyWwqo81lOAiPiYTzDQ,tA9wWXliXSWVEW9BgyxzVQ,lJh661dhjD5BDz5QrBEUeQ,4,1,0,1,I have been craving Horchata ever since we mov...,2012-03-04 02:19:10


# Build TF-IDF for reviews

In [5]:
reviews_texts=kn_reviews_df['text']

### Clean especial characters

In [29]:
clear_text_list=[]
for x in reviews_texts:
    y=re.sub(r'[,.!-?¿¡"&$%#\n\t]','',x.lower())
    clear_text_list.append(y)
clear_text_list[:5]

['nice clean atmosphere set in s newly remodeled strip msll service on a friday night was quick and friendly maybe because i was one of  tables with guests at  pm could it be the food i had to ask myself as the parking lot was full ordered the carne asada torta and my friend ordered a green chili burro deep fried enchilada style the food arrived surptisinly fast it almost beat our water drinks i will start with my torta the carne was almost cooked to crisp leather and chopped fine to maybe disguise it the roll was tossted nicely but the rest made it the worst torta i have ever tried to consume my friend let me taste her burrito which was very good i gave up on the torta and claimed half of my friends burro the green chili and service raised my rating to  stars but the torta might explain all the empty seats on this friday night',
 'i have lived in this area for  years and have never gone to this filibertos  my intuition told me noi should have listened  but nooooi finally said  oh what

### Se crea una matrix de conteo de (textos x palabras), se ignoran las palabras que aparezcan en el 85% de los documentos(no relevantes)

In [30]:
cv=CountVectorizer(max_df=0.85, stop_words="english")
word_count_vector=cv.fit_transform(clear_text_list)

### Se convierte la matriz dispersa a dataframe

In [31]:
keywords_df = pd.DataFrame(word_count_vector.toarray())

In [32]:
keywords_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4462,4463,4464,4465,4466,4467,4468,4469,4470,4471
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
645,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
646,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
647,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
#get_feature_names retorna el nombre de las entidades, no el indice, lista todas las keywords encontradas
feature_names=cv.get_feature_names()
feature_names[:5]

['abandoned', 'abbreviation', 'able', 'abondigastortilla', 'aboutdont']

In [34]:
#Calculo de TF- IDF sobre la matriz dispersa, smooth_idf modifica la formula matematica False para no ignorar completamente los terminos que aparecen en todos los textos
#Se utiliza normalizacion coseno
#use_idf true para calcular la ponderacion inversa de frecuencia
Tfidf_transformer=TfidfTransformer(smooth_idf=False,use_idf=True)
Tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False, use_idf=True)

In [35]:
def sort_coo(coo_matrix):
    tuples=zip(coo_matrix.col,coo_matrix.data)
    return sorted(tuples,key=lambda x:(x[1],x[0]),reverse=True)

#Extra de todas las keywords las n-keywords mas relevantes(TF-IDF)
def extract_topn_from_vector(feature_names, sorted_items,topn=10):
    sorted_items=sorted_items[:topn]
    score_vals=[]
    feature_vals=[]   
    for idx,score in sorted_items:
        score_vals.append(round(score,3))
        feature_vals.append(feature_names[idx])
    results={}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    return results 

In [36]:
#Calculo de TF-IDF y encontrar top 10 de keywords para cada texto de las reviews de los vecinos
keywordsArray=[]
for text in clear_text_list:
    tf_idf_vector=Tfidf_transformer.transform(cv.transform([text]))
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    keywordsArray.append(keywords)
keywordsArray[:5]

[{'torta': 0.542,
  'burro': 0.209,
  'friday': 0.17,
  'friend': 0.167,
  'chili': 0.163,
  'maybe': 0.154,
  'night': 0.138,
  'green': 0.137,
  'tossted': 0.133,
  'surptisinly': 0.133},
 {'break': 0.265,
  'importantly': 0.207,
  'breath': 0.207,
  'hes': 0.177,
  'burro': 0.163,
  'dry': 0.146,
  'bean': 0.136,
  'carnitas': 0.134,
  'later': 0.13,
  'room': 0.129},
 {'churros': 0.29,
  'deserts': 0.279,
  'foods': 0.228,
  'flan': 0.228,
  'chimi': 0.202,
  'ice': 0.181,
  'try': 0.164,
  'fried': 0.152,
  'eating': 0.149,
  'cream': 0.142},
 {'filbertos': 0.42,
  'drinknot': 0.268,
  'blaring': 0.243,
  'blah': 0.243,
  'loud': 0.228,
  'crowd': 0.218,
  'tv': 0.21,
  'starving': 0.204,
  'weird': 0.182,
  'shredded': 0.176},
 {'filbertos': 0.459,
  'horchata': 0.289,
  'az': 0.275,
  'fries': 0.256,
  'pleaser': 0.195,
  'belly': 0.195,
  'appreciated': 0.195,
  'asada': 0.186,
  'carne': 0.181,
  'york': 0.177}]

In [38]:
reviews_keywords=[]
for text in clear_text_list:
    tf_idf_vector=Tfidf_transformer.transform(cv.transform([text]))
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    reviews_keywords.append( list( keywords.keys() ) )
reviews_keywords[:5]

[['torta',
  'burro',
  'friday',
  'friend',
  'chili',
  'maybe',
  'night',
  'green',
  'tossted',
  'surptisinly'],
 ['break',
  'importantly',
  'breath',
  'hes',
  'burro',
  'dry',
  'bean',
  'carnitas',
  'later',
  'room'],
 ['churros',
  'deserts',
  'foods',
  'flan',
  'chimi',
  'ice',
  'try',
  'fried',
  'eating',
  'cream'],
 ['filbertos',
  'drinknot',
  'blaring',
  'blah',
  'loud',
  'crowd',
  'tv',
  'starving',
  'weird',
  'shredded'],
 ['filbertos',
  'horchata',
  'az',
  'fries',
  'pleaser',
  'belly',
  'appreciated',
  'asada',
  'carne',
  'york']]

## Find reviews for a business

In [None]:
#Con esta configuración se utilizan los 3 vecinos más cercanos, con distancia euclidiana
knn_clasif=KNeighborsClassifier(3)

In [None]:
# Fit recibe la matriz de entrenamiento y la clase objetivo
knn_clasif.fit(df_temporal_usuario_train[features_usuario_679], df_temporal_usuario_train['class'])

In [None]:
# llamamos predict sobre  los test , creando una nueva columna en el dataframe de test
df_temporal_usuario_test['predict']=knn_clasif.predict(df_temporal_usuario_test[features_usuario_679])