In [34]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import f1_score, make_scorer, accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

import pickle

import warnings
warnings.filterwarnings('ignore')

In [35]:
# Load the Darden reviews from the cleaned-data folder
path_data = '../data/clean'
ruta_reviews = f'{path_data}/reviews_darden.parquet'
reviews = pd.read_parquet(ruta_reviews)

In [36]:
# Load the pickled object that the cells below use as the classifier `grid_svm`.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so this file must come from a trusted source.
with open('modelo_ML_negativas2.pkl', 'rb') as archivo:
    grid_svm = pickle.load(archivo)

In [37]:
def predecir_etiqueta(review, grid_svm):
    """Return the model prediction for a single review.

    Parameters
    ----------
    review
        The review to classify; it is wrapped in a one-element list
        before being handed to the model.
    grid_svm
        Object exposing a ``predict`` method that accepts a list.

    Returns
    -------
    The first element of whatever ``grid_svm.predict`` returns for the
    one-element batch.
    """
    lote = [review]
    predicciones = grid_svm.predict(lote)
    # Only one review was sent, so only the first prediction is relevant
    return predicciones[0]

def etiquetado_reviews(df_reviews, grid_svm):
  """Label the negative reviews (3 stars or fewer) with the model predictions.

  Parameters
  ----------
  df_reviews : pandas.DataFrame
      Must contain the columns ``text``, ``name`` and ``stars``. It is not
      modified.
  grid_svm
      Object exposing ``predict``; it is called once with the list of all
      negative review texts.
      NOTE(review): the column names below assume the model returns one
      row of eight flags per review, in this exact order -- confirm against
      the label order used at training time.

  Returns
  -------
  pandas.DataFrame
      A new frame with ``text``, ``name``, ``stars``, a ``problem`` column
      holding the raw prediction of each review, and one column per
      predicted position (positions 0-7 are renamed to the problem names).
      When there are no negative reviews the frame is empty but still has
      all of these columns.
  """
  nombres_etiquetas = {0: 'Bad_Food', 1: 'Cost', 2: 'Filthy', 3: 'Missing_Food',
                       4: 'Order_Problem', 5: 'Rude_Service',
                       6: 'Bad_Neighborhood', 7: 'Slow_Service'}

  # .copy() gives an independent frame, so adding columns below neither
  # touches df_reviews nor triggers a chained-assignment warning.
  etiquetadas = df_reviews.loc[df_reviews.stars <= 3, ['text', 'name', 'stars']].copy()

  if etiquetadas.empty:
    # Nothing to predict: skip the model call and keep a consistent schema.
    return etiquetadas.reindex(
        columns=['text', 'name', 'stars', 'problem', *nombres_etiquetas.values()])

  # One batch call instead of one model call per row.
  crudas = np.asarray(grid_svm.predict(etiquetadas['text'].tolist()))
  etiquetadas['problem'] = pd.Series(list(crudas), index=etiquetadas.index, dtype=object)

  # A 1-D result (one value per review) is treated as a single column.
  matriz = crudas.reshape(-1, 1) if crudas.ndim == 1 else crudas
  indicadores = pd.DataFrame(matriz, index=etiquetadas.index).rename(columns=nombres_etiquetas)

  return pd.concat([etiquetadas, indicadores], axis=1)


def lista_palabras_problematica(df_reviews_negativas_etiquetadas, restaurante, problematica, stop_words, n_words):
  """Get the most frequent words of one restaurant's reviews for one problem.

  Parameters
  ----------
  df_reviews_negativas_etiquetadas : pandas.DataFrame
      Labelled reviews; must contain ``text``, ``name`` and the column
      named by ``problematica``.
  restaurante
      Value of the ``name`` column to select.
  problematica : str
      Name of the problem column; rows where it equals 1 are selected.
  stop_words : iterable of str
      Extra stop words added to wordcloud's built-in ``STOPWORDS``.
  n_words : int
      Passed to ``WordCloud`` as ``max_words``.

  Returns
  -------
  list
      ``[words, frequencies]``: two parallel lists taken from the keys and
      values of ``WordCloud.words_``. Both are empty when the selection
      contains no text.
  """
  # Join the text of every selected review into one string
  seleccion = (
      (df_reviews_negativas_etiquetadas[problematica] == 1)
      & (df_reviews_negativas_etiquetadas.name == restaurante)
  )
  text = ' '.join(df_reviews_negativas_etiquetadas.loc[seleccion, 'text'])

  # WordCloud.generate raises ValueError when it receives no words at all,
  # so an empty selection is answered directly with empty lists.
  if not text.strip():
    return [[], []]

  # Custom stop words on top of the wordcloud defaults
  sw = set(STOPWORDS)
  sw.update(stop_words)

  # Build the word cloud with the combined stop words
  wc = WordCloud(
      width=800,
      height=400,
      background_color='white',
      stopwords=sw,
      max_words=n_words,
  ).generate(text)

  # words_ maps each word to its frequency; split it into two parallel lists
  word_frequencies = wc.words_
  return [list(word_frequencies.keys()), list(word_frequencies.values())]

def lista_palabras_problematicas(df_reviews, grid_svm):
  """Collect the most frequent words per restaurant and per problem.

  The reviews are first labelled with ``etiquetado_reviews``; then, for
  every restaurant name found in the labelled frame and for each of the
  problems ``Bad_Food``, ``Rude_Service`` and ``Slow_Service``, the words
  and frequencies returned by ``lista_palabras_problematica`` are appended
  to the result.

  Parameters
  ----------
  df_reviews : pandas.DataFrame
      Reviews, passed unchanged to ``etiquetado_reviews``.
  grid_svm
      Model, passed unchanged to ``etiquetado_reviews``.

  Returns
  -------
  dict
      Keys ``'Cadena'``, ``'Etiqueta'``, ``'Palabra'`` and ``'Frecuencia'``,
      each holding a list. The four lists always have the same length, with
      one entry per (restaurant, problem, word).
  """
  # Label the negative reviews
  df_reviews_etiquetadas = etiquetado_reviews(df_reviews, grid_svm)
  restaurantes = df_reviews_etiquetadas.name.unique()

  stop_words_bad_food = (['good','ordered','one','place','service',
  'came','restaurant','got','time','really','great','go',
  'nice', 'better', 'bad', 'table', 'side', 'u', 'server',
  'will', 'order', 'location', 'went', 'eat', 'well',
  'back', 'asked', 'manager', 'first', 'way', 'never',
  'come', 'say', 'us', 'even', 'thing', 'know', 'going',
  'served', 'waitress', 'ok', 'made', 'said', 'think',
  'waiter', 'took', 'hard', 'new', 'horrible', 'long',
  'take', 'make', 'visit', 'still', 'left', 'husband',
  'last', 'want', 'something', 'pretty', 'another', 'try',
  'brought', 'nothing', 'okay', 'best', 'though', 'friend',
  'wife', 'ate', 'always', 'around', 'people', 'feel',
  'staff', 'minute', 'hour', 'wait','two', 'used','give',
  'gone','sides','almost','seated','next','decided',
  'liked','overall', 'maybe','star','definitely',
  'au', 'sent', 'correctly', 'view', 'everything',
  'eating', 'sure', 'area'])

  # A comma was missing between 'steaks' and 'appetizer', which silently
  # merged them into the single string 'steaksappetizer'.
  stop_words_rude_service = (['good','one','place',
  'restaurant','got','really','great','go',
  'better', 'table', 'side', 'u',
  'will', 'location', 'went', 'eat', 'well',
  'back', 'first', 'way', 'never',
  'us', 'even', 'thing', 'know', 'going',
  'ok', 'made', 'think', 'took', 'hard', 'horrible',
  'take', 'make', 'visit', 'still', 'left', 'husband',
  'last', 'want', 'something', 'pretty', 'another', 'try',
  'nothing', 'okay', 'best', 'though', 'friend',
  'wife', 'ate', 'always', 'around', 'people', 'maybe',
  'chicken','cheddar', 'steak','salad' ,'beer', 'meal',
  'steak','pasta','soup', 'breadstick', 'wine',
  'cheese', 'burger', 'breadstick', 'steaks',
  'appetizer', 'minute','waiting', 'min','tow'
  ,'next','minutes','love','hour', 'appetizer',
  'onion rings', 'lemon'])

  # A comma was missing between 'try' and 'nothing', which silently merged
  # them into the single string 'trynothing'.
  stop_words_slow_service = (['good','one','place',
  'restaurant','got','really','great',
  'nice', 'better', 'bad', 'u',
  'will', 'location', 'eat', 'well',
  'first', 'way', 'never',
  'say', 'us', 'even', 'thing', 'know', 'going',
  'ok', 'said', 'think',
  'hard', 'horrible', 'maybe',
  'still', 'left', 'husband',
  'last', 'want', 'something', 'pretty', 'another', 'try',
  'nothing', 'okay', 'best', 'though', 'friend',
  'wife', 'ate', 'always', 'around', 'people', 'feel'])

  # problem -> [custom stop words, maximum number of words to keep]
  problema_stopwords = {
    'Bad_Food':[stop_words_bad_food, 30],
    'Rude_Service': [stop_words_rude_service, 30],
    'Slow_Service': [stop_words_slow_service, 30]
  }

  lista_restaurantes = []
  lista_problematica = []
  lista_palabras = []
  lista_frecuencias = []
  for restaurante in restaurantes:
    for problematica, (stop_words, n_words) in problema_stopwords.items():
      # Skip combinations without any review: there is no text to build a
      # word cloud from.
      hay_reviews = (
          (df_reviews_etiquetadas[problematica] == 1)
          & (df_reviews_etiquetadas['name'] == restaurante)
      ).any()
      if not hay_reviews:
        continue

      # Single call per combination; it returns [words, frequencies]
      palabras, frecuencias = lista_palabras_problematica(
          df_reviews_etiquetadas, restaurante, problematica, stop_words, n_words)

      # Repeat the labels as many times as words were actually returned
      # (may be fewer than n_words) so the four lists stay aligned.
      lista_restaurantes += [restaurante] * len(palabras)
      lista_problematica += [problematica] * len(palabras)
      lista_palabras += palabras
      lista_frecuencias += frecuencias

  dict_problematicas = {
        'Cadena': lista_restaurantes,
        'Etiqueta': lista_problematica,
        'Palabra': lista_palabras,
        'Frecuencia': lista_frecuencias
    }
  return dict_problematicas

In [38]:
# Build the per-restaurant, per-problem word/frequency lists from the loaded reviews
palabras_problematicas = lista_palabras_problematicas(reviews, grid_svm)

In [39]:
# Inspect the keys of the returned dictionary
palabras_problematicas.keys()

dict_keys(['Cadena', 'Etiqueta', 'Palabra', 'Frecuencia'])

In [40]:
# One column per dictionary key, in the dictionary's own key order
df = pd.DataFrame(palabras_problematicas)

In [41]:
# Convert the float frequencies into integers scaled by 1000 (truncating).
# The values are taken from the source dictionary rather than from
# df['Frecuencia'], so re-running this cell does not scale the column again.
df['Frecuencia'] = [int(frecuencia * 1000) for frecuencia in palabras_problematicas['Frecuencia']]

In [42]:
# Show 10 random rows as a quick check (no random_state is given, so the rows differ between runs)
df.sample(10)

Unnamed: 0,Cadena,Etiqueta,Palabra,Frecuencia
334,Olive Garden Italian Restaurant,Slow_Service,order,655
392,Ruth's Chris Steak House,Rude_Service,food,788
126,Yard House,Rude_Service,bartender,376
658,Seasons 52,Bad_Food,lot,163
459,Eddie V's Prime Seafood,Bad_Food,cooked,315
193,LongHorn Steakhouse,Bad_Food,appetizer,188
583,Bahama Breeze,Rude_Service,bad,358
687,Seasons 52,Rude_Service,wanted,183
689,Seasons 52,Rude_Service,star,166
651,Seasons 52,Bad_Food,sauce,180


In [43]:
# Save the table as a parquet file; the path is relative to the current working directory
df.to_parquet('ml_labels_palabras.parquet')