# Create features from reviews

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize          
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Load restaurant reviews with hygyene-note
reviews_file = "./data/base_DFG_note"
reviews = pd.read_csv(reviews_file + ".csv")

In [3]:
reviews.head()

Unnamed: 0,Adresse,Code Postal,Commentaire,Date du commentaire,Note,Origine,Resto,Ville,Id,Date,Note_hyg,Note_resto,Note_hygiène_resto,Variance_note_resto,Variance_note_hygiène_resto
0,63 AV MOZART,75016,Un accueil hyper chaleureux! Les gérants sont ...,30/04/2016,5,TripAdvisor,macis cafe,Paris,0,20160430.0,5.0,4.0,4.8,0.666667,0.0
1,63 AV MOZART,75016,"Nous cherchions à déjeuner, seul bémol, l'heur...",21/04/2016,4,TripAdvisor,macis cafe,Paris,1,20160421.0,4.8,4.0,4.8,0.666667,0.0
2,63 AV MOZART,75016,Des plats réalisés à partir de produits frais ...,10/02/2016,3,TripAdvisor,macis cafe,Paris,2,20160210.0,5.0,4.0,4.8,0.666667,0.0
3,90 Rue des Orteaux,75020,Restaurant Sushi plutot correct dans l ensembl...,27/02/2016,3,TripAdvisor,sushi tomi,Paris,3,20160227.0,4.833333,3.0,4.866667,0.0,0.001111
4,90 Rue des Orteaux,75020,Déçue de ma dernière visite car impossible de ...,17/01/2016,3,TripAdvisor,sushi tomi,Paris,4,20160117.0,4.9,3.0,4.866667,0.0,0.001111


In [39]:
# Tokenizer + stemmer

# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = SnowballStemmer("french")
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

vocab = 'poivre loc ventre obligé malade réfrigérateur nori assaisonnées proposé pompon écrevisse gaufres lac nantua criant intoxication lendemain sept scandaleux épicée vomi pulperia sourd infectes fraiches courent horribles marnière exception indigestion malades oreille gluante guère comprenons étonnant 16 chères cru attendaient miushi tombées hawai perte cuisiniers nuits désobligeantes commissariat remarques plas poubelle gave retiendra alimentaire gluant traversent pékinois potage canapés brulé'
vocab_stem = tokenize(vocab)
hyg_dictionary = {}
idx = 0
for i in range(len(vocab_stem)):
    if vocab_stem[i] != "":
        hyg_dictionary[vocab_stem[i]] = idx
        # print(i,vocab_stem[i])
        idx += 1

vect = CountVectorizer(tokenizer=tokenize)
vect.fit(vocab_stem)
vocabulary = vect.get_feature_names()
print('Vocabulary: %s' %vocabulary)
print("Vocabulary length: ", len(vect.get_feature_names()) )

Vocabulary: ['16', 'alimentair', 'assaison', 'attend', 'brul', 'canap', 'cher', 'commissariat', 'comprenon', 'courent', 'cri', 'cru', 'cuisin', 'désoblig', 'except', 'fraich', 'gaufr', 'gav', 'glu', 'gu', 'haw', 'horribl', 'indigest', 'infect', 'intox', 'lac', 'lendemain', 'loc', 'malad', 'marn', 'miush', 'nantu', 'nor', 'nuit', 'oblig', 'oreil', 'pert', 'plas', 'poivr', 'pompon', 'potag', 'poubel', 'propos', 'pulper', 'pékinois', 'remarqu', 'retiendr', 'réfrig', 'scandal', 'sept', 'sourd', 'tomb', 'traversent', 'ventr', 'vom', 'écrev', 'épic', 'éton']
Vocabulary length:  58


In [77]:
# Empty DataFrame where features will be stored
rev_cols = ['Adresse', 'Code Postal', 'Resto', 'Ville', 'Note_resto',
       'Note_hygiène_resto', 'Variance_note_resto',
       'Variance_note_hygiène_resto']
cols = rev_cols + vocabulary + ["rev_cnt"]
features = pd.DataFrame([], columns=cols)
features.head()

Unnamed: 0,Adresse,Code Postal,Resto,Ville,Note_resto,Note_hygiène_resto,Variance_note_resto,Variance_note_hygiène_resto,16,alimentair,...,sept,sourd,tomb,traversent,ventr,vom,écrev,épic,éton,rev_cnt


In [84]:
idx = 0
for (resto, adresse), grp in reviews.groupby(["Resto", "Adresse"]):
    subset = reviews[(reviews["Resto"] == resto) & (reviews["Adresse"] == adresse)]
    reviews_cnt = subset.shape[0]
    merged_reviews = subset["Commentaire"].str.cat(sep=' ')
    
    if idx <= reviews.shape[0]:
        # Count words 
        rev_vect = vect.transform([merged_reviews])
        #print(resto, adresse)
        #print("Number of reviews: ", reviews_cnt)
        # print(subset.head())
        #print(rev_vect)
        #print('Vocabulary: %s' %vect.get_feature_names())
        #print("Length of merged review", len(merged_reviews))
        #print('Merged review:', merged_reviews)
        #print('Merged review vector:', rev_vect.toarray())
        # Original data that we will keep
        #print(subset[rev_cols])
        # Generated features
        countvec_df = pd.DataFrame(rev_vect.toarray(), columns=[vocabulary])
        merged_df = pd.DataFrame([], columns=cols)
        merged_df[rev_cols] = subset[rev_cols].head(1)
        merged_df.reset_index(drop=True, inplace=True)
        merged_df[vocabulary] = countvec_df[vocabulary].head(1)
        merged_df.reset_index(drop=True, inplace=True)
        merged_df["rev_cnt"] = reviews_cnt
        #print(merged_df.head())
        features = features.append(merged_df, ignore_index=True)
        del(merged_df)
    else:
        break
    idx += 1
features.head()

Unnamed: 0,Adresse,Code Postal,Resto,Ville,Note_resto,Note_hygiène_resto,Variance_note_resto,Variance_note_hygiène_resto,16,alimentair,...,sept,sourd,tomb,traversent,ventr,vom,écrev,épic,éton,rev_cnt
0,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,10.0
1,"128, rue du Faubourg Saint Martin",75010,0039 ristorante italiano,Paris,3.222222,4.846111,2.17284,0.020053,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
2,60 rue Albert,75013,015 gang nam,Paris,4.333333,4.844444,0.222222,0.010617,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
3,161 Avenue D'Italie,75013,1 pot,Paris,4.0,4.916667,0.666667,0.000278,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,10.0


In [86]:
features.shape

(9665, 67)

In [88]:
# Save features
features_file = reviews_file + '_feat.csv'
features.to_csv(features_file, index=False)