In [1]:
import mlflow

from restaurant_reviews_allergy.utils.mlflow_ import download_data
from restaurant_reviews_allergy.utils.data import read_review_file

# MLflow settings used by the rest of the notebook, named so they are easy to
# spot and change in one place.
TRACKING_URI = '../mlruns'
EXPERIMENT_NAME = 'restaurant-reviews-allergy'

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [2]:
# Identifier and file name handed to the project's download_data helper.
# NOTE(review): the identifier is presumably an MLflow run id — confirm against
# restaurant_reviews_allergy.utils.mlflow_.download_data.
DATA_RUN_ID = '9abaaa7705734bb082e6886e3d9296b7'
DATA_FILE_NAME = 'data_with_sentiment.pkl'

data = download_data(DATA_RUN_ID, DATA_FILE_NAME)

  and should_run_async(code)


### What cities and states appear most commonly in the dataset?
City names need normalization if we are going to use these.

In [16]:
# Load the business records.
# NOTE(review): n_rows=0 presumably means "no row limit" — confirm in read_review_file.
businesses = read_review_file("business", n_rows=0)

# Count business_id entries per state, largest first, and show the top 15.
business_count_by_state = (
    businesses[['state', 'business_id']]
    .groupby(['state'], as_index=False)
    .count()
    .sort_values(['business_id'], ascending=False)
)
business_count_by_state.head(15)

  and should_run_async(code)


Unnamed: 0,state,business_id
4,AZ,56686
19,NV,36312
22,ON,33412
15,NC,14720
21,OH,14697
23,PA,11216
24,QC,9219
0,AB,8012
32,WI,5154
14,IL,1932


In [23]:
# Count businesses per Arizona city. Sorting city names in descending order
# puts lowercase spellings ahead of capitalised ones, which makes the
# inconsistent city-name formatting easy to see.
is_arizona = businesses['state'] == 'AZ'

(
    businesses.loc[is_arizona, ['state', 'city', 'business_id']]
    .groupby(['state', 'city'], as_index=False)
    .count()
    .sort_values(['state', 'city'], ascending=False)
    .head(20)
)

  and should_run_async(code)


Unnamed: 0,state,city,business_id
133,AZ,​Chandler,1
132,AZ,tempe,1
131,AZ,surprise,1
130,AZ,scottsdale,5
129,AZ,phoenix,9
128,AZ,peoria,1
127,AZ,mesa,3
126,AZ,glendale,2
125,AZ,cave creek,1
124,AZ,Youngtown,65


### Are longitude and latitude generally populated? Could we use these for metro areas?
Looks like they are always populated

In [29]:
# Print the number of missing values in each coordinate column.
# Uses pandas' vectorised .isna().sum() rather than the built-in sum(), which
# would iterate the Series one element at a time in Python.
for coordinate_col in ['latitude', 'longitude']:
    print(int(businesses[coordinate_col].isna().sum()))

0
0


### Does it look like we should require specific allergy keywords or just a mention of an allergen?
Looks like mentions of gluten are fine on their own, but most allergens need an allergy flag.

In [28]:
# Indicator columns for allergy-related keywords and for individual allergens.
# NOTE(review): assumes these columns are numeric/boolean indicators — confirm upstream.
allergy_cols = [
    'is_allergy',
    'is_celiac',
    'is_intolerant',
    'is_intolerance',
]
allergen_cols = [
    'is_dairy',
    'is_egg',
    'is_gluten',
    'is_soy',
    'is_peanut',
    'is__nut',
    'is_shellfish',
    'is_wheat',
    'is_seafood',
]

# A row is flagged when the row-wise sum of the allergy keyword columns is positive.
has_allergy_keyword = data[allergy_cols].sum(axis=1) > 0
data['allergy_flag'] = has_allergy_keyword

# Row count and distinct review count for flagged vs. unflagged rows.
(
    data[['review_id', 'allergy_flag']]
    .groupby('allergy_flag')
    .agg({'review_id': ['count', 'nunique']})
)

Unnamed: 0_level_0,review_id,review_id
Unnamed: 0_level_1,count,nunique
allergy_flag,Unnamed: 1_level_2,Unnamed: 2_level_2
False,625137,463396
True,12332,10822


In [55]:
# For every allergen column, compute what share of its mentions fall in rows
# with and without the allergy flag, then show shares and raw counts side by side.

# Long format: one row per (allergy_flag, allergen) with the summed mentions.
allergy_flag_by_allergen = (
    data[['allergy_flag'] + allergen_cols]
    .groupby('allergy_flag')
    .sum()
    .reset_index()
    .melt(id_vars=['allergy_flag'], var_name='allergen', value_name='count')
)

# Total mentions per allergen across all rows (the denominator for the share).
allergen_count = (
    data[allergen_cols]
    .sum(axis=0)
    .reset_index()
    .rename(columns={'index': 'allergen', 0: 'total_count'})
)

p_allergy_flag = allergy_flag_by_allergen.merge(
    allergen_count,
    on=['allergen'],
    how='left',
)

# Define the frame that is pivoted below inside this cell. Previously the name
# p_allergy_flag_by_allergen was only used, never assigned, so the cell raised
# NameError on a fresh kernel. .assign returns a new frame, leaving
# p_allergy_flag untouched and the cell safe to re-run.
p_allergy_flag_by_allergen = p_allergy_flag.assign(
    p=lambda merged: merged['count'] / merged['total_count']
)

p_allergy_flag_by_allergen.pivot(
    index='allergy_flag',
    columns='allergen',
    values=['p', 'count'],
)

  and should_run_async(code)


Unnamed: 0_level_0,p,p,p,p,p,p,p,p,p,count,count,count,count,count,count,count,count,count
allergen,is__nut,is_dairy,is_egg,is_gluten,is_peanut,is_seafood,is_shellfish,is_soy,is_wheat,is__nut,is_dairy,is_egg,is_gluten,is_peanut,is_seafood,is_shellfish,is_soy,is_wheat
allergy_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
False,0.98583,0.934818,0.999193,0.91992,0.986678,0.998383,0.886081,0.990709,0.983274,42090.0,12133.0,376581.0,35657.0,35328.0,92628.0,2139.0,29962.0,14050.0
True,0.01417,0.065182,0.000807,0.08008,0.013322,0.001617,0.113919,0.009291,0.016726,605.0,846.0,304.0,3104.0,477.0,150.0,275.0,281.0,239.0


In [59]:
# Print a handful of example sentences from rows without and with the allergy
# flag, to eyeball whether the flag separates real allergy mentions.
# allergy_flag is boolean, so it is used directly as a mask rather than being
# compared against True/False.
sentences_no_allergy_flag = data.loc[~data['allergy_flag'], 'sentences']
sentences_with_allergy_flag = data.loc[data['allergy_flag'], 'sentences']

# Number of sentences shown per group. The slices used to hard-code 10 and
# ignore this value; it now drives both groups.
n = 10

for header, sentences in [
    ('\n\n---- No allergy flag ----', sentences_no_allergy_flag),
    ('\n\n---- With allergy flag ----', sentences_with_allergy_flag),
]:
    print(header)
    # .head(n) takes the first n rows by position.
    for sentence in sentences.head(n):
        print(sentence)



---- No allergy flag ----
powdered scrambled eggs, not cooked throughly.
i have trouble deciding between the chicken parmesan and eggplant parmesan.
not a lot of places do eggplant right, so when you do find it house made done correctly it's a must.
neither sounded appealing since i had my heart set on one of their standard dishes, chicken jolene, this is a huge chicken breast wrapped with eggplant lightly sauteed then finished in the oven.
the eggplant appetizer and calamari diavolo were delicious.
our apps of the breaded eggplant and the artichokes (served cold) were a great start.
unfortunately i had veal piccata and specifically asked about the sauce as i am gluten sensitive - she told me lemon juice, white wine and capers.
they had no more gluten free pasta, and the waitress was clueless as to what was gluten free.
she brought out my meal..with chicken on top of a plate of spaghetti and told me it was gluten free.
i told her it was not gluten free, and she took it to the kitchen

  and should_run_async(code)
