In [74]:
# Inject a CSS rule so the notebook's `.container` element is rendered at 95% width.
# NOTE(review): `float:center` does not look like a valid value for the CSS
# `float` property -- confirm whether it has any effect.
from IPython.core.display import HTML
HTML("<style>.container { width:95% !important; float:center}</style>")

In [75]:
from __future__ import division, print_function

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import textblob as tb
from textblob import TextBlob
from wordcloud import WordCloud


from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer



%matplotlib inline

# Global seaborn look for every figure drawn after this cell.
sns.set_context('notebook', font_scale=1.5)
sns.set_style('ticks')

# Extra tokens appended to NLTK's English stop-word list: punctuation marks and
# contraction fragments (both straight and curly apostrophe spellings).
punctuation = ['.',',',':','!',';','-','?','"',"'",'(',')','—']   
other = ['ive','ve', "i've", "i'v", 'i’ll', 'i’ve', 'i’v']  # 'deb','hideb','don','didn','twaittry','doesn','thank','heydeb',
# NOTE(review): `mystops` is not referenced in the visible cells -- confirm it is still needed.
mystops = stopwords.words('english') + punctuation + other


In [76]:
# Load the comment tables from CSV files in the working directory; in each file
# the first column is used as the DataFrame index.
comments_classified = pd.read_csv('comments_classified_SK_filtered2000_additional.csv',index_col=0)

# "_100" tables (presumably the 100-recipe subset -- TODO confirm against the scraper).
comments_with_sentences_100 = pd.read_csv('comments_with_sentences_100.csv',index_col=0)
comments_only_100 = pd.read_csv('comments_only_100.csv',index_col=0)

# "_all" tables; not referenced by the visible cells below.
comments_with_sentences_all = pd.read_csv('comments_with_sentences_all.csv',index_col=0)
comments_only_all = pd.read_csv('comments_only_all.csv',index_col=0)

In [77]:
# Load the recipe table, using the first CSV column as the index.
# NOTE(review): absolute, machine-specific path -- unlike the relative paths used
# for the comment tables, this only resolves where that directory exists.
recipes = pd.read_csv('/Users/kateliea/Documents/Insight/project/webscrapers/recipes_smittenkitchen_100.csv', index_col=0)

In [78]:
# Expose the DataFrame index as an explicit `recipe_ID` column; the ranking
# cell further down reads it when building each recipe's record.
recipes['recipe_ID'] = recipes.index

# recipes['recipe_ID'] = recipes['recipe_ID'].apply(lambda x: 'R'+str(x))

In [80]:
comments_classified.columns

Index(['category', 'sentence', 'commentID', 'child_id', 'children',
       'comment_time', 'recipenumber', 'title', 'url', 'usercomment',
       'username', 'usersite', 'usercomment_lower', 'tokens', 'tokens_stemmed',
       'sentence_bigrams', 'sentence_trigrams', 'sentence_tokens',
       'sentence_tokens_stemmed', 'category_label'],
      dtype='object')

In [81]:
def comment_sentiment(comment): 
    """Return the TextBlob polarity score of ``comment``.

    Parameters
    ----------
    comment : str
        Text to score. Missing values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    float
        TextBlob's ``sentiment.polarity`` for the text, or ``NaN`` when
        ``comment`` is missing.
    """
    # Empty CSV cells come back from pandas as NaN, which TextBlob rejects.
    # Propagate the missing value so one blank row does not abort the whole
    # ``.apply`` pass; NaN is skipped by the later ``.mean()`` aggregations.
    if pd.isnull(comment):
        return float('nan')
    comment = tb.TextBlob(comment)
    return comment.sentiment.polarity

In [82]:
# Score each row twice with `comment_sentiment`: once on the full comment text
# and once on the individual sentence, storing each result in its own column.
for _score_column, _text_column in (('sentiment_polarity_comment', 'usercomment'),
                                    ('sentiment_polarity_sentence', 'sentence')):
    comments_with_sentences_100[_score_column] = (
        comments_with_sentences_100[_text_column].apply(comment_sentiment)
    )

exploratory analysis of TextBlob

TextBlob returns `Sentiment(polarity, subjectivity)`. The polarity score is a float within the range [-1.0, 1.0]. The subjectivity score is a float within the range [0.0, 1.0], where 0.0 is very objective and 1.0 is very subjective.

In [106]:
# Wrap a single raw comment (index 130 of `comments_classified`) in a TextBlob
# so its tags, noun phrases and sentiment can be inspected in the next cells.
txt = tb.TextBlob(comments_classified.usercomment[130])
txt

TextBlob("I am so making this next week-end. I agree with you about lemon and here’s a suggestion that I am going to do when I bake this pie: move the lemon flavor to the crust. Just zest the lemon in the dough while pulsing the butter and flour. i think it should give that lemony edge without intercepting anything in the filling!")

In [None]:
txt.tags, txt.noun_phrases

In [85]:
txt.sentiment

Sentiment(polarity=0.17348484848484846, subjectivity=0.5348484848484848)

In [86]:
# Mean polarity over the positively scored sentences of one recipe.
brownie_positive = comments_with_sentences_100.loc[
    (comments_with_sentences_100.sentiment_polarity_sentence > 0)
    & (comments_with_sentences_100.title == 'best cocoa brownies'),
    'sentiment_polarity_sentence'
]
brownie_positive.mean()

0.4595538414182737

calculate positive/negative mean sentiment for recipe

In [87]:
title = recipes.title[2]

# Sentence-level polarity scores of every row belonging to the chosen recipe.
title_polarity = comments_with_sentences_100.loc[
    comments_with_sentences_100.title == title, 'sentiment_polarity_sentence']

# Mean of the strictly positive / strictly negative scores, as percentages.
positive = 100 * title_polarity[title_polarity > 0].mean()
negative = 100 * title_polarity[title_polarity < 0].mean()


In [88]:
# Comment count recorded for the brownie recipe (first matching row).
recipes.loc[recipes.title == 'best cocoa brownies', 'numberofcomments'].values[0]

993

In [89]:
# Spread between the positive and negative means, scaled by the number of
# comments recorded for the selected recipe.
title_comment_count = recipes.loc[recipes.title == title, 'numberofcomments']
(positive - negative) * title_comment_count

2    3506.751321
Name: numberofcomments, dtype: float64

In [90]:
def happiness_graph(recipe, output_dir='/Users/kateliea/Documents/Insight/project/images/'):
    """Draw and save the one-row "recipe happiness" bar chart for a recipe.

    Parameters
    ----------
    recipe : mapping
        Must provide ``'recipe_title'``, ``'mean_positivity_sentences'`` and
        ``'mean_negativity_sentences'``. The two means are multiplied by 100
        before plotting.
    output_dir : str, optional
        Directory the image is written to. Defaults to the location that was
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    None
        The figure is written to ``<output_dir>/<recipe_title>.jpg`` and closed.
    """
    lw = 0  # draw the bars without an outline
    fig, ax = plt.subplots()

    name = recipe['recipe_title']
    positive = recipe['mean_positivity_sentences']
    negative = recipe['mean_negativity_sentences']

    with sns.axes_style({'font.family': [u'sans-serif']}):
        sns.set_context('notebook', font_scale=1.5)
        # Both bars share row 0: the positive mean extends right of zero, the
        # negative mean left of zero.
        ax.barh(0, 100*positive, color='orange', linewidth=lw)
        ax.barh(0, 100*negative, color='blue', linewidth=lw)

        ax.set_xlim((-60,60))
        ax.set_yticks(ticks=[])
        ax.set_title('recipe happiness (%)')
        sns.despine(offset=20, trim=True, left=True)

    # `os.path.join(output_dir, '')` only guarantees a trailing separator; the
    # file name is then appended verbatim so it matches the `plot_name` values
    # (title + '.jpg') stored in the ranking table.
    fig.savefig(os.path.join(output_dir, '') + name + '.jpg')
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)

In [91]:
recipes.columns

Index(['article_id', 'author', 'author_url', 'directions', 'firstimageURL',
       'ingredients', 'introductory_text', 'numberofcomments', 'published',
       'recipe_notes', 'servings', 'time', 'title', 'updated', 'url',
       'recipe_ID'],
      dtype='object')

create a data table with the recipe sentiment, ranked by happiness score 

include name for happiness plot

In [92]:
# Build one sentiment summary record per recipe title, collect the records in
# `recipe_ranks` (keyed by position), then persist them as a table.
recipe_ranks = {}

# Loop-invariant: the average comment count over all recipes, used to weight
# each recipe's score by its relative popularity.
mean_numberofcomments = recipes.numberofcomments.mean()

for i, recipe_title in enumerate(comments_with_sentences_100.title.unique()):
    # Filter each frame once per title instead of once per statistic.
    recipe_rows = recipes[recipes.title == recipe_title]
    title_rows = comments_with_sentences_100[comments_with_sentences_100.title == recipe_title]
    sentence_polarity = title_rows.sentiment_polarity_sentence
    comment_polarity = title_rows.sentiment_polarity_comment

    # `.values[0]` takes the first matching recipe row; it raises IndexError
    # if a comment title has no counterpart in `recipes`.
    numberofcomments = recipe_rows.numberofcomments.values[0]
    popularity_weight = numberofcomments / mean_numberofcomments

    recipe_sentiment = {}

    recipe_sentiment['recipe_title'] = recipe_title
    recipe_sentiment['recipe_ID'] = recipe_rows.recipe_ID.values[0]
    recipe_sentiment['numberofcomments'] = numberofcomments

    # Means over strictly positive / strictly negative scores only; rows with
    # a polarity of exactly 0 are excluded from both.
    # NOTE(review): if a title has no scores on one side of zero, that mean is
    # NaN and the corresponding rank below becomes NaN as well.
    recipe_sentiment['mean_positivity_sentences'] = sentence_polarity[sentence_polarity > 0].mean()
    recipe_sentiment['mean_positivity_comment'] = comment_polarity[comment_polarity > 0].mean()
    recipe_sentiment['mean_negativity_sentences'] = sentence_polarity[sentence_polarity < 0].mean()
    recipe_sentiment['mean_negativity_comment'] = comment_polarity[comment_polarity < 0].mean()

    # Net sentiment (positive mean plus the negative-valued negative mean),
    # scaled by the recipe's comment count relative to the average recipe.
    recipe_sentiment['sentence_rank'] = (
        recipe_sentiment['mean_positivity_sentences']
        + recipe_sentiment['mean_negativity_sentences']) * popularity_weight
    recipe_sentiment['comment_rank'] = (
        recipe_sentiment['mean_positivity_comment']
        + recipe_sentiment['mean_negativity_comment']) * popularity_weight

    # File name under which `happiness_graph` saves this recipe's chart.
    recipe_sentiment['plot_name'] = recipe_title + '.jpg'

    recipe_ranks[i] = recipe_sentiment

ranks = pd.DataFrame.from_dict(recipe_ranks,orient='index')

ranks.to_csv('Recipe_sentiment_ranks.csv')

In [93]:
# Render and save one happiness chart per collected recipe record.
for recipe, recipe_record in recipe_ranks.items():
    happiness_graph(recipe_record)

In [94]:
ranks

Unnamed: 0,mean_negativity_comment,recipe_ID,plot_name,recipe_title,sentence_rank,numberofcomments,mean_positivity_comment,mean_negativity_sentences,comment_rank,mean_positivity_sentences
0,-0.241741,0,sweet potato buttermilk pie.jpg,sweet potato buttermilk pie,0.174760,259,0.320051,-0.227809,0.073021,0.415227
1,-0.191508,1,best cocoa brownies.jpg,best cocoa brownies,0.848226,993,0.341266,-0.222290,0.535390,0.459554
2,-0.062500,2,sav’h.jpg,sav’h,0.016778,47,0.255622,-0.323480,0.032678,0.422637
3,-0.160319,3,classic cobb salad.jpg,classic cobb salad,0.113329,217,0.330228,-0.298049,0.132741,0.443110
4,-0.137902,4,cumin seed roasted cauliflower with yogurt.jpg,cumin seed roasted cauliflower with yogurt,0.182365,222,0.368366,-0.248137,0.184199,0.476307
5,-0.163575,5,chocolate pavlova.jpg,chocolate pavlova,0.070291,112,0.262694,-0.217418,0.039967,0.391740
6,-0.206981,6,homemade merguez with herby yogurt.jpg,homemade merguez with herby yogurt,0.028154,44,0.367544,-0.300111,0.025435,0.477839
7,-0.145611,7,fudgy chocolate sheet cake.jpg,fudgy chocolate sheet cake,0.240947,327,0.314451,-0.242268,0.198771,0.446933
8,-0.087379,8,israeli salad + pita chips.jpg,israeli salad + pita chips,0.060714,77,0.330390,-0.207342,0.067367,0.426355
9,-0.140018,9,failproof crepes + a crepe party.jpg,failproof crepes + a crepe party,0.114078,157,0.317312,-0.221274,0.100213,0.423098


In [95]:
ranks.numberofcomments.max(), ranks.numberofcomments.min(), ranks.numberofcomments.mean()

(1237, 31, 277.76)

In [96]:
ranks.sentence_rank.max(), ranks.sentence_rank.min(), ranks.sentence_rank.mean()

(1.0014616243669396, 0.016778357286231269, 0.2082712382563939)

In [100]:
ranks.comment_rank.max(), ranks.comment_rank.min(), ranks.comment_rank.mean()

(0.86852582425090097, -0.008230820515504382, 0.17752501259600553)

In [104]:
ranks[['recipe_title','sentence_rank','comment_rank']].sort_values('comment_rank').tail(6)

Unnamed: 0,recipe_title,sentence_rank,comment_rank
82,"how to poach an egg, smitten kitchen-style",0.335601,0.42779
1,best cocoa brownies,0.848226,0.53539
20,tomato sauce with onion and butter,0.558428,0.616788
74,chocolate peanut butter cake,1.001462,0.776213
12,red wine chocolate cake,0.798623,0.816769
55,cinnamon toast french toast + book preview,0.747399,0.868526


In [105]:
ranks[['recipe_title','sentence_rank','comment_rank']].sort_values('sentence_rank').tail(6)

Unnamed: 0,recipe_title,sentence_rank,comment_rank
24,perfect blueberry muffins,0.526798,0.398965
20,tomato sauce with onion and butter,0.558428,0.616788
55,cinnamon toast french toast + book preview,0.747399,0.868526
12,red wine chocolate cake,0.798623,0.816769
1,best cocoa brownies,0.848226,0.53539
74,chocolate peanut butter cake,1.001462,0.776213


In [99]:
# Write the ranking table to CSV again; this targets the same file name as the
# write at the end of the ranking cell and overwrites it.
ranks.to_csv('Recipe_sentiment_ranks.csv')

In [141]:
# Label every sentence containing `phrase` as 'substitute'.
# NOTE(review): `phrase` is only assigned in a later cell, so on a fresh kernel
# run top-to-bottom this line raises NameError. The label written here is also
# discarded when the classification cell resets `category` to 'other'.
comments_with_sentences_100.loc[comments_with_sentences_100.sentence.str.contains(phrase) == True, 'category'] = 'substitute'

In [186]:
phrase = 'reduc'

In [None]:
comments_with_sentences_100[comments_with_sentences_100.sentence.str.contains(phrase)] # = 'add'

## Lists of related phrases used to classify sentences

In [178]:
# Keyword lists consumed by the classification cell, which tests each phrase
# with `str.contains`; truncated stems such as 'replac' or 'reduc' are
# therefore deliberate. Order and duplicates are kept as originally written.

# Phrases marking a substitution / adjustment.
subs = [
    'had trouble', 'had issue', 'exchange', 'recommend', 'change', 'easier',
    'gluten free', 'gluten-free', 'vegan', 'vegetarian', 'easier',
    'replace with', 'replac', 'adjust', 'suggest', 'swap', 'switch',
    'instead of', 'in place of', 'substitute', 'replace', 'i use',
    'to make it', 'customiz', 'adjust', 'instead', 'exchang', 'opted to use',
    'better with',
]

# Phrases marking an addition ('better with' also appears in `subs`).
add = [
    'will ad', 'will do', 'next time', 'going to add', 'add more',
    'better with', 'includ', 'i up', 'should have ad',
]

# Phrases marking an omission / reduction.
omit = [
    'omit', 'leave out', 'remov', 'left out', 'eliminat', 'add less',
    'forgot', 'decreas', 'dispensed with', 'delet', 'reduc',
    'should have left out',
]

In [195]:
# Keyword-based labelling of every sentence.
# Start from 'other', then apply the phrase lists in a fixed order. A later
# list overwrites an earlier label, so the precedence is
# addition > substitution > omission > other.
comments_with_sentences_100['category'] = 'other'

for label, phrases in (('omission', omit),
                       ('substitution', subs),
                       ('addition', add)):
    for phrase in phrases:
        # regex=False: phrases are literal text, so a future entry containing
        # characters such as '(' or '+' cannot be misread as a pattern.
        # na=False: sentences that are missing simply do not match.
        matches = comments_with_sentences_100.sentence.str.contains(
            phrase, regex=False, na=False)
        comments_with_sentences_100.loc[matches, 'category'] = label
    


In [190]:
comments_with_sentences_100.loc[comments_with_sentences_100.sentence.str.contains(phrase) == True]

Unnamed: 0,sentence,commentID,child_id,children,comment_time,recipenumber,title,url,usercomment,username,usersite,usercomment_lower,tokens,tokens_stemmed,sentence_tokens,sentence_tokens_stemmed,sentiment_polarity_comment,sentiment_polarity_sentence,category
623,i should have added in at least 1 tbsp for ext...,196,0,no,2009-11-17 11:44:00,0,sweet potato buttermilk pie,https://smittenkitchen.com/2009/11/sweet-potat...,I made this for Thanksgiving and it was enjoye...,Jessie,none,i made this for thanksgiving and it was enjoye...,made thanksgiving enjoyed even pumpkin pie dev...,made thanksgiv enjoy even pumpkin pie devout a...,added least 1 tbsp extra tartness buttermilk a...,ad least 1 tbsp extra tart buttermilk almost o...,0.175,-0.15,addition
14362,"it came out a bit dry, so i probably should ha...",4012,0,no,2011-09-12 10:09:00,12,red wine chocolate cake,https://smittenkitchen.com/2011/09/red-wine-ch...,This was delicious! And super easy… one bowl! ...,Jen (Toronto),none,this was delicious! and super easy… one bowl! ...,delicious super easy… one bowl big cake-maker ...,delici super easy… one bowl big cake-mak idiot...,came bit dry probably adjusted bake time bit g...,came bit dri probabl adjust bake time bit give...,0.25125,0.216667,addition
25278,"so sorry, i should have added that the beans s...",7137,0,no,2007-08-28 13:58:00,23,white bean roasted red pepper dip,https://smittenkitchen.com/2007/08/dorkalicious/,"So sorry, I should have added that the beans s...",deb,http://smittenkitchen.com,"so sorry, i should have added that the beans s...",sorry added beans drained wrote recipe um 2003...,sorri ad bean drain wrote recip um 2003 recip ...,sorry added beans drained,sorri ad bean drain,-0.15,-0.5,addition


In [204]:
# Number of non-null sentences carrying each label, reported in a fixed order.
category_counts = tuple(
    comments_with_sentences_100.loc[
        comments_with_sentences_100.category == category_name, 'sentence'].count()
    for category_name in ('other', 'addition', 'substitution', 'omission')
)
print('other, %i; additions, %i; substitutions, %i; omissions, %i' % category_counts)

other, 90688; additions, 1049; substitutions, 5006; omissions, 573


In [205]:
# Persist the sentence table, including the sentiment and `category` columns
# added above, to a CSV file in the working directory.
comments_with_sentences_100.to_csv('comments_with_sentences_100_classified_4classes.csv')