In [7]:
from IPython.core.display import HTML
HTML("<style>.container { width:95% !important; float:center}</style>")

In [33]:
from __future__ import division, print_function

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import textblob as tb
from textblob import TextBlob


from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer



%matplotlib inline

sns.set_context('notebook', font_scale=1.5)
sns.set_style('ticks')

punctuation = ['.',',',':','!',';','-','?','"',"'",'(',')','—']   
other = ['ive','ve', "i've", "i'v", 'i’ll', 'i’ve', 'i’v']  # 'deb','hideb','don','didn','twaittry','doesn','thank','heydeb',
mystops = stopwords.words('english') + punctuation + other


In [3]:
comments_classified = pd.read_csv('comments_classified_SK_filtered2000_additional.csv',index_col=0)

comments_with_sentences_100 = pd.read_csv('comments_with_sentences_100.csv',index_col=0)
comments_only_100 = pd.read_csv('comments_only_100.csv',index_col=0)

comments_with_sentences_all = pd.read_csv('comments_with_sentences_all.csv',index_col=0)
comments_only_all = pd.read_csv('comments_only_all.csv',index_col=0)

In [166]:
recipes = pd.read_csv('/Users/kateliea/Documents/Insight/project/webscrapers/recipes_smittenkitchen_100.csv', index_col=0)

In [178]:
comments_with_sentences_100.columns

Index(['sentence', 'commentID', 'child_id', 'children', 'comment_time',
       'recipenumber', 'title', 'url', 'usercomment', 'username', 'usersite',
       'usercomment_lower', 'tokens', 'tokens_stemmed', 'sentence_tokens',
       'sentence_tokens_stemmed', 'sentiment_polarity_comment',
       'sentiment_polarity_sentence'],
      dtype='object')

In [176]:
recipes['recipe_ID'] = recipes.index

recipes['recipe_ID'] = recipes['recipe_ID'].apply(lambda x: 'R'+str(x))

In [231]:
recipes.introductory_text

'As many of you have figured out, I’ve got a megawatt crush on Southern food. It comes out with a vengeance all summer when I want nothing more than to dry-rub ribs, make corn bread and buttermilk dressing salads, dive headfirst into tomato pie and douse pretty much everything in bourbon then usually goes into a soft hibernation over the winter save a fried chicken or chicken and dumplings run-in or two.Given this infatuation, it seems only right and proper that I’d get in a recipe for sweet potato pie at a time of year when sweet potatoes are exactly everywhere. But while I do love me some sweet potato pie, there’s a heaviness about it that is exactly what some people like about it but leaves me feeling kind of lukewarm. So you can imagine when I spied this fluffier, tangier and [here’s the part I think you’re really going to remember] almost cheesecake-like version of it a cookbook written by and I’d like to believe for Manhattanites with a thing for Southern home cooking, I bookmark

In [4]:
comments_classified.columns

Index(['category', 'sentence', 'commentID', 'child_id', 'children',
       'comment_time', 'recipenumber', 'title', 'url', 'usercomment',
       'username', 'usersite', 'usercomment_lower', 'tokens', 'tokens_stemmed',
       'sentence_bigrams', 'sentence_trigrams', 'sentence_tokens',
       'sentence_tokens_stemmed', 'category_label'],
      dtype='object')

In [58]:
def comment_sentiment(comment): 
    comment = tb.TextBlob(comment)
    return comment.sentiment.polarity

exploratory analysis of TextBlob

In [221]:
comments_classified.title[4]

'sweet potato buttermilk\xa0pie'

In [51]:
txt = tb.TextBlob(comments_classified.usercomment[4])
txt

TextBlob("This is definitely going on my Must Make list. Here in North Carolina, sweet potatoes are in season year-round so I am always looking for new ways to eat them!")

In [27]:
txt.tags, txt.noun_phrases

([('Ive', 'NNP'),
  ('never', 'RB'),
  ('made', 'VBD'),
  ('sweet', 'JJ'),
  ('potato', 'JJ'),
  ('pie', 'NN'),
  ('I’d', 'NNP'),
  ('love', 'NN'),
  ('to', 'TO'),
  ('try', 'VB'),
  ('for', 'IN'),
  ('the', 'DT'),
  ('Holidays', 'NNP')],
 WordList(['ive', 'sweet potato pie', 'i’d', 'holidays']))

Sentiment(polarity, subjectivity). The polarity score is a float within the range [-1.0, 1.0]. The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.m

In [77]:
txt.sentiment

Sentiment(polarity=0.17348484848484846, subjectivity=0.5348484848484848)

In [None]:
comments_with_sentences_100['sentiment_polarity_sentence'] = comments_with_sentences_100.sentence.apply(comment_sentiment)

In [87]:
comments_with_sentences_100[(comments_with_sentences_100.sentiment_polarity_sentence > 0) & (comments_with_sentences_100.title=='best cocoa brownies')].sentiment_polarity_sentence.mean()

0.4595538414182737

In [53]:
comment

TextBlob("This is definitely going on my Must Make list. Here in North Carolina, sweet potatoes are in season year-round so I am always looking for new ways to eat them!")

In [189]:
title = recipes.title[2]

positive = 100*comments_with_sentences_100[
    (comments_with_sentences_100.sentiment_polarity_sentence > 0) 
    & (comments_with_sentences_100.title==title)].sentiment_polarity_sentence.mean()
negative = 100*comments_with_sentences_100[
    (comments_with_sentences_100.sentiment_polarity_sentence < 0) 
    & (comments_with_sentences_100.title==title)].sentiment_polarity_sentence.mean()


In [218]:
recipes.loc[recipes.title==title, 'numberofcomments']

2    47
Name: numberofcomments, dtype: int64

In [190]:
(positive - negative)*(recipes[recipes.title==title].numberofcomments)

2    3506.751321
Name: numberofcomments, dtype: float64

In [222]:
def happiness_graph(recipe):
    width = 0.2
    lw=0
    fig, ax = plt.subplots(figsize=(5,1))
    
    with sns.axes_style({'font.family': [u'sans-serif']}):
        sns.set_context('notebook', font_scale=1.5)
        ax.barh(0, positive, color='orange', linewidth=lw)
        ax.barh(0, negative, color='blue', linewidth=lw)

        ax.set_xlim((-60,60))
        ax.set_yticks(ticks=[])
        ax.set_title('recipe happiness (%)')
        sns.despine(offset=20, trim=True, left=True)
        
    plt.savefig('/Users/kateliea/Documents/Insight/project/images/' + name + '.jpg')

In [179]:
recipes.columns

Index(['article_id', 'author', 'author_url', 'directions', 'firstimageURL',
       'ingredients', 'introductory_text', 'numberofcomments', 'published',
       'recipe_notes', 'servings', 'time', 'title', 'updated', 'url',
       'recipe_ID'],
      dtype='object')

create a data table with the recipe sentiment, ranked by happiness score 

include name for happiness plot

In [202]:
# recipe_ranks = pd.DataFrame()
reicpe_ranks = {}

for i, recipe_title in enumerate(comments_with_sentences_100.title.unique()):
        recipe_sentiment = {}
        
        recipe_sentiment['recipe_title'] = recipe_title

        recipe_sentiment['recipe_ID'] = recipes[recipes.title == recipe_title].recipe_ID
        
        recipe_sentiment['numberofcomments'] = recipes[recipes.title == recipe_title].numberofcomments
        
        recipe_sentiment['mean_positivity_sentences'] = 100*comments_with_sentences_100[
                    (comments_with_sentences_100.sentiment_polarity_sentence > 0) 
                    & (comments_with_sentences_100.title == recipe_title)].sentiment_polarity_sentence.mean()
        
        recipe_sentiment['mean_positivity_comment'] = 100*comments_with_sentences_100[
                    (comments_with_sentences_100.sentiment_polarity_comment > 0) 
                    & (comments_with_sentences_100.title == recipe_title)].sentiment_polarity_comment.mean()

        recipe_sentiment['mean_negativity_sentences'] = 100*comments_with_sentences_100[
                    (comments_with_sentences_100.sentiment_polarity_sentence < 0) 
                    & (comments_with_sentences_100.title == recipe_title)].sentiment_polarity_sentence.mean()
        
        recipe_sentiment['mean_negativity_comment'] = 100*comments_with_sentences_100[
                    (comments_with_sentences_100.sentiment_polarity_comment < 0) 
                    & (comments_with_sentences_100.title == recipe_title)].sentiment_polarity_comment.mean()
        
        recipe_sentiment['sentence_rank'] = (positive - negative)*(recipes[recipes.title==title].numberofcomments)
        
        recipe_sentiment['plot_name'] = recipe_title + '.jpg'
        
        
        recipe_ranks[i] = recipe_sentiment
        
        
# ranks = pd.DataFrame(recipe_ranks)

In [199]:
recipe_ranks[0] = recipe_sentiment

In [208]:
ranks = pd.DataFrame.from_dict(recipe_ranks, orient='index')
ranks

Unnamed: 0,mean_positivity_comment,numberofcomments,mean_negativity_sentences,recipe_ID,mean_negativity_comment,sentence_rank,recipe_title,plot_name,mean_positivity_sentences
0,32.005106,"0 259 Name: numberofcomments, dtype: int64",-22.780852,"0 R0 Name: recipe_ID, dtype: object",-24.174082,"2 3506.751321 Name: numberofcomments, dtype...",sweet potato buttermilk pie,sweet potato buttermilk pie.jpg,41.522664
1,34.126572,"1 993 Name: numberofcomments, dtype: int64",-22.228985,"1 R1 Name: recipe_ID, dtype: object",-19.150751,"2 3506.751321 Name: numberofcomments, dtype...",best cocoa brownies,best cocoa brownies.jpg,45.955384
2,25.562205,"2 47 Name: numberofcomments, dtype: int64",-32.348039,"2 R2 Name: recipe_ID, dtype: object",-6.250000,"2 3506.751321 Name: numberofcomments, dtype...",sav’h,sav’h.jpg,42.263691
3,33.022786,"3 217 Name: numberofcomments, dtype: int64",-29.804901,"3 R3 Name: recipe_ID, dtype: object",-16.031882,"2 3506.751321 Name: numberofcomments, dtype...",classic cobb salad,classic cobb salad.jpg,44.310971
4,36.836628,"4 222 Name: numberofcomments, dtype: int64",-24.813667,"4 R4 Name: recipe_ID, dtype: object",-13.790238,"2 3506.751321 Name: numberofcomments, dtype...",cumin seed roasted cauliflower with yogurt,cumin seed roasted cauliflower with yogurt.jpg,47.630698
5,26.269360,"5 112 Name: numberofcomments, dtype: int64",-21.741757,"5 R5 Name: recipe_ID, dtype: object",-16.357477,"2 3506.751321 Name: numberofcomments, dtype...",chocolate pavlova,chocolate pavlova.jpg,39.174003
6,36.754380,"6 44 Name: numberofcomments, dtype: int64",-30.011111,"6 R6 Name: recipe_ID, dtype: object",-20.698121,"2 3506.751321 Name: numberofcomments, dtype...",homemade merguez with herby yogurt,homemade merguez with herby yogurt.jpg,47.783939
7,31.445060,"7 327 Name: numberofcomments, dtype: int64",-24.226824,"7 R7 Name: recipe_ID, dtype: object",-14.561116,"2 3506.751321 Name: numberofcomments, dtype...",fudgy chocolate sheet cake,fudgy chocolate sheet cake.jpg,44.693298
8,33.038985,"8 77 Name: numberofcomments, dtype: int64",-20.734196,"8 R8 Name: recipe_ID, dtype: object",-8.737875,"2 3506.751321 Name: numberofcomments, dtype...",israeli salad + pita chips,israeli salad + pita chips.jpg,42.635496
9,31.731212,"9 157 Name: numberofcomments, dtype: int64",-22.127426,"9 R9 Name: recipe_ID, dtype: object",-14.001807,"2 3506.751321 Name: numberofcomments, dtype...",failproof crepes + a crepe party,failproof crepes + a crepe party.jpg,42.309816


In [216]:
recipes[recipes.title == recipe_title].author

99    deb
Name: author, dtype: object

TypeError: cannot do label indexing on <class 'pandas.indexes.base.Index'> with these indexers [0] of <class 'int'>

In [228]:
recipes.columns

Index(['article_id', 'author', 'author_url', 'directions', 'firstimageURL',
       'ingredients', 'introductory_text', 'numberofcomments', 'published',
       'recipe_notes', 'servings', 'time', 'title', 'updated', 'url',
       'recipe_ID'],
      dtype='object')