In [1]:
from pymongo import MongoClient
from bs4 import BeautifulSoup
import urllib2
import pprint
import pickle
import pandas as pd
import numpy as np
import random
import nltk
import matplotlib.pyplot as plt
from operator import itemgetter
from textblob import TextBlob as tb
from textblob.sentiments import NaiveBayesAnalyzer
from sklearn import metrics
from sklearn.feature_extraction import text
from sklearn.cluster import MiniBatchKMeans
% matplotlib inline

In [2]:
client = MongoClient()
wine_info = client.wines.wine_info

In [19]:
def create_main_collection(filename):
    
    """
    Load data in from textfile into a mongo database; each record is a review.  If running make sure to drop the wine_info collection before the data is reloaded.
    """
    wine_info = client.wines.wine_info    
    
    with open(filename, 'r') as f:
        single_wine = {}
        i = 0
        for line in f:
            if i == 9:
                wine_info.save(single_wine)
                single_wine = {}
                i = 0
            else:
                row = line.split(':')
                try:
                    if row[0] == 'review/text':
                        row[1] = row[1].decode('latin-1').strip()
                        single_wine[row[0]] = row[1]
                    else:
                        single_wine[row[0]] = row[1].strip()
                except:
                    print row[1]
                    break
                i += 1

#### Originally I wanted to look into food and wine pairings.  At first, it seemed the best thing to do was to just search for reviews containing the food in the review.  However, it became clear there was not much data science used in this process; even so, I learned a lot in trying to rank how well a wine paired with a particular food

In [3]:
steak_cursor = wine_info.find({ "$text" : { "$search": 'steak strip porterhouse ribeye mignon -tuna -salmon'}}, {'_id': 0})

In [4]:
steak_cursor.count()

21356

In [5]:
locate_words = ['pair', 'match']
steak_wines = []

for wine in steak_cursor:
    if any(word in wine['review/text'] for word in locate_words):
        steak_wines.append(wine)

In [6]:
len(steak_wines)

3411

In [7]:
steakdf = pd.DataFrame(steak_wines)

In [8]:
steakdf.head()

Unnamed: 0,review/points,review/text,review/time,review/userId,review/userName,wine/name,wine/variant,wine/wineId,wine/year
0,89,"Initially PnP, then consumed half the 750ml bo...",1334102400,181298,drrobvino,2008 L'Aventure C&#244;te-&#224;-C&#244;te Estate,Red Rhone Blend,809034,2008
1,94,Did not know what to expect of a 27+ year bott...,1270339200,108108,Anonymous,1982 E. Guigal C&#244;te-R&#244;tie La Landonne,Syrah,65396,1982
2,90,This was a jeroboam I purchased last December....,1344816000,199258,M. Anthony Lee,1998 Sterling Vineyards Merlot,Merlot,45266,1998
3,94,Excellent wine! I was a bit concerned about ca...,1298764800,9181,Vinophiler,1996 Whitehall Lane Cabernet Sauvignon Reserve,Cabernet Sauvignon,2064,1996
4,95,This wine was brought to dinner by Mark and Li...,1183334400,2758,Vino Me,1987 Spring Mountain Vineyard Cabernet Sauvignon,Cabernet Sauvignon,2122,1987


#### Here I am looking at one particular wine and its reviews.  I'm then investigating the text for the food pairing.

In [9]:
name_cursor = wine_info.find({'wine/name': '2003 Joseph Phelps Insignia'}, {'_id': 0})

In [10]:
name_cursor.count()

102

In [11]:
for wine in name_cursor:
    if any(word in wine['review/text'] for word in ['steak', 'strip', 'porterhouse', 'ribeye', 'mignon']):
        print wine['review/text']

took to ruth chris after decanting. cherry, alcohol laden fruit bomb with decent tannin and strong long lasting after taste. while decent paired with a steak, the fruit is a bit too strong for my taste and reminds me almost of a new world or zinfandel. the fruit are very good but lacks complexity. seems best between 1.5 hr to 2.5 hr after decanting
Brought my own bottle to a Brazilian steak house where copious amounts of red meat grace your plate every few minutes. My initial impressions were thin, short, and unsatisfactory. After a 1/2 hour in the bottle it seemed to open up a bit with sweet red fruit and mild oak. By the last few glasses this had evolved to an enjoyable Insignia albeit no comparison to the 02'. I've had much better for less, Verite La Joie comes to mind. Overall this wine stood up to the meat and made the meal an all around success. Seemed to match best with the spit roasted garlic top sirloin although my food obsessions are a compeletly different issue.
Opened along

#### Above is my attempt at only bringing in wines that have a food name in the review.  I am successful, however, I cannot pull ALL the wine reviews for each wine that gets returned.  It is only returning the one review that has the food term in it.  Maybe come back to this, but for now work with all wines in memory.

In [3]:
# Pull in all wines for analysis
all_cursor = wine_info.find({},{"_id": 0})
all_cursor.count()

2025995

In [4]:
# Turn cursor object into a list so I can turn it into a pandas dataframe
all_wines = list(all_cursor)

In [5]:
# Create dataframe of all wines
allwine_df = pd.DataFrame(all_wines)

In [6]:
# Seems to be ordered by review/userName? Let's randomly shuffle to mix it up
allwine_df = allwine_df.iloc[np.random.permutation(len(allwine_df))]

In [7]:
# Total count of individual wines that have reviews
len(allwine_df['wine/name'].value_counts())

479750

In [17]:
# Change N/A's to nan to convert column to float
allwine_df.replace('N/A', np.nan, inplace=True)
allwine_df['review/points'] = allwine_df['review/points'].astype(float)

In [8]:
allwine_df.head()

Unnamed: 0,review/points,review/text,review/time,review/userId,review/userName,wine/name,wine/variant,wine/wineId,wine/year
1647093,95.0,Wow. This is fantastic pinot. Perfectly integr...,1342137600,209798,JenTomHank,2003 Dehlinger Pinot Noir Goldridge,Pinot Noir,97300,2003
1093567,98.0,This beautiful sophisticated Pomerol left me a...,1299283200,166661,rogerheaton,2000 Ch&#226;teau La Providence,Red Bordeaux Blend,278601,2000
1595590,88.0,"Distinctively Ramey, enjoyable.",1330905600,72805,rremiker,2007 Ramey Cabernet Sauvignon Napa Valley,Cabernet Sauvignon,947109,2007
545374,94.0,Savory and dripping with fruit. Terrific wine.,1170547200,12187,silton,2004 Sea Smoke Pinot Noir Southing,Pinot Noir,195510,2004
1564560,,corked and foul - cork broke during extraction,1231632000,69796,IMeredith,2003 Chalk Hill Wines Cabernet Sauvignon,Cabernet Sauvignon,625969,2003


In [9]:
# Take a random sample of 250000 reviews to analyze
sample_df = allwine_df.iloc[np.random.permutation(250000)]

In [10]:
# Initiate food list from scraping
food_list = ['asparagus', 'baked', 'barbecue', 'beef', 'broccoli', 'sprouts', 'brisket', \
             'burrito', 'casserole', 'cheese', 'chicken', 'chili', 'greens', 'crab', \
             'dessert', 'fillet', 'fish', 'gyro', 'halibut', 'burger', 'hot dog', 'ice cream', 'kale', \
             'beans', 'lamb', 'lasagna', 'legumes', 'lentils', 'lobster', 'macaroni', \
             'potato', 'mozzarella', 'pizza', 'mussels', 'noodles', 'oyster', 'pasta', 'squash', \
             'poached', 'pork', 'prime rib', 'pretzel', 'quinoa', 'ravioli', 'roast', \
             'salami', 'salmon', 'sausage', 'scallops', 'shrimp', 'soup', 'spaghetti', 'spareribs', 'spinach', \
             'peas', 'squid', 'steak', 'stir-fry', 'sushi', 'tapioca', 'teriyaki', 'turkey', 'turnip', \
             'veal', 'venison', 'yam', 'zucchini', 'fried']

red_meat = ['steak', 'mignon', 'strip', 'rib', 'rib-eye', 'spareribs']

#### The loop below basically is a starting point for filtering.  It goes through each row in the dataframe and checks if the word "pair" or "match" is in the text and also if any food from a pre-compiled food list is present.  

####From there, I then check to see if the word "well" is in the text potentially indicating a good pairing (will have to add negative words to avoid "didn't pair well" etc.).  If the review itself scored equal to or higher than 85, randomly assign a score between 0.8 to 1.0, otherwise 0.25 to 0.50.  

#### If the word "well" did not appear, if the review was 85 or higher, I give a random score between 0.5 to 0.8, otherwise 0.05 to 0.25.  If none of these words are in the text I just append a 0.

In [11]:
manual_scoring = []

for index, row in sample_df.iterrows():
    if ('pair' in row['review/text'] or 'match' in row['review/text']) and any(food in row['review/text'] for food in food_list):
        if 'well' in row['review/text']:
            if row['review/points'] >= '85':
                manual_scoring.append((index, random.uniform(0.8, 1.0)))
            else:
                manual_scoring.append((index, random.uniform(0.25, 0.5)))
        else:
            if row['review/points'] >= '85':
                manual_scoring.append((index, random.uniform(0.5, 0.8)))
            else:
                manual_scoring.append((index, random.uniform(0.05, 0.25)))
    else:
        manual_scoring.append((index, 0))

#### I make sure the scores are floats and create a list of them, followed by creating a column in the dataframe

In [12]:
scores = [float(score) for (index, score) in manual_scoring]
sample_df['score'] = scores
print 'Number of wines assigned a score: {0}'.format(len(sample_df[sample_df['score'] > 0][['review/text', 'score']]))

Number of wines assigned a score: 2840


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [18]:
# A little exploration on the scoring method
i = 0
for (index, score) in manual_scoring:
    if score > 0:
        print index, score
        i += 1
        if i == 6:
            break

512613 0.649198116744
530817 0.956458793836
218694 0.613008330297
1272277 0.521278653407
1529188 0.705546676528
99224 0.605483026736


#### The loop below looks through every review and appends a 1 if a word associated with steak appears in the review, otherwise appends a 0

In [16]:
steak_reviews = []

for review in sample_df['review/text'].values:
    if any(food in review for food in red_meat):
        steak_reviews.append(1)
    else:
        steak_reviews.append(0)

In [50]:
#foods = urllib2.urlopen('http://www.enchantedlearning.com/wordlist/food.shtml')
#page = BeautifulSoup(foods)

#food_list = []
#for font in page.find_all('font', {'size': '+0'}):
#    if len(font.text) > 1:
#        #food_list.append(font.text)
#        foods = font.text.split('\n')
#        for food in foods:
#            if food:
#                food_list.append(food.strip())

#food_list = food_list[1:]

### This is where I moved on to another angle.  I decided to analyze the descriptions of flavors in various wines and try to recommend wines that best showcase those flavors as well as whether or not they pair well with food.