In [267]:
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; float:center}</style>")

In [268]:
from __future__ import division, print_function

import os
import glob
import math
import re
from collections import Counter
import datetime
from itertools import groupby

import pandas as pd
import numpy as np
import sklearn as skl
# import scipy as sp
# import scipy.stats as sps
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud as wc


import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
# from nltk import Text

import time

porter = PorterStemmer()

%matplotlib inline

sns.set_context('notebook')
sns.set_style('ticks')

### set styles, stopwords, define functions

In [269]:
stops = set(stopwords.words('english'))
    # using a set will make it faster to run through...

punctuation = ['.',',',':','!',';','-','?','"',"'",'(',')','—']   
other = ['ive','ve', "i've", "i'v", 'i’ll', 'i’ve', 'i’v']  # 'deb','hideb','don','didn','twaittry','doesn','thank','heydeb',
    
stops_punc = set(stopwords.words('english') + punctuation)

mystops = stopwords.words('english') + punctuation + other
mystops_set = set(stopwords.words('english') + punctuation + other)

In [282]:
def tokenize_sentences(comment):
    return nltk.sent_tokenize(remove_newlines(comment.lower()))
    
    
# def separate_sentences(frame, identifier, paragraph, how='merge'):
#     sentences = pd.concat([pd.Series(row[identifier], tokenize_sentences((row[paragraph]))) for _, row in frame.iterrows()]).reset_index()
#     sentences.columns = ['sentence', identifier]
    
#     if how == 'merge':
#         return sentences.merge(frame, left_on=identifier, right_on=identifier, how='outer', sort=False, suffixes=('_l','_r')) 
#     elif how == 'nomerge': 
#         return sentences
#     else: 
#         return sentences.merge(frame, left_on=identifier, right_on=identifier, how='outer', sort=False, suffixes=('_l','_r')) 

# slightly faster version
def separate_sentences(frame, identifier, paragraph, how='merge'):
    sentences = pd.DataFrame((tokenize_sentences(row[paragraph]) for _, row in frame.iterrows()), index=frame[identifier]).stack()
    sentences = sentences.reset_index() [[0, identifier]] # var1 variable is currently labeled 0
    sentences.columns = ['sentence', identifier] # renaming var1
    
    if how == 'merge':
        return sentences.merge(frame, left_on=identifier, right_on=identifier, how='outer', sort=False, suffixes=('_l','_r')) 
    elif how == 'nomerge': 
        return sentences
    else: 
        return sentences.merge(frame, left_on=identifier, right_on=identifier, how='outer', sort=False, suffixes=('_l','_r')) 


def make_lowercase(comment):
    return remove_newlines(comment.lower())


# def tokenize_aslist(comment):
#     comment = remove_newlines(comment)
#     if comment == []:
#         return None
#     else: 
#         return [word for word in word_tokenize(remove_newlines(comment).lower()) if word not in mystops]

    
def tokenize(comment):
    comment = remove_newlines(comment)
    if comment == []:
        return None
    else: 
        return ' '.join([word for word in word_tokenize(remove_newlines(comment).lower()) if word not in mystops])

def ngram(comment):
    comment = remove_newlines(comment)
    if comment == []:
        return None
    else: 
        return [word for word in ngrams(remove_newlines(comment).lower().split(),gram) if word not in mystops]

    
# def tokenize_stem_aslist(comment):
#     tokens = word_tokenize(remove_newlines(comment).lower())
#     if tokens == []:
#         return None
#     else: 
#         return[porter.stem(word) for word in tokens if word not in mystops]
    
def tokenize_stem(comment):
    tokens = word_tokenize(remove_newlines(comment).lower())
    if tokens == []:
        return None
    else: 
        return ' '.join([porter.stem(word) for word in tokens if word not in mystops])


def remove_newlines(comment):    
    return re.sub(r"\n", " ", comment)



def preprocess_comments_data(frame):
    # make sure commentIDs are unique ( = row identity)
    frame.loc[:,'commentID'] = frame.index

    # remove any frame with no comment text
    frame = frame.loc[pd.notnull(frame['usercomment']),:]

    # replace NaN usernames with 'anon'
    frame.loc[:,'username'].fillna('anon', inplace=True)

    # tokenize data
    frame.loc[:,'usercomment'] = frame.loc[:,'usercomment'].apply(remove_newlines)
    frame.loc[:,'usercomment_lower'] = frame.loc[:,'usercomment'].apply(make_lowercase)

    frame.loc[:,'tokens'] = frame.loc[:,'usercomment'].apply(tokenize)
    frame.loc[:,'tokens_stemmed'] = frame.loc[:,'usercomment'].apply(tokenize_stem)
    
    frame.dropna(inplace=True)
    frame.drop([],axis=0, inplace=True)

    
    frame2 = separate_sentences(frame, 'commentID','usercomment',how='merge')
    frame2.dropna(inplace=True)
    frame2.drop([],axis=0, inplace=True)
    

    frame2.loc[:,'sentence_tokens'] = frame2.loc[:,'sentence'].apply(tokenize)
    frame2.loc[:,'sentence_tokens_stemmed'] = frame2.loc[:,'sentence'].apply(tokenize_stem)
    
    gram = 2
    comments_classified.loc[:,'sentence_bigrams'] = comments_classified.loc[:,'sentence'].apply(ngram)
    gram = 3
    comments_classified.loc[:,'sentence_trigrams'] = comments_classified.loc[:,'sentence'].apply(ngram)
    
    frame2.dropna(inplace=True)
    frame2.drop([],axis=0, inplace=True)

#     frame2['sentence_tokens_aslist'] = frame2.sentence.apply(tokenize_aslist)
#     frame2 = frame2.dropna()
#     frame2 = frame2.drop([],axis=0)

#     frame2['sentence_tokens_stemmed_aslist'] = frame2.sentence.apply(tokenize_stem_aslist)
#     frame2 = frame2.dropna()
#     frame2 = frame2.drop([],axis=0)

    return frame, frame2 


import & sanity check

In [286]:
comments = pd.read_csv('/Users/kateliea/Documents/Insight/project/webscrapers/comments_smittenkitchen_100.csv', index_col=0)

In [287]:
comments.columns

Index(['child_id', 'children', 'commentID', 'comment_time', 'recipenumber',
       'title', 'url', 'usercomment', 'username', 'usersite'],
      dtype='object')

In [271]:
comments_classified = pd.read_csv('comments_classified_SK_filtered2000.csv', index_col=0)

In [15]:
comments_classified.columns

Index(['category', 'sentence', 'commentID', 'child_id', 'children',
       'comment_time', 'recipenumber', 'title', 'url', 'usercomment',
       'username', 'usersite', 'usercomment_lower', 'tokens',
       'tokens_stemmed'],
      dtype='object')

## make new dataframe with sentences separated, tokenize everything

In [288]:
comments_only, comments_with_sentences = preprocess_comments_data(comments)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a 

In [289]:
comments_with_sentences.to_csv('comments_with_sentences_100.csv'), comments_only.to_csv('comments_only_100.csv')

(None, None)

In [290]:
comments_only.shape, comments_with_sentences.shape

((27775, 13), (97316, 16))

In [62]:
gram = 2
comments_classified.loc[:,'sentence_bigrams'] = comments_classified.loc[:,'sentence'].apply(ngram)
gram = 3
comments_classified.loc[:,'sentence_trigrams'] = comments_classified.loc[:,'sentence'].apply(ngram)

comments_classified.loc[:,'sentence_tokens'] = comments_classified.loc[:,'sentence'].apply(tokenize)
comments_classified.loc[:,'sentence_tokens_stemmed'] = comments_classified.loc[:,'sentence'].apply(tokenize_stem)

comments_classified.dropna(inplace=True)
comments_classified.drop([],axis=0, inplace=True)

comments_classified.to_csv('comments_classified_SK_filtered2000.csv')

In [63]:
comments_classified.head()

Unnamed: 0,category,sentence,commentID,child_id,children,comment_time,recipenumber,title,url,usercomment,username,usersite,usercomment_lower,tokens,tokens_stemmed,sentence_bigrams,sentence_trigrams,sentence_tokens,sentence_tokens_stemmed
0,other,oh my – a little piece of heaven right there a...,0,0,no,2009-11-17 11:44:00,0,sweet potato buttermilk pie,https://smittenkitchen.com/2009/11/sweet-potat...,Oh my – a little piece of heaven right there a...,Tabitha (From Single to Married),http://www.fromsingletomarried.com,oh my – a little piece of heaven right there a...,"['oh', '–', 'little', 'piece', 'heaven', 'righ...","['oh', '–', 'littl', 'piec', 'heaven', 'right'...","[(oh, my), (my, –), (–, a), (a, little), (litt...","[(oh, my, –), (my, –, a), (–, a, little), (a, ...",oh – little piece heaven right time holidays,oh – littl piec heaven right time holiday
1,other,"yes, please!",1,0,no,2009-11-17 11:44:00,0,sweet potato buttermilk pie,https://smittenkitchen.com/2009/11/sweet-potat...,"yes, please!",Cari,none,"yes, please!","['yes', 'please']","['ye', 'pleas']","[(yes,, please!)]",[],yes please,ye pleas
2,other,ive never made sweet potato pie!,2,0,no,2009-11-17 11:44:00,0,sweet potato buttermilk pie,https://smittenkitchen.com/2009/11/sweet-potat...,Ive never made sweet potato pie! I’d love to t...,Jessica at How Sweet It Is,http://howsweeteats.com,ive never made sweet potato pie! i’d love to t...,"['ive', 'never', 'made', 'sweet', 'potato', 'p...","['ive', 'never', 'made', 'sweet', 'potato', 'p...","[(ive, never), (never, made), (made, sweet), (...","[(ive, never, made), (never, made, sweet), (ma...",never made sweet potato pie,never made sweet potato pie
3,other,i’d love to try for the holidays!,2,0,no,2009-11-17 11:44:00,0,sweet potato buttermilk pie,https://smittenkitchen.com/2009/11/sweet-potat...,Ive never made sweet potato pie! I’d love to t...,Jessica at How Sweet It Is,http://howsweeteats.com,ive never made sweet potato pie! i’d love to t...,"['ive', 'never', 'made', 'sweet', 'potato', 'p...","['ive', 'never', 'made', 'sweet', 'potato', 'p...","[(i’d, love), (love, to), (to, try), (try, for...","[(i’d, love, to), (love, to, try), (to, try, f...",i’d love try holidays,i’d love tri holiday
4,other,this is definitely going on my must make list.,3,0,no,2009-11-17 11:44:00,0,sweet potato buttermilk pie,https://smittenkitchen.com/2009/11/sweet-potat...,This is definitely going on my Must Make list....,LauraC,http://JonAndLaura.blogspot.com,this is definitely going on my must make list....,"['definitely', 'going', 'must', 'make', 'list'...","['definit', 'go', 'must', 'make', 'list', 'nor...","[(this, is), (is, definitely), (definitely, go...","[(this, is, definitely), (is, definitely, goin...",definitely going must make list,definit go must make list


## define test, train data

In [265]:
comments_classified.to_csv('comments_classified_SK_filtered2000_additional.csv')

In [241]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
comments_classified.loc[:,'category_label'] = le.fit_transform(comments_classified.category)

In [229]:
comments_classified.category.unique()
# comments_classified.category.replace('try','other',inplace=True)
# comments_classified.category.replace('addition','suggestion',inplace=True)
# comments_classified.category.replace('subtraction','substitution',inplace=True)
# comments_classified.category.replace('related','other',inplace=True)
# comments_classified.category.replace('question','other',inplace=True)

In [252]:
X_train = comments_classified[comments_classified.category != 'other'].sentence #\.as_matrix()
# target = comments_classified.category.as_matrix()
target = comments_classified[comments_classified.category != 'other'].category_label #.as_matrix()

In [243]:
X_test = comments_with_sentences.loc[2000:10000, 'sentence'].as_matrix()

In [253]:
list(le.classes_)

['love', 'other', 'substitution', 'suggestion']

In [245]:
le.inverse_transform([2,1,0])

array(['substitution', 'other', 'love'], dtype=object)

## word relevancy - counter, tf-idf

In [254]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [255]:
vectorizer = CountVectorizer(ngram_range=(2,3), stop_words=mystops)

X_train_counts = vectorizer.fit_transform(X_train)
X_train_counts.shape

(310, 3967)

In [256]:
transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

X_train_tf = transformer.transform(X_train_counts)
X_train_tf.shape

(310, 3967)

## classifier

In [260]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

train

In [261]:
classifier = BernoulliNB().fit(X_train_counts, target)

test

In [262]:
X_test_counts = vectorizer.transform(X_test)
# X_test_tf = transformer.transform(X_test_counts)

In [263]:
predicted = classifier.predict(X_test_counts)

In [264]:
for doc, category in zip(X_test[:100], predicted[:100]):
    print('%r => %s' % (doc, category))

'except we’ll make them in a mini muffin pan so they’ll be brownie bites (otherwise they won’t survive until the weekend).' => 0
'oh my goodness!' => 0
'these were wonderful!' => 0
'i have since discovering your website made the mushroom and brioche bites (the were like crack!' => 0
'i couldn’t put them down!).' => 0
'i made the caramel pudding.' => 0
'yummy.' => 0
'and now these brownies!' => 0
'about 2 weeks ago all i wanted was brownies and went into whole food and bought what i thought were the most wonderful brownies ever!' => 0
'until i made these!' => 0
'my boyfriend and i have loved every last bite (though when i’m making them they made 9 per pan!!)' => 0
'i think if i cut them into 25 squares i would actually end up eating more!' => 0
'thanks for the awesome recipes!' => 0
'this is quite similar to the alton brown recipe for cocoa brownies which i just happened to make today.' => 0
'i don’t follow his mixing method though in favor for the king arthur one similar to alice’s rec

In [116]:
predicted

array(['other', 'other', 'other', ..., 'other', 'other', 'other'], 
      dtype='<U12')

In [None]:
comments_classified.head()

## PCA 

In [56]:
from sklearn import decomposition

In [266]:
pca = decomposition.PCA(n_components=10)

pca.fit(X_train_count)
X_trans = pca.transform(X_train_count)

NameError: name 'X_train_count' is not defined

In [64]:
def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.

    Returns
    -------
    df_mod -- modified DataFrame.
    targets -- list of target names.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod["Target"] = df_mod[target_column].replace(map_to_int)

    return (df_mod, targets)


In [65]:
x,y = encode_target(comments_classified, 'category')

In [165]:
comments_classified.loc[comments_classified.sentence.str.contains('instead of') == True, 'category'] = 'substitution'

# comments_classified

## Decision tree classifier

In [143]:
from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [144]:
tree_classifier = DecisionTreeClassifier(max_depth=3, max_features=10)

In [272]:
comments_classified.shape

(1999, 19)

In [275]:
comments.columns

Index(['child_id', 'children', 'commentID', 'comment_time', 'recipenumber',
       'title', 'url', 'usercomment', 'username', 'usersite'],
      dtype='object')